In [1]:
# Import modules
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Enrichissement des données
from sklearn.impute import SimpleImputer
# KNN
from sklearn.neighbors import KNeighborsClassifier
# Grid search et calcul de scores
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
# Random Forest
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

### Preprocessing

Chargement des données et premiers traitements actés

In [2]:
data = pd.read_csv("data/adult.csv", na_values='?')

In [3]:
def remove_columns(data):
    """Remove useless columns"""
    data.drop('education', axis=1, inplace=True)
    data.drop('relationship', axis=1, inplace=True)
    return

remove_columns(data)

In [4]:
def complete_except_occupation(data):
    """ Fill missing data in some columns with a SimpleImputer"""
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    data['workclass'] = imp.fit_transform(data[['workclass']])
    data['native.country'] = imp.fit_transform(data[['native.country']])
    return

complete_except_occupation(data)

In [5]:
def transform(data):
    """Transform multi categorical data into binary data (to properly apply prediction models)"""
    data['native.country'] = [1 if x == 'United-States' else 0 for x in data['native.country']]
    # data['workclass'] = [1 if x == 'Private' else 0 for x in data['workclass']]
    data['race'] = [1 if x == 'White' else 0 for x in data['race']]
    data['sex'] = [1 if x == 'Male' else 0 for x in data['sex']]
    data['income'] = [0 if x == '<=50K' else 1 for x in data['income']]
    # marital_stat = {'Divorced':'Separated', 'Separated':'Separated', 'Widowed':'Separated',
                    #'Married-AF-spouse':'Married', 'Married-civ-spouse':'Married', 'Married-spouse-absent':'Married',
                    #'Never-married':'Never married'}
    marital_stat = {'Divorced':'Single', 'Separated':'Single', 'Widowed':'Single','Never-married':'Single',
                    'Married-AF-spouse':'Married', 'Married-civ-spouse':'Married', 'Married-spouse-absent':'Married'}
    data['marital.status'] = [marital_stat[x] for x in data['marital.status']]
    workclass = {'Private':'Private',
                 'Self-emp-not-inc':'Self', 'Self-emp-inc':'Self', 'Never-worked':'Self', 'Without-pay':'Self',
                 'Federal-gov':'Public', 'Local-gov':'Public', 'State-gov':'Public'}
    data['workclass'] = [workclass[x] for x in data['workclass']]
    
transform(data)

In [6]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,77053,9,Single,,1,0,0,4356,40,1,0
1,82,Private,132870,9,Single,Exec-managerial,1,0,0,4356,18,1,0
2,66,Private,186061,10,Single,,0,0,0,4356,40,1,0
3,54,Private,140359,4,Single,Machine-op-inspct,1,0,0,3900,40,1,0
4,41,Private,264663,10,Single,Prof-specialty,1,0,0,3900,40,1,0


In [7]:
# Attention : ne pas supprimer le champ 'sex' pour la prédiction de 'occupation' ! 
# Attention : bien supprimer le champ 'fnlwgt' pour la prédiction de 'occupation' ! 

def complete_occupation(data):
    """Fill missing data in 'occupation' column with a KNN prediction """
    df = data.copy()
    df.drop('income', axis=1, inplace=True)
    if 'fnlwgt' in df.columns:
        df.drop('fnlwgt', axis=1, inplace=True)
    # Séparation des sets : train et prédiction
    train = df.dropna()
    prediction = pd.get_dummies(df[df['occupation'].isnull()].drop('occupation', axis=1))
    # Prédiction avec un modèle KNN
    X = pd.get_dummies(train.drop('occupation', axis=1))
    for x in X.columns:
        if x not in prediction.columns:
            prediction[x]=0
    Y = train['occupation']
    knn = KNeighborsClassifier(n_neighbors=23)
    knn.fit(X,Y)
    result = knn.predict(prediction)
    # Inscrire les résultats de la prédiction dans le df 'prediction'
    prediction['occupation']=[x for x in result]
    # Compléter les données manquantes du df initial ('data') grâce aux données prédites ('prediction')
    for index in prediction.index:
        data.loc[index,'occupation'] = prediction.loc[index,'occupation']
    # Calcul du score
    scores_accu = cross_val_score(knn, X, Y, cv=5, scoring='accuracy')
    print("Accuracy du modèle KNN: Moyenne:{}; Ecart-type:{}".format(np.mean(scores_accu),np.std(scores_accu)))
    #scores_f1 = cross_val_score(knn, X, Y, cv=5, scoring='f1')
    #print("F1-score: Moyenne:{}; Ecart-type:{}".format(np.mean(scores_f1),np.std(scores_f1)))
    # RQ: le calcul du F1 score ne marche pas
    return

In [8]:
complete_occupation(data)

Accuracy du modèle KNN: Moyenne:0.28358374272592274; Ecart-type:0.016599860143834518


### Modèle de prédiction Random Forest

In [9]:
# Tests de colonne à supprimer
df_test = data.copy()
#df_test.drop('sex', axis=1, inplace=True)
#df_test.drop('native.country', axis=1, inplace=True)
#df_test.drop('race', axis=1, inplace=True)
# df_test.drop('workclass', axis=1, inplace=True)

In [10]:
X = pd.get_dummies(df_test.drop('income', axis=1))
Y = df_test['income']

In [11]:
len(X.columns)

28

Grid Search

In [None]:
clf_rf = RandomForestClassifier(n_estimators=120, max_features=0.1, n_jobs=-1)

In [None]:
# pour le nombre d'arbres
n_estimators = [i for i in range(20,301,20)]
parameters = {'n_estimators': n_estimators}
clf_gs_rf = GridSearchCV(clf_rf, parameters, cv=5, scoring='accuracy')
clf_gs_rf.fit(X, Y)

In [None]:
# pour max_features
# max_features = [i for i in range(1,len(X.columns)-1)]
max_features = [2, 3, 4]
parameters = {'max_features': max_features}
clf_gs_rf = GridSearchCV(clf_rf, parameters, cv=5, scoring='accuracy')
clf_gs_rf.fit(X, Y)

In [None]:
clf_gs_rf.cv_results_

In [None]:
clf_gs_rf.best_params_

Modèle paramétré

In [12]:
clf_rf = RandomForestClassifier(n_estimators=120, max_features=0.1)
clf_rf.fit(X,Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.1, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Importance des features

In [13]:
all_features = pd.DataFrame(data=clf_rf.feature_importances_, index=X.columns, columns=['feature_importance'])

In [14]:
feature_categories = [all_features.index[i].split('_')[0] for i in range(len(X.columns))]
grouped_features = pd.DataFrame(data={'feature_importance':clf_rf.feature_importances_,'category':feature_categories}).groupby('category').sum()
grouped_features.sort_values(by='feature_importance', ascending=False)

Unnamed: 0_level_0,feature_importance
category,Unnamed: 1_level_1
fnlwgt,0.198364
age,0.181624
marital.status,0.14241
education.num,0.109971
capital.gain,0.099931
hours.per.week,0.098241
occupation,0.085902
capital.loss,0.034359
sex,0.01812
workclass,0.015117


Scores

In [15]:
# Accuracy
scores = cross_val_score(clf_rf, X, Y, cv=5, scoring='accuracy')
np.mean(scores), np.std(scores)

(0.8021564883840332, 0.019086340400260512)

In [16]:
# F1-score
scores = cross_val_score(clf_rf, X, Y, cv=5, scoring='f1')
np.mean(scores), np.std(scores)

(0.5293889320838124, 0.08945724696241929)