In [1]:
# Import modules
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Enrichissement des données
from sklearn.impute import SimpleImputer
# KNN
from sklearn.neighbors import KNeighborsClassifier
# Grid search et calcul de scores
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
# Random Forest
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

### Preprocessing

In [2]:
data = pd.read_csv("data/adult.csv", na_values='?')

In [3]:
def remove_columns(data):
    """Remove useless columns"""
    data.drop('education', axis=1, inplace=True)
    data.drop('relationship', axis=1, inplace=True)
    return

remove_columns(data)

In [4]:
def complete_except_occupation(data):
    """ Fill missing data in some columns with a SimpleImputer"""
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    data['workclass'] = imp.fit_transform(data[['workclass']])
    data['native.country'] = imp.fit_transform(data[['native.country']])
    return

complete_except_occupation(data)

In [5]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,77053,9,Widowed,,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,9,Widowed,Exec-managerial,White,Female,0,4356,18,United-States,<=50K
2,66,Private,186061,10,Widowed,,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,4,Divorced,Machine-op-inspct,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,10,Separated,Prof-specialty,White,Female,0,3900,40,United-States,<=50K


In [6]:
def transform(data):
    data['native.country'] = [1 if x == 'United-States' else 0 for x in data['native.country']]
    # data['workclass'] = [1 if x == 'Private' else 0 for x in data['workclass']]
    data['race'] = [1 if x == 'White' else 0 for x in data['race']]
    data['sex'] = [1 if x == 'Male' else 0 for x in data['sex']]
    data['income'] = [0 if x == '<=50K' else 1 for x in data['income']]
    # marital_stat = {'Divorced':'Separated', 'Separated':'Separated', 'Widowed':'Separated',
                    #'Married-AF-spouse':'Married', 'Married-civ-spouse':'Married', 'Married-spouse-absent':'Married',
                    #'Never-married':'Never married'}
    marital_stat = {'Divorced':'Single', 'Separated':'Single', 'Widowed':'Single','Never-married':'Single',
                    'Married-AF-spouse':'Married', 'Married-civ-spouse':'Married', 'Married-spouse-absent':'Married'}
    data['marital.status'] = [marital_stat[x] for x in data['marital.status']]
    workclass = {'Private':'Private',
                 'Self-emp-not-inc':'Self', 'Self-emp-inc':'Self', 'Never-worked':'Self', 'Without-pay':'Self',
                 'Federal-gov':'Public', 'Local-gov':'Public', 'State-gov':'Public'}
    data['workclass'] = [workclass[x] for x in data['workclass']]
    # data = data.sample(frac=1) # cette ligne au milieu de la fonction ne marche pas -> POURQUOI ?
    return

transform(data)

In [7]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,77053,9,Single,,1,0,0,4356,40,1,0
1,82,Private,132870,9,Single,Exec-managerial,1,0,0,4356,18,1,0
2,66,Private,186061,10,Single,,0,0,0,4356,40,1,0
3,54,Private,140359,4,Single,Machine-op-inspct,1,0,0,3900,40,1,0
4,41,Private,264663,10,Single,Prof-specialty,1,0,0,3900,40,1,0


In [8]:
data = data.sample(frac=1)

In [9]:
# Attention : ne pas supprimer le champ 'sex' pour la prédiction de 'occupation' ! 
# Attention : bien supprimer le champ 'fnlwgt' pour la prédiction de 'occupation' ! 

def complete_occupation(data):
    """Fill missing data in 'occupation' column with a KNN prediction """
    df = data.copy()
    df.drop('income', axis=1, inplace=True)
    if 'fnlwgt' in df.columns:
        df.drop('fnlwgt', axis=1, inplace=True)
    # Séparation des sets : train et prédiction
    train = df.dropna()
    prediction = pd.get_dummies(df[df['occupation'].isnull()].drop('occupation', axis=1))
    # Prédiction avec un modèle KNN
    X = pd.get_dummies(train.drop('occupation', axis=1))
    for x in X.columns:
        if x not in prediction.columns:
            prediction[x]=0
    Y = train['occupation']
    knn = KNeighborsClassifier(n_neighbors=23)
    knn.fit(X,Y)
    result = knn.predict(prediction)
    # Inscrire les résultats de la prédiction dans le df 'prediction'
    prediction['occupation']=[x for x in result]
    # Compléter les données manquantes du df initial ('data') grâce aux données prédites ('prediction')
    for index in prediction.index:
        data.loc[index,'occupation'] = prediction.loc[index,'occupation']
    # Calcul du score
    scores_accu = cross_val_score(knn, X, Y, cv=5, scoring='accuracy')
    print("Accuracy du modèle KNN: Moyenne:{}; Ecart-type:{}".format(np.mean(scores_accu),np.std(scores_accu)))
    #scores_f1 = cross_val_score(knn, X, Y, cv=5, scoring='f1')
    #print("F1-score: Moyenne:{}; Ecart-type:{}".format(np.mean(scores_f1),np.std(scores_f1)))
    # RQ: le calcul du F1 score ne marche pas
    return

In [10]:
complete_occupation(data)

Accuracy du modèle KNN: Moyenne:0.2823084617981753; Ecart-type:0.006964328243763412


In [11]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
18261,41,Private,207578,9,Married,Transport-moving,1,1,0,0,40,1,1
10521,40,Private,204046,6,Single,Transport-moving,1,1,0,0,40,1,0
9997,18,Private,128538,10,Single,Other-service,1,0,0,0,6,1,0
3340,36,Self,138940,13,Married,Exec-managerial,1,1,4386,0,50,1,1
23777,27,Private,85126,9,Single,Machine-op-inspct,1,0,0,0,40,1,0


### Modèle de prédiction Random Forest

In [12]:
# Tests de colonne à supprimer
df_test = data.copy()
# Attention à bien mélanger les lignes avant de lancer le modèle
df_test = df_test.sample(frac=1)
#df_test.drop('sex', axis=1, inplace=True)
#df_test.drop('native.country', axis=1, inplace=True)
#df_test.drop('race', axis=1, inplace=True)
# df_test.drop('workclass', axis=1, inplace=True)

In [13]:
X = pd.get_dummies(df_test.drop('income', axis=1))
Y = df_test['income']

In [14]:
len(X.columns)

28

Grid Search

In [15]:
clf_rf = RandomForestClassifier(n_estimators=200, max_features=0.1, n_jobs=-1)

In [None]:
# pour le nombre d'arbres
n_estimators = [i for i in range(20,301,20)]
parameters = {'n_estimators': n_estimators}
clf_gs_rf = GridSearchCV(clf_rf, parameters, cv=5, scoring='accuracy')
clf_gs_rf.fit(X, Y)

In [None]:
clf_gs_rf.cv_results_

In [None]:
clf_gs_rf.best_params_

In [None]:
# pour max_features
max_features = ['auto', 'log2', 0.1, 0.2, 0.5]
parameters = {'max_features': max_features}
clf_gs_rf = GridSearchCV(clf_rf, parameters, cv=5, scoring='accuracy')
clf_gs_rf.fit(X, Y)

In [None]:
clf_gs_rf.cv_results_

In [None]:
clf_gs_rf.best_params_

In [16]:
# pour les 2 à la fois
n_estimators = [i for i in range(20,301,40)]
max_features = ['auto', 'log2', 0.1, 0.5]
parameters = {'n_estimators': n_estimators, 'max_features': max_features}
clf_gs_rf = GridSearchCV(clf_rf, parameters, cv=5, scoring='accuracy')
clf_gs_rf.fit(X, Y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.1, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [20, 60, 100, 140, 180, 220, 260, 300], 'max_features': ['auto', 'log2', 0.1, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [17]:
clf_gs_rf.best_params_

{'max_features': 0.5, 'n_estimators': 300}

In [18]:
clf_gs_rf.cv_results_

{'mean_fit_time': array([ 0.82923436,  1.43628125,  2.08847394,  2.96251507,  3.64377804,
         4.48562636,  5.2692874 ,  6.07514687,  0.4806129 ,  1.23585224,
         2.04248047,  2.76392283,  3.42918525,  4.2138042 ,  4.85807838,
         5.65048532,  0.47500663,  1.0780272 ,  1.71565237,  2.39032664,
         3.10176272,  4.15723729,  5.67812414,  7.17821689,  0.95526309,
         3.04757366,  4.23782721,  5.49789381,  7.34161286,  7.79861903,
         9.27130404, 11.32757678]),
 'std_fit_time': array([0.41676695, 0.07626398, 0.00621948, 0.06779784, 0.01419693,
        0.03559776, 0.0311116 , 0.03743702, 0.03492612, 0.00683535,
        0.21294653, 0.02147836, 0.0237594 , 0.04550193, 0.01074067,
        0.01392531, 0.00799792, 0.00712478, 0.02325785, 0.01095422,
        0.15648942, 0.18919778, 0.94837562, 1.14976033, 0.05193031,
        0.39621643, 0.58999394, 0.22796291, 0.81704976, 0.04296285,
        0.30178943, 0.75471548]),
 'mean_score_time': array([0.11164656, 0.15760927, 

Modèle paramétré

In [19]:
clf_rf = RandomForestClassifier(n_estimators=300, max_features=0.5, n_jobs=-1)
clf_rf.fit(X,Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Importance des features

In [20]:
feature_categories = [X.columns[i].split('_')[0] for i in range(len(X.columns))]
grouped_features = pd.DataFrame(data={'feature_importance':clf_rf.feature_importances_,'category':feature_categories}).groupby('category').sum()
grouped_features.sort_values(by='feature_importance', ascending=False)

Unnamed: 0_level_0,feature_importance
category,Unnamed: 1_level_1
fnlwgt,0.192076
marital.status,0.178372
age,0.136332
education.num,0.121466
capital.gain,0.115186
occupation,0.080555
hours.per.week,0.074928
capital.loss,0.037465
workclass,0.029738
sex,0.014007


Scores

In [21]:
# Accuracy
scores = cross_val_score(clf_rf, X, Y, cv=5, scoring='accuracy')
np.mean(scores), np.std(scores)

(0.852154413531659, 0.001591510451724049)

In [22]:
# F1-score
scores = cross_val_score(clf_rf, X, Y, cv=5, scoring='f1')
np.mean(scores), np.std(scores)

(0.6651636847884853, 0.00731482058108925)