In [12]:
# Import modules
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Enrichissement des données
from sklearn.impute import SimpleImputer
# KNN
from sklearn.neighbors import KNeighborsClassifier
# Grid search et calcul de scores
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
# Random Forest
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

### Preprocessing

Chargement des données et premiers traitements actés

In [2]:
data = pd.read_csv("data/adult.csv", na_values='?')

In [3]:
# Traitements qui sont actés
def remove_columns(data):
    """Remove useless columns"""
    data.drop('education', axis=1, inplace=True)
    data.drop('fnlwgt', axis=1, inplace=True)
    return

remove_columns(data)

In [4]:
def complete_except_occupation(data):
    """ Fill missing data in some columns with a SimpleImputer"""
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    data['workclass'] = imp.fit_transform(data[['workclass']])
    data['native.country'] = imp.fit_transform(data[['native.country']])
    return

complete_except_occupation(data)

In [5]:
def transform(data):
    """Transform multi categorical data into binary data (to properly apply prediction models)"""
    data['native.country'] = [1 if x == 'United-States' else 0 for x in data['native.country']]
    data['workclass'] = [1 if x == 'Private' else 0 for x in data['workclass']]
    data['race'] = [1 if x == 'White' else 0 for x in data['race']]
    data['sex'] = [1 if x == 'Male' else 0 for x in data['sex']]
    data['income'] = [0 if x == '<=50K' else 1 for x in data['income']]
    
transform(data)

In [6]:
data.head()

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,1,9,Widowed,,Not-in-family,1,0,0,4356,40,1,0
1,82,1,9,Widowed,Exec-managerial,Not-in-family,1,0,0,4356,18,1,0
2,66,1,10,Widowed,,Unmarried,0,0,0,4356,40,1,0
3,54,1,4,Divorced,Machine-op-inspct,Unmarried,1,0,0,3900,40,1,0
4,41,1,10,Separated,Prof-specialty,Own-child,1,0,0,3900,40,1,0


In [7]:
def complete_occupation(data):
    """Fill missing data in 'occupation' column with a KNN prediction """
    df = data.copy()
    df.drop('income', axis=1, inplace=True)
    # Séparation des sets : train et prédiction
    train = df.dropna()
    prediction = pd.get_dummies(df[df['occupation'].isnull()].drop('occupation', axis=1))
    # Prédiction avec un modèle KNN
    X = pd.get_dummies(train.drop('occupation', axis=1))
    Y = train['occupation']
    knn = KNeighborsClassifier(n_neighbors=23)
    knn.fit(X,Y)
    result = knn.predict(prediction)
    # Inscrire les résultats de la prédiction dans le df 'prediction'
    prediction['occupation']=[x for x in result]
    # Compléter les données manquantes du df initial ('data') grâce aux données prédites ('prediction')
    for index in prediction.index:
        data.loc[index,'occupation'] = prediction.loc[index,'occupation']
    # Calcul du score
    scores_accu = cross_val_score(knn, X, Y, cv=5, scoring='accuracy')
    print("Accuracy du modèle KNN: Moyenne:{}; Ecart-type:{}".format(np.mean(scores_accu),np.std(scores_accu)))
    #scores_f1 = cross_val_score(knn, X, Y, cv=5, scoring='f1')
    #print("F1-score: Moyenne:{}; Ecart-type:{}".format(np.mean(scores_f1),np.std(scores_f1)))
    # RQ: le calcul du F1 score ne marche pas
    return

In [8]:
complete_occupation(data)

Accuracy: Moyenne:0.2807528036197878; Ecart-type:0.01565596973427888


In [9]:
data.head()

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,1,9,Widowed,Prof-specialty,Not-in-family,1,0,0,4356,40,1,0
1,82,1,9,Widowed,Exec-managerial,Not-in-family,1,0,0,4356,18,1,0
2,66,1,10,Widowed,Prof-specialty,Unmarried,0,0,0,4356,40,1,0
3,54,1,4,Divorced,Machine-op-inspct,Unmarried,1,0,0,3900,40,1,0
4,41,1,10,Separated,Prof-specialty,Own-child,1,0,0,3900,40,1,0


--> à regarder pour le preprocessing (dans le modèle et dans le KNN occupation)
- workclass : private - public - independent
- marital status et relationship : voir comment on regroupe, lequel on supprime (a priori marié - séparé - never married)
- capital loss ? 


### Modèle de prédiction Random Forest

à regarder pour modifier le modèle : 
- max features (auto, à modifier ?)

In [14]:
X = pd.get_dummies(data.drop('income', axis=1))
Y = data['income']

In [15]:
clf_rf = RandomForestClassifier(max_features='auto') #auto = racine de p
clf_rf.fit(X,Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Analyse de l'importance des features

In [35]:
all_features = pd.DataFrame(data=clf_rf.feature_importances_, index=X.columns, columns=['feature_importance'])

In [58]:
feature_categories = [all_features.index[i].split('_')[0] for i in range(len(X.columns))]
grouped_features = pd.DataFrame(data={'feature_importance':clf_rf.feature_importances_,'category':feature_categories}).groupby('category').sum()
grouped_features.sort_values(by='feature_importance', ascending=False)

Unnamed: 0_level_0,feature_importance
category,Unnamed: 1_level_1
age,0.248783
education.num,0.148206
hours.per.week,0.123603
capital.gain,0.114436
relationship,0.106097
marital.status,0.093345
occupation,0.078109
capital.loss,0.036982
workclass,0.019263
race,0.01274


Grid Search

In [None]:
n_estimators = [i for i in range(20,301,20)]
parameters = {'n_estimators': n_estimators}
clf_gs_rf = GridSearchCV(clf_rf, parameters, cv=5, scoring='accuracy')
clf_gs_rf.fit(X, Y)

In [None]:
clf_gs_rf.cv_results_

In [None]:
clf_rf = RandomForestClassifier(n_estimators=120, max_features='auto')
scores = cross_val_score(clf_rf, X, Y, cv=5, scoring='accuracy')
np.mean(scores), np.std(scores)

In [None]:
scores = cross_val_score(clf_rf, X, Y, cv=5, scoring='f1')
np.mean(scores), np.std(scores)

In [None]:
X.head()