## <center><span>Step 1 : DATA PREPARATION</span></center>

In [1]:
# Loading important libraries and modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### <center><span>Step 1.1 : Data Assessment</span></center>

In [2]:
# Reading the dataset
headers_name = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',\
                'marital_status', 'occupation', 'relationship', 'race',\
                'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'salary'
               ]
adult = pd.read_csv("adult.data", header=None, names=headers_name, na_values=' ?')
df_org = adult.copy()

In [3]:
# Information about the datasets columns

adult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  31978 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
# Checking for duplicated data

duplicated = adult.duplicated()
adult[duplicated].shape

(24, 15)

In [5]:
# Checking for nan 

adult.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     583
salary               0
dtype: int64

In [6]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
# Selecting only object columns from the dataset
obj_adult = adult.select_dtypes(include='object')

# Selecting only numeric columns from the dataset
num_adult = adult.select_dtypes(exclude='object')

In [8]:
obj_adult.nunique()

workclass          8
education         16
marital_status     7
occupation        14
relationship       6
race               5
sex                2
native_country    41
salary             2
dtype: int64

In [9]:
obj_adult.workclass.unique()

for col in obj_adult:
    print('------'*10)
    print(f'{col} : {obj_adult[col].unique()}')
    print()

------------------------------------------------------------
workclass : [' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 nan ' Self-emp-inc' ' Without-pay' ' Never-worked']

------------------------------------------------------------
education : [' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']

------------------------------------------------------------
marital_status : [' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']

------------------------------------------------------------
occupation : [' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' nan
 ' Protective-serv' ' Armed-Forces' ' Priv-house-serv']

------

In [10]:
# Describe numeric data

num_adult.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


### Interprétation

* Dataset shape : 32561 individus over 15 columns or variables.  

* columns Name `fnlwgt` is not exhaustive.  

* 24 duplicated data

* Missing values(nan) : 
    >workclass      : 1836  
    >occupation     : 1843  
    >native_country : 583  

### <center><span>Step 1.2 : Data Cleaning</span></center> 

#### Dealing with duplicated values : Drop all Duplicated Data

In [11]:
# Drop all duplicated data

adult.drop_duplicates(inplace=True)

In [12]:
adult.duplicated().sum()

0

#### Dealing with Missings values(nan) : 

In [13]:
# Listing missings values sum
adult.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     582
salary               0
dtype: int64

In [14]:
adult[adult['occupation'].isna()]

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
27,54,,180211,Some-college,10,Married-civ-spouse,,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K
61,32,,293936,7th-8th,4,Married-spouse-absent,,Not-in-family,White,Male,0,0,40,,<=50K
69,25,,200681,Some-college,10,Never-married,,Own-child,White,Male,0,0,40,United-States,<=50K
77,67,,212759,10th,6,Married-civ-spouse,,Husband,White,Male,0,0,2,United-States,<=50K
106,17,,304873,10th,6,Never-married,,Own-child,White,Female,34095,0,32,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,35,,320084,Bachelors,13,Married-civ-spouse,,Wife,White,Female,0,0,55,United-States,>50K
32531,30,,33811,Bachelors,13,Never-married,,Not-in-family,Asian-Pac-Islander,Female,0,0,99,United-States,<=50K
32539,71,,287372,Doctorate,16,Married-civ-spouse,,Husband,White,Male,0,0,10,United-States,>50K
32541,41,,202822,HS-grad,9,Separated,,Not-in-family,Black,Female,0,0,32,United-States,<=50K


In [15]:
# Dropped all nan value with dropna() method
adult.dropna(inplace=True)

In [16]:
# Checking the success of the drop
adult.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
salary            0
dtype: int64

#### Handle exhaustiviness of columns name

In [17]:
adult.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30139 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             30139 non-null  int64 
 1   workclass       30139 non-null  object
 2   fnlwgt          30139 non-null  int64 
 3   education       30139 non-null  object
 4   education_num   30139 non-null  int64 
 5   marital_status  30139 non-null  object
 6   occupation      30139 non-null  object
 7   relationship    30139 non-null  object
 8   race            30139 non-null  object
 9   sex             30139 non-null  object
 10  capital_gain    30139 non-null  int64 
 11  capital_loss    30139 non-null  int64 
 12  hours_per_week  30139 non-null  int64 
 13  native_country  30139 non-null  object
 14  salary          30139 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [18]:
# Rename column name fnlwgt by final_weight
adult.rename(columns={'fnlwgt':'final_weight'}, inplace=True)

In [19]:
adult.head()

Unnamed: 0,age,workclass,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### <center><span>STEP 2 : FEATURES INGENERING</span></center>

#### Hypotheses : Les personnes ayant un salaire superieur à 50K sont généralement des vieux.

In [20]:
adult.sample(10)

Unnamed: 0,age,workclass,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
13264,38,Private,149347,Some-college,10,Divorced,Sales,Not-in-family,White,Male,0,0,60,United-States,<=50K
11531,45,Local-gov,255559,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K
13794,37,Federal-gov,125933,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,>50K
26226,35,Private,509462,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,45,United-States,<=50K
8979,34,Self-emp-not-inc,204375,Prof-school,15,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,60,United-States,>50K
14820,33,Private,248584,Some-college,10,Married-civ-spouse,Other-service,Husband,White,Male,0,0,50,United-States,<=50K
17585,42,Private,345363,HS-grad,9,Divorced,Exec-managerial,Not-in-family,White,Female,0,0,40,England,<=50K
19243,40,Private,183404,Some-college,10,Separated,Other-service,Unmarried,White,Female,0,0,8,United-States,<=50K
8550,33,Private,159247,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,>50K
25283,56,Private,143266,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K


In [21]:
adult[['capital_gain', 'capital_loss']].nunique()

capital_gain    118
capital_loss     90
dtype: int64

In [22]:
adult.education.unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' 7th-8th', ' Doctorate',
       ' Assoc-voc', ' Prof-school', ' 5th-6th', ' 10th', ' Preschool',
       ' 12th', ' 1st-4th'], dtype=object)

In [23]:
educations = [' Doctorate', ' Bachelors', ' Masters']
high_educ_index = adult[adult.education.isin(educations)].index.to_list()
# adult.loc[high_educ_index]
adult['high_education'] = 0
adult.loc[high_educ_index, 'high_education'] = 1

In [24]:
adult.head()

Unnamed: 0,age,workclass,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary,high_education
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,1


In [25]:
adult_agg = adult.groupby(['native_country','high_education', 'salary']).agg({'hours_per_week':'max'}).reset_index()
adult = pd.merge(adult, adult_agg, how="left", on=['native_country','high_education', 'salary'])

adult.rename(columns={'hours_per_week_x':'hours_per_week', 'hours_per_week_y':'hours_per_week_max'}, inplace=True)

hours_per_week_ratio = adult['hours_per_week'] / adult['hours_per_week_max']

In [26]:
adult.shape

(30139, 17)

In [27]:
adult.insert(17, "hours_per_week_ratio", hours_per_week_ratio)

In [28]:
adult.head()

Unnamed: 0,age,workclass,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary,high_education,hours_per_week_max,hours_per_week_ratio
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,1,99,0.40404
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,1,99,0.131313
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0,99,0.40404
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0,99,0.40404
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,1,52,0.769231


In [29]:
capital = adult.capital_gain - adult.capital_loss

adult.insert(12, 'capital', capital)

adult.drop(columns=['capital_loss', 'capital_gain'], inplace=True)

In [30]:
adult.head()

Unnamed: 0,age,workclass,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital,hours_per_week,native_country,salary,high_education,hours_per_week_max,hours_per_week_ratio
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,40,United-States,<=50K,1,99,0.40404
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,13,United-States,<=50K,1,99,0.131313
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,40,United-States,<=50K,0,99,0.40404
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,40,United-States,<=50K,0,99,0.40404
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,40,Cuba,<=50K,1,52,0.769231


### <center><span>STEP 3 : PREDICTION MODEL</span></center>

In [116]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.utils import shuffle
import smote_variants as sv

In [32]:
features = adult.drop(columns="salary")
obj_features = features.select_dtypes(include='object')

X = pd.get_dummies(obj_features)
y = adult.salary

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

#### Random Forest

In [61]:
param_grid={'n_estimators':[500, 100, 1000, 2000, 3000], 'criterion':['entropy', 'gini'], 'max_depth':[3, 5, 10, 20, 50],\
           'min_samples_leaf':[3, 5, 10, 20, 50], 'random_state':[0]
           }

In [67]:
rf_gscv = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_grid, cv=5)
rf_gscv.fit(X_train, y_train)

In [77]:
y_train.value_counts(normalize=True)

 <=50K    0.752685
 >50K     0.247315
Name: salary, dtype: float64

In [71]:
rf_clf = rf_gscv.best_estimator_
rf_clf.fit(X_train, y_train)
print(rf_clf.score(X_train, y_train))
print(rf_clf.score(X_test, y_test))

0.8251005765003525
0.8223291307232913


In [72]:
y_predict = rf_clf.predict(X_test)
labels = y_test.unique()
f1_score(y_test, y_predict, average=None, labels=labels)

array([0.88762984, 0.57584158])

In [75]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

       <=50K       0.84      0.94      0.89      4485
        >50K       0.74      0.47      0.58      1543

    accuracy                           0.82      6028
   macro avg       0.79      0.71      0.73      6028
weighted avg       0.81      0.82      0.81      6028



In [114]:
print(confusion_matrix(y_test, y_predict))

[[3246 1239]
 [ 223 1320]]


#### Under-Sampling

In [98]:
training_data = pd.concat([X_train, y_train], axis=1)
max_salary_index = training_data.query("salary==' <=50K'").index.to_list()
max_sal_df = training_data.loc[max_salary_index]
min_salary_index = training_data.query("salary==' >50K'").index.to_list()
min_sal_df = training_data.loc[min_salary_index]
len(max_salary_index), len(min_salary_index)

(18148, 5963)

In [102]:
max_sal_sample = max_sal_df.sample(n=len(min_salary_index), random_state=42)

In [106]:
sample_train_data = pd.concat([max_sal_sample, min_sal_df])
sample_train_data = shuffle(sample_train_data)

In [108]:
_X_train = sample_train_data.iloc[:, :-1].values
_y_train = sample_train_data.salary.values

In [110]:
rf_clf_us = rf_gscv.best_estimator_
rf_clf_us.fit(_X_train, _y_train)
print(rf_clf_us.score(_X_train, _y_train))
print(rf_clf_us.score(X_test.values, y_test.values))

0.7824081838001006
0.7574651625746516


In [115]:
y_predict = rf_clf_us.predict(X_test.values)
print(classification_report(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))

              precision    recall  f1-score   support

       <=50K       0.94      0.72      0.82      4485
        >50K       0.52      0.86      0.64      1543

    accuracy                           0.76      6028
   macro avg       0.73      0.79      0.73      6028
weighted avg       0.83      0.76      0.77      6028

[[3246 1239]
 [ 223 1320]]


In [117]:
smote = sv.SMOTE()

In [123]:
_X_train, _y_train = smote.sample(np.array(X_train), np.array(y_train))

2023-02-03 19:15:24,154:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'random', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'SMOTE'}")
2023-02-03 19:15:24,168:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2023-02-03 19:15:24,170:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2023-02-03 19:15:25,105:INFO:SMOTE: simplex sampling with n_dim 2


In [124]:
pd.Series(_y_train).value_counts()

 <=50K    18148
 >50K     18148
dtype: int64

In [125]:
rf_clf_smote = rf_gscv.best_estimator_
rf_clf_smote.fit(_X_train, _y_train)
print(rf_clf_smote.score(_X_train, _y_train))
print(rf_clf_smote.score(X_test.values, y_test.values))

0.8106953934317831
0.7768745852687459


In [126]:
y_predict = rf_clf_us.predict(X_test.values)
print(classification_report(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))

              precision    recall  f1-score   support

       <=50K       0.93      0.75      0.83      4485
        >50K       0.54      0.84      0.66      1543

    accuracy                           0.78      6028
   macro avg       0.74      0.80      0.75      6028
weighted avg       0.83      0.78      0.79      6028

[[3382 1103]
 [ 242 1301]]


### <center><span>STEP 4 : CONCLUSION AND LIMIT</span></center>