In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Imports and functions

In [None]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

import matplotlib.pyplot as plt
import seaborn as sns

from time import time
from datetime import timedelta

from sklearn.model_selection import train_test_split,  KFold, cross_validate, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, CategoricalNB
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression

from sklearn.metrics import confusion_matrix, \
                  classification_report, accuracy_score,  precision_score, recall_score, f1_score

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
sns.set()
pd.set_option('precision', 3)

In [None]:
import warnings
warnings.filterwarnings('ignore')

np.random.seed(123) # for reproducibility

In [None]:
def confusion(true, pred):
    """
    Function for pretty printing confusion matrices
    """
    true.name = 'target'
    pred.name = 'predicted'
    cm = pd.crosstab(true.reset_index(drop=True), pred.reset_index(drop=True))
    cm = cm[cm.index]
    return cm

## Reading the file + Preprocessing

In [None]:
data = pd.read_csv("gdrive/My Drive/PROJECT_ML/DATA/data_salary_preprocessed.csv", header=0, delimiter=',', index_col=0)
data.shape

(29991, 15)

In [None]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,hours_per_week,native_country,salary,hours_per_week_interval
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,1,40,NOT_United-States,0,full_time
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,13,NOT_United-States,0,part_time
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,40,NOT_United-States,0,full_time
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,40,NOT_United-States,0,full_time
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,40,NOT_United-States,0,full_time


In [None]:
data = data.drop(['hours_per_week_interval'], axis=1)
data = data.drop(['education_num'], axis=1)

In [None]:
X = data.loc[:,data.columns != 'salary']
y = data['salary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=43)

In [None]:
numeric_features = ['age', 'fnlwgt', 'capital_gain', 'hours_per_week']

In [None]:
def preprocessing(X, y, scaler=None):    
    # We scale the numerical columns
    if scaler is None: 
        # We only want the scaler to fit the train data
        scaler = MinMaxScaler()
        X.loc[:,numeric_features] = scaler.fit_transform(X[numeric_features])
    else: 
        X.loc[:,numeric_features] = scaler.transform(X[numeric_features]) 
    # We apply one-hot-encoding to the categorical columns 
    X = pd.get_dummies(X,drop_first=True)
    return X, y, scaler

X_train, y_train, scaler = preprocessing(X_train,y_train)
X_test, y_test, _ = preprocessing(X_test,y_test,scaler)

In [None]:
X_train.head(n=5)

Unnamed: 0,age,fnlwgt,capital_gain,hours_per_week,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,...,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Male
15758,0.096,0.019,0.0,0.469,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
21277,0.055,0.212,0.0,0.347,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,1
15155,0.411,0.072,0.0,0.398,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
24383,0.247,0.154,0.0,0.398,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
11074,0.493,0.053,0.0,0.398,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


### Metrics

- **Accuracy**: This metric is *sensitive to imbalanced data*!!!
- **Precision**: This metric measures how much the model is predicting correctly a class with respect all the predictions of this class. We will use this metric when having false positive predictions is very harmful in our model context.
- **Recall**: This metric measures how much the model is predicting correctly a class with respect all the real values of this class. We will use this metric when having false negative predictions is very harmful in our model context.
- **F1-score**: The harmonic mean of precision and recall. We will use this metric when we want a good balance between precision and recall.

These metrics will give us a precise view of how our model is performing. In this particular problem all the categories are equally important. For this reason we will use the macro average of our metrics instead of focusing on the metrics of one specific class. The macro average means averaging the class-metrics.

We will use the F1-score metric because classes are not balanced. 

## LDA

In [None]:
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)

y_train_lda = lda_model.predict(X_train)

LinearDiscriminantAnalysis()

In [None]:
confusion(y_train, pd.Series(y_train_lda))

predicted,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,13960,1207
1,2085,2841


In [None]:

results_train = pd.DataFrame(index=[], columns= ['Accuracy', 'F1 Macro', 'Precision Macro', 'Recall Macro'])

cross_val_results = pd.DataFrame(cross_validate(lda_model , X_train, y_train, cv = 10, 
                            scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

results_train.loc['LDA',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values
results_train

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
LDA,0.835,0.763,0.785,0.747


## QDA

In [None]:
index= pd.MultiIndex.from_arrays([[], []], names=('model', 'reg'));
results_qda = pd.DataFrame(index=index, columns= ['accuracy','f1_macro', 'precision_macro', 'recall_macro']);

regularization_parameters = [0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1, 5, 10];

for reg in regularization_parameters:
  qda_model = QuadraticDiscriminantAnalysis(reg_param=reg);
  qda_model = qda_model.fit(X_train,y_train);
  y_pred = qda_model.predict(X_train);
  cross_val_results = pd.DataFrame(cross_validate(qda_model , X_train, y_train, cv = 5, 
                            scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))
  results_qda.loc[('QDA',reg),:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values

results_qda.sort_values(by='f1_macro', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1_macro,precision_macro,recall_macro
model,reg,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
QDA,0.1,0.773,0.738,0.728,0.794
QDA,0.01,0.744,0.716,0.716,0.787
QDA,0.5,0.817,0.695,0.788,0.669
QDA,0.001,0.704,0.681,0.698,0.767
QDA,0.0001,0.679,0.66,0.688,0.753
QDA,0.0,0.452,0.451,0.634,0.625
QDA,1.0,0.755,0.43,0.377,0.5
QDA,5.0,0.755,0.43,0.377,0.5
QDA,10.0,0.755,0.43,0.377,0.5


In [None]:
qda_model = QuadraticDiscriminantAnalysis(reg_param=0.1); ## with reg param of 0.1
qda_model = qda_model.fit(X_train,y_train);

In [None]:
y_train_qda = qda_model.predict(X_train);
confusion(y_train, pd.Series(y_train_qda));

In [None]:
## with the train data

cross_val_results = pd.DataFrame(cross_validate(qda_model , X_train, y_train, cv = 10, scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

results_train.loc['QDA-0.1',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values
results_train

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
LDA,0.835,0.763,0.785,0.747
QDA-0.01,0.773,0.738,0.728,0.794


## KNN

In [None]:
knn = KNeighborsClassifier()

knn_cv = GridSearchCV(
    estimator=knn,
    param_grid={
        'n_neighbors': range(20,50,2),
        'metric': ['euclidean', 'minkowski', 'manhattan'],
        'weights' : ["uniform", "distance"]
    },
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    refit=False
)

knn_cv.fit(X_train, y_train)
results_cv = pd.DataFrame(knn_cv.cv_results_)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'metric': ['euclidean', 'minkowski', 'manhattan'],
                         'n_neighbors': range(20, 50, 2),
                         'weights': ['uniform', 'distance']},
             refit=False,
             scoring=['accuracy', 'f1_macro', 'precision_macro',
                      'recall_macro'])

In [None]:
cols = ['param_n_neighbors', 'param_metric', 'param_weights', 
     'mean_test_accuracy',
    'mean_test_f1_macro', 'mean_test_precision_macro',
    'mean_test_recall_macro', 
    'std_test_accuracy', 'std_test_f1_macro', 'std_test_precision_macro',
    'std_test_recall_macro'
]

results_cv[cols].sort_values(by='mean_test_f1_macro',ascending=False).head(n=5)

Unnamed: 0,param_n_neighbors,param_metric,param_weights,mean_test_accuracy,mean_test_f1_macro,mean_test_precision_macro,mean_test_recall_macro,std_test_accuracy,std_test_f1_macro,std_test_precision_macro,std_test_recall_macro
59,48,minkowski,distance,0.83,0.757,0.774,0.745,0.002,0.001,0.004,0.003
29,48,euclidean,distance,0.83,0.757,0.774,0.745,0.002,0.001,0.004,0.003
25,44,euclidean,distance,0.829,0.757,0.773,0.745,0.002,0.002,0.003,0.004
55,44,minkowski,distance,0.829,0.757,0.773,0.745,0.002,0.002,0.003,0.004
89,48,manhattan,distance,0.83,0.757,0.774,0.745,0.001,0.002,0.002,0.004


In [None]:
knn = KNeighborsClassifier(n_neighbors=48, metric='minkowski', weights='distance')
knn.fit(X_train, y_train)

cross_val_results = pd.DataFrame(cross_validate(knn , X_train, y_train, cv = 10, scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

results_train=pd.DataFrame(index=[], columns= ['Accuracy', 'F1 Macro', 'Precision Macro', 'Recall Macro'])

results_train.loc['KNN-48',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values
results_train

KNeighborsClassifier(n_neighbors=48, weights='distance')

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
KNN-48,0.828,0.755,0.771,0.744


## Gaussian Naive Bayes

In [None]:
gaussian_nb = GaussianNB()
gaussian_nb = gaussian_nb.fit(X_train,y_train)
y_train_gnb = gaussian_nb.predict(X_train);

cross_val_results = pd.DataFrame(cross_validate(gaussian_nb , X_train, y_train, cv = 5, scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

results_train.loc['Gaussian-NB',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values
results_train

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
KNN-48,0.828,0.755,0.771,0.744
Gaussian-NB,0.567,0.562,0.659,0.694


In [None]:
X[numeric_features].head()

Unnamed: 0,age,fnlwgt,capital_gain,hours_per_week
0,39,77516,1,40
1,50,83311,0,13
2,38,215646,0,40
3,53,234721,0,40
4,28,338409,0,40


In [None]:
## only preprocessing numerical variables

numeric_features = ['age', 'fnlwgt', 'capital_gain', 'hours_per_week']

def preprocessing_numerical(X, scaler=None):    
    # We scale the numerical columns
    if scaler is None: 
        # We only want the scaler to fit the train data
        scaler = MinMaxScaler()
        X.loc[:,numeric_features] = scaler.fit_transform(X[numeric_features])
    else: 
        X.loc[:,numeric_features] = scaler.transform(X[numeric_features]) 
    return X, scaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=43)

X_train_numerical, scaler = preprocessing_numerical(X_train[numeric_features])
X_test_numerical, _ = preprocessing_numerical(X_test[numeric_features],scaler)



In [None]:
# gaussian_nb = GaussianNB()
# gaussian_nb = gaussian_nb.fit(X_train,y_train)
# y_train_gnb = gaussian_nb.predict(X_train);

gaussian_nb_num = GaussianNB()
gaussian_nb_num = gaussian_nb_num.fit(X_train_numerical,y_train)
y_train_gnb_num = gaussian_nb_num.predict(X_train_numerical);

cross_val_results = pd.DataFrame(cross_validate(gaussian_nb_num , X_train_numerical, y_train, cv = 5, scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

results_train.loc['Gaussian-NB-only-numerical',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values
results_train

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
KNN-48,0.828,0.755,0.771,0.744
Gaussian-NB,0.567,0.562,0.659,0.694
Gaussian-NB-only-numerical,0.769,0.574,0.688,0.573


## Logistic Regression

#### Logistic regression

In [None]:
logreg = LogisticRegression()
log_cv = GridSearchCV(
    estimator=logreg,
    param_grid={
        'C':np.logspace(-3,3,7),
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'penalty':['none', 'elasticnet', 'l1', 'l2']
    },
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    refit=False
)
log_cv.fit(X_train, y_train)
results_cv = pd.DataFrame(log_cv.cv_results_)

cols = ['param_C', 'param_penalty', 'param_solver',
     'mean_test_accuracy',
    'mean_test_f1_macro', 'mean_test_precision_macro',
    'mean_test_recall_macro', 
    'std_test_accuracy', 'std_test_f1_macro', 'std_test_precision_macro',
    'std_test_recall_macro'
]
results_cv[cols].sort_values(by='mean_test_f1_macro',ascending=False).head(n=5)

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['none', 'elasticnet', 'l1', 'l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']},
             refit=False,
             scoring=['accuracy', 'f1_macro', 'precision_macro',
                      'recall_macro'])

Unnamed: 0,param_C,param_penalty,param_solver,mean_test_accuracy,mean_test_f1_macro,mean_test_precision_macro,mean_test_recall_macro,std_test_accuracy,std_test_f1_macro,std_test_precision_macro,std_test_recall_macro
97,10.0,l2,liblinear,0.837,0.765,0.788,0.75,0.005,0.008,0.007,0.008
99,10.0,l2,saga,0.837,0.765,0.788,0.75,0.005,0.007,0.007,0.008
95,10.0,l2,newton-cg,0.837,0.765,0.788,0.75,0.005,0.007,0.007,0.008
98,10.0,l2,sag,0.837,0.765,0.788,0.75,0.005,0.007,0.007,0.008
92,10.0,l1,liblinear,0.837,0.765,0.788,0.75,0.005,0.007,0.007,0.008


In [None]:
logreg = LogisticRegression(C=10, penalty='l2', solver='saga', multi_class='multinomial')
cross_val_results = pd.DataFrame(cross_validate(logreg, X_train, y_train, cv = 10, scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

results_train=pd.DataFrame(index=[], columns= ['Accuracy', 'F1 Macro', 'Precision Macro', 'Recall Macro'])

results_train.loc['Logistic Regression',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values

results_train.sort_values(by='F1 Macro', ascending=False)

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
Logistic Regression,0.837,0.764,0.787,0.749
