In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import scipy.sparse as sps
from statsmodels.api import Logit
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.metrics import classification_report, precision_recall_curve, roc_curve, confusion_matrix

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
import sklearn

Here the results from 'Adult EDA" file are going to be used

In [2]:
# %run "Adult EDA.ipynb"

In [3]:
adult_columns = [
    "Age",
    "Workclass",
    "final weight",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Ethnic group",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "Income",
]

df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", 
                 header = None, names = adult_columns)
df = df.replace(to_replace= ' ?', value = np.nan)

In [4]:
df = df.drop(['Education-Num'], axis = 'columns')

In [5]:
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [6]:
# for col in categorical_features_list:
#     print(f'{col}\n{df[col].value_counts()}\n')

As in the whole dataset (X) we have only one 'Holand-Netherlands' value in 'Country' column, we have to process it separately, because in case if it appears in the test set, model will not be able to predict target for such a record. For the initial model, where there are no changes in data, this observation will be removed

In [7]:
print(df.shape)
df_no_nl = df.copy()
df_no_nl.drop(df_no_nl.loc[df['Country']==' Holand-Netherlands'].index, inplace=True)
print(df_no_nl.shape)

(32561, 14)
(32560, 14)


In [8]:
X = df_no_nl.drop(['Income'], axis = 'columns')
y = df_no_nl['Income']

In [9]:
X, X_test, y, y_test = train_test_split(X, y, test_size = 0.2)

## 1. Features preprocessing

First, all variables have to be transformed to numerical format to feed them to LogisticRegression function:

In [10]:
X_train = X.copy()
y_train = y.copy()

data_train = pd.merge(left=y_train, right=X_train, left_index=True, right_index=True)
data_train.shape

(26048, 14)

In [11]:
data_train.head()

Unnamed: 0,Income,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
28566,<=50K,31,Private,246439,Some-college,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,45,United-States
19230,<=50K,35,,98989,9th,Divorced,,Own-child,Amer-Indian-Eskimo,Male,0,0,38,United-States
5283,<=50K,40,Private,167265,Masters,Divorced,Prof-specialty,Not-in-family,White,Male,0,0,43,United-States
24642,<=50K,19,Private,181572,Some-college,Never-married,Adm-clerical,Own-child,White,Male,0,0,40,United-States
8644,<=50K,42,Private,175133,Some-college,Never-married,Machine-op-inspct,Unmarried,Black,Female,0,0,40,United-States


In this dataset we have only one feature, where the order matters - Education, so it will be transformed with using OrdinalEncoder. For all the rest of categorical features the order does not matter, hense we can apply OneHotEncoder() to them.

# 1st model 
### Inital model without changes in data

In [12]:
numerical_features_list = ['Age', 'final weight', 'Capital Gain', 'Capital Loss', 'Hours per week']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [13]:
# A function, that performs all needed data preparation and feeds it to LogisticRegression

def get_LR_performance(data, numerical_features_list: list, categorical_features_list:list):
    """The function performs data preprocessing, creates pipeline with LogisticRegression model, 
        and prints it's performance out

    Args:
        data: full dataset with features and target
        numerical_features_list (list): list of features, that have to be processed by Standard scaler
        categorical_features_list (list): list of features, that have to be processed by OneHotEncoder
    """

    X = data.drop(columns=['Income'])
    y = data["Income"]

    columntransformer = ColumnTransformer(transformers = [
    ('ordinal', OrdinalEncoder(categories=[[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                      ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                      ' Bachelors',' Masters',' Prof-school',' Doctorate']]),
                                make_column_selector(pattern = 'Education')),
    ('stand scaler', StandardScaler(), numerical_features_list),
    ('onehot', OneHotEncoder(dtype='int', drop='first'), categorical_features_list)],
    remainder='drop')
    
    pipe = make_pipeline(columntransformer, LogisticRegression(max_iter=10000)).fit(X, y)

    y_pred = pipe.predict(X)
    
    scores = cross_val_score(pipe, X, y, cv=5, scoring='f1_macro')
    f1_mean_score = round(np.mean(scores),2)
    f1_std = round(np.std(scores),2)
    
    report = classification_report(y, y_pred, target_names=data['Income'].unique())
   
    print(f'f1 score: mean = {f1_mean_score} | std = {f1_std}')
    print(report)


In [14]:
get_LR_performance(data_train, numerical_features_list, categorical_features_list)

f1 score: mean = 0.78 | std = 0.0
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.91     19754
        >50K       0.74      0.61      0.67      6294

    accuracy                           0.85     26048
   macro avg       0.81      0.77      0.79     26048
weighted avg       0.85      0.85      0.85     26048



### Let's now understand significance of features with the help of Logit() function from statsmodel

In [16]:
def logit_summary(data, numerical_features: list, categorical_features: list):
    """Function performs data preprocessing and applies Logit() function. After that retuns summary which contains featues significances

    Args:
        X (Series object): X_train DataFrame of features
        y (array): y_train - target
        numerical_features_list (list): list of features, that have to be processed by Standard scaler
        categorical_features_list (list): list of features, that have to be processed by OneHotEncoder

    Returns:
        Summary: summary of statsmodel Logit() model with the help of which the decision about 
                keeping or modifying/removing a feature can be made
    """

    X = data.drop(columns=['Income'])
    y = data["Income"]

    column_transformer = ColumnTransformer(transformers = [
        ('ordinal', OrdinalEncoder(categories=[[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                          ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                          ' Bachelors',' Masters',' Prof-school',' Doctorate']]),
         make_column_selector(pattern = 'Education')),
        ('stand_scaler', StandardScaler(), numerical_features),
        ('onehot', OneHotEncoder(dtype='int', drop='first'), categorical_features)],
        remainder='drop')
    
    X_trans = column_transformer.fit_transform(X)
    
    if sps.issparse(X_trans):
        X_trans = X_trans.toarray()
        
    x_columns_names = column_transformer.get_feature_names_out()
    X_trans = pd.DataFrame(X_trans, columns = x_columns_names)
    
    y_train_df = pd.DataFrame(y)
    onehot = OneHotEncoder(dtype='int', drop='first')
    y_trans = onehot.fit_transform(y_train_df)
    y_column_name = onehot.get_feature_names_out()
    y_trans = pd.DataFrame.sparse.from_spmatrix(y_trans, columns=y_column_name)
    
    model = Logit(y_trans, X_trans).fit_regularized()
    summary = model.summary()
    
    return summary

In [17]:
summary = logit_summary(data_train, numerical_features_list, categorical_features_list)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.31675231516667335
            Iterations: 721
            Function evaluations: 724
            Gradient evaluations: 721


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,26048.0
Model:,Logit,Df Residuals:,25964.0
Method:,MLE,Df Model:,83.0
Date:,"Sat, 22 Apr 2023",Pseudo R-squ.:,0.4272
Time:,17:07:22,Log-Likelihood:,-8250.8
converged:,True,LL-Null:,-14403.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2819,0.008,35.209,0.000,0.266,0.298
stand_scaler__Age,0.3457,0.024,14.246,0.000,0.298,0.393
stand_scaler__final weight,0.0743,0.020,3.668,0.000,0.035,0.114
stand_scaler__Capital Gain,2.3423,0.086,27.361,0.000,2.175,2.510
stand_scaler__Capital Loss,0.2573,0.017,15.446,0.000,0.225,0.290
stand_scaler__Hours per week,0.3716,0.022,16.607,0.000,0.328,0.415
onehot__Workclass_ Local-gov,-0.7148,0.122,-5.868,0.000,-0.953,-0.476
onehot__Workclass_ Never-worked,-2.3906,,,,,
onehot__Workclass_ Private,-0.5591,0.100,-5.609,0.000,-0.754,-0.364


# 2nd model
### Same model, but without 'final weight'

As we remember from EDA, **'final weight'** feature did not pass the significance border. Let's try to remove it and check the performance

In [None]:
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss', 'Hours per week']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [None]:
get_LR_performance(data_train, numerical_features_list, categorical_features_list)

#### Performance in generel has not changed, let's check if features' significances have changed

In [None]:
summary = logit_summary(data_train, numerical_features_list, categorical_features_list)
summary

According to Logit() results, all of numerical features are statistically significant. Some categoties in a couple of categotical features have to be clustered as they are insignificant. 

Assumption 1. Workclasses representatives, that do not work or work without pay will have less than 50k, so can become one cluster.

Assumption 2. Single people tend to earn more, as they have more free time for career development; so values of Marital Status feature can be clustered to Sigle and Married 

Assumption 3. Occupation has no impact on Income, as all categories are insignificant, so could be removed from the model. But before, they will be left like this, as from the EDA we saw that this feature is significant

Assumption 4. All categories of Relationship, Ethnic Group and Sex features are significant.

Assumption 5. Most of countries have no impact on target, it's possible to cluster them to developed and developing. 

# 3rd model
### Clustering categories of features


In [18]:
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [19]:
data_clustered = data_train.copy()

def cluster_categorical(data):
    data['Workclass'] = data['Workclass'].replace({' Never-worked': 'Without-pay'})

    # cluster Marital status
    data.loc[
        lambda x: x["Marital Status"].isin([' Widowed', ' Separated', ' Married-spouse-absent', ' Never-married', ' Divorced']), "Marital Status"
    ] = "Single"

    data.loc[
        lambda x: x["Marital Status"].isin([' Married-AF-spouse', ' Married-civ-spouse']), "Marital Status"
    ] = "Married"

    # cluster Countries
    data.loc[
        lambda x: x["Country"].isin([' Holand-Netherlands', ' Scotland', ' Italy', ' England', ' Ireland', ' Germany', ' Hong',  ' France', ' Taiwan', 
                                    ' Japan', ' Puerto-Rico', ' Canada', ' United-States']), "Country"
    ] = "Developed"

    data.loc[
        lambda x: x["Country"].isin([' Hungary', ' Greece', ' Portugal', ' Poland', ' Yugoslavia', ' Cambodia', ' Iran',  ' Philippines', ' Laos', ' Thailand', ' Vietnam', ' South', 
                                    ' China', ' India', ' Honduras', ' Outlying-US(Guam-USVI-etc)', ' Trinadad&Tobago', ' Ecuador',  ' Philippines', ' Nicaragua',
                                    ' Peru', ' Haiti', ' Columbia', ' Guatemala', ' Dominican-Republic', ' Jamaica',  ' Cuba', ' El-Salvador', ' Mexico']), "Country"
    ] = "Developing"

cluster_categorical(data_clustered)

In [20]:
data_clustered.head()

Unnamed: 0,Income,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
28566,<=50K,31,Private,246439,Some-college,Married,Craft-repair,Husband,White,Male,0,0,45,Developed
19230,<=50K,35,,98989,9th,Single,,Own-child,Amer-Indian-Eskimo,Male,0,0,38,Developed
5283,<=50K,40,Private,167265,Masters,Single,Prof-specialty,Not-in-family,White,Male,0,0,43,Developed
24642,<=50K,19,Private,181572,Some-college,Single,Adm-clerical,Own-child,White,Male,0,0,40,Developed
8644,<=50K,42,Private,175133,Some-college,Single,Machine-op-inspct,Unmarried,Black,Female,0,0,40,Developed


Let's now apply the pipeline to updated dataset

In [21]:
get_LR_performance(data_clustered, numerical_features_list, categorical_features_list)

f1 score: mean = 0.78 | std = 0.0
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.91     19754
        >50K       0.74      0.60      0.66      6294

    accuracy                           0.85     26048
   macro avg       0.81      0.77      0.78     26048
weighted avg       0.85      0.85      0.85     26048



In [22]:
summary = logit_summary(data_clustered, numerical_features_list, categorical_features_list)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3223430914441633
            Iterations: 341
            Function evaluations: 344
            Gradient evaluations: 341


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,26048.0
Model:,Logit,Df Residuals:,26007.0
Method:,MLE,Df Model:,40.0
Date:,"Sat, 22 Apr 2023",Pseudo R-squ.:,0.4171
Time:,17:08:29,Log-Likelihood:,-8396.4
converged:,True,LL-Null:,-14403.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2316,0.009,24.830,0.000,0.213,0.250
stand_scaler__Age,0.3593,0.024,15.229,0.000,0.313,0.406
stand_scaler__final weight,0.0880,0.020,4.424,0.000,0.049,0.127
stand_scaler__Capital Gain,2.3719,0.086,27.638,0.000,2.204,2.540
stand_scaler__Capital Loss,0.2604,0.017,15.748,0.000,0.228,0.293
stand_scaler__Hours per week,0.3776,0.022,17.054,0.000,0.334,0.421
onehot__Workclass_ Local-gov,-1.1778,0.116,-10.110,0.000,-1.406,-0.949
onehot__Workclass_ Private,-1.0207,0.094,-10.843,0.000,-1.205,-0.836
onehot__Workclass_ Self-emp-inc,-0.8301,0.130,-6.375,0.000,-1.085,-0.575


1. Workclass 'Without pay' is still innsignificant, will try to remove these instances (there is a small amount of them)
2. Relationship 'Unmarried' and 'Not-in-family' are both insignificant
3. 

Country is still sighniffacant, but all categories of Workclass and Occupation variables are insignifficant. Let's try to apply 'label encoder' to them instead of 'one hot encoder'
# 4th model

In [None]:
categorical_features_list = ['Marital Status','Relationship', 'Ethnic group', 'Country', 'Sex']
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss', 'Hours per week', 'Occupation', 'Workclass']

In [25]:
data_train

Unnamed: 0,Income,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
28566,<=50K,31,Private,246439,Some-college,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,45,United-States
19230,<=50K,35,,98989,9th,Divorced,,Own-child,Amer-Indian-Eskimo,Male,0,0,38,United-States
5283,<=50K,40,Private,167265,Masters,Divorced,Prof-specialty,Not-in-family,White,Male,0,0,43,United-States
24642,<=50K,19,Private,181572,Some-college,Never-married,Adm-clerical,Own-child,White,Male,0,0,40,United-States
8644,<=50K,42,Private,175133,Some-college,Never-married,Machine-op-inspct,Unmarried,Black,Female,0,0,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10684,<=50K,64,,239529,11th,Widowed,,Not-in-family,White,Female,3674,0,35,United-States
17827,<=50K,25,Private,114838,Prof-school,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,8,Italy
31826,>50K,36,Self-emp-inc,251730,Some-college,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,60,United-States
6071,<=50K,57,Self-emp-not-inc,286836,10th,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,8,United-States


In [28]:
data_clustered = data_train.copy()

le  = LabelEncoder()

data_clustered['Workclass'] = le.fit_transform(data_clustered['Workclass'])
data_clustered['Occupation'] = le.fit_transform(data_clustered['Occupation'])

cluster_categorical(data_clustered)

In [29]:
data_clustered

Unnamed: 0,Income,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
28566,<=50K,31,Private,246439,Some-college,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,45,United-States
19230,<=50K,35,,98989,9th,Divorced,,Own-child,Amer-Indian-Eskimo,Male,0,0,38,United-States
5283,<=50K,40,Private,167265,Masters,Divorced,Prof-specialty,Not-in-family,White,Male,0,0,43,United-States
24642,<=50K,19,Private,181572,Some-college,Never-married,Adm-clerical,Own-child,White,Male,0,0,40,United-States
8644,<=50K,42,Private,175133,Some-college,Never-married,Machine-op-inspct,Unmarried,Black,Female,0,0,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10684,<=50K,64,,239529,11th,Widowed,,Not-in-family,White,Female,3674,0,35,United-States
17827,<=50K,25,Private,114838,Prof-school,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,8,Italy
31826,>50K,36,Self-emp-inc,251730,Some-college,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,60,United-States
6071,<=50K,57,Self-emp-not-inc,286836,10th,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,8,United-States


In [None]:
get_LR_performance(data_clustered, numerical_features_list, categorical_features_list)

In [None]:
summary = logit_summary(X_clustered, y_train, numerical_features_list, categorical_features_list)
summary

All features are now significant, but the performance of model in total and especially for the minority class is now worse.
# 6th model

Let's try to remove missing data

In [None]:
X_no_missing_values = X_train.copy()
y_no_missing_values = y_train.copy()
df_no_missing_values = pd.concat([X_no_missing_values, y_no_missing_values], axis = 1)
df_no_missing_values = df_no_missing_values[df_no_missing_values['Workclass'] != ' ?']
df_no_missing_values = df_no_missing_values[df_no_missing_values['Occupation'] != ' ?']
df_no_missing_values = df_no_missing_values[df_no_missing_values['Country'] != ' ?']
X_no_missing_values = df_no_missing_values.drop(['Income'], axis=1)
y_no_missing_values = df_no_missing_values['Income']

In [None]:
df_no_missing_values

In [None]:
print(X_no_missing_values.shape, y_no_missing_values.shape)
print(X_train.shape, y_train.shape)

In [None]:
numerical_features_list = ['Age', 'final weight', 'Capital Gain', 'Capital Loss', 'Hours per week']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [None]:
X_no_missing_values = cluster_categorical(X_no_missing_values)

In [None]:
get_LR_performance(X_no_missing_values, y_no_missing_values, numerical_features_list, categorical_features_list)

This approach to data preprocessing gave us the best result so far - performance is just a bit better than we had for initial model. It's still not a good model though
# 7th model
### Let's try to apply ln() function to 'Age', 'Capital Gain' and 'Capital Loss' festures (as they are heavy tailed) before Standard Scaler to normalize it

In [None]:
X_logged = X_train.copy()

In [None]:
X_logged['Capital Gain'] = np.log(1+ X_logged['Capital Gain'])
X_logged['Capital Loss'] = np.log(1+ X_logged['Capital Loss'])
X_logged['Age'] = np.log(X_logged['Age'])

X_logged = cluster_categorical(X_logged)

In [None]:
get_LR_performance(X_logged, y_train, numerical_features_list, categorical_features_list)

Not better either.
# 8th model
### Another try is to cluster 'Hours per week' feature to part-time, fulltime and overtime workers with fulltime value for 40 hours

In [None]:
X_new = X_train.copy()
X_new['Hours per week'] = np.where(X_new['Hours per week'] == 40, 'fulltime', 
                                   (np.where(X_new['Hours per week'] < 40, 'part-time', 'overtime')))

X_new['Capital Gain'] = np.log(1+ X_new['Capital Gain'])
X_new['Capital Loss'] = np.log(1+ X_new['Capital Loss'])
X_new['Age'] = np.log(X_new['Age'])

X_new = cluster_categorical(X_new)

X_new = X_new.drop(['final weight'], axis='columns')

In [None]:
X_new.head()

In [None]:
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country', 'Hours per week']

In [None]:
columntransformer = ColumnTransformer(transformers = [
    ('ordinal', OrdinalEncoder(categories=[[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                      ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                      ' Bachelors',' Masters',' Prof-school',' Doctorate']]),
     make_column_selector(pattern = 'Education')),
    ('onehot', OneHotEncoder(dtype='int', drop='first'), categorical_features_list)],
    remainder='passthrough')
    
pipe = make_pipeline(columntransformer, LogisticRegression(max_iter=1000)).fit(X_new, y_train)

y_pred = pipe.predict(X_new)

scores = cross_val_score(pipe, X_new, y_train, cv=5, scoring='f1_macro')

f1_mean_score = round(np.mean(scores),2)
f1_std = round(np.std(scores),2)

report = classification_report(y_train, y_pred, target_names=y_train.unique())
print(f'f1 score: mean = {f1_mean_score} | std = {f1_std}\n{report}')

# 9th model
### Lets now try to cluster all minority categories of imbalanced features together

In [None]:
X_cluster2 = X_train.copy()
def balance_predictors(X):
    X['Ethnic group'] = np.where(X['Ethnic group'] != ' White', 'Other', X['Ethnic group'])
    X['Country'] = np.where(X['Country'] != ' United-States', 'Other', X['Country'])
    X['Workclass'] = np.where(X['Workclass'] != ' Private', 'Other', X['Workclass'])
    X['Marital Status'] = np.where(((X['Marital Status'] == ' Widowed') |
                                    (X['Marital Status'] == ' Married-spouse-absent') |
                                    (X['Marital Status'] == ' Separated')), 
                                    'Other', X_train['Marital Status'])
    X['Occupation'] = np.where(((X['Occupation'] == ' Adm-clerical') |
                                (X['Occupation'] == ' Armed-Forces') |
                                (X['Occupation'] == ' Craft-repair') |
                                (X['Occupation'] == ' Machine-op-inspct') |
                                (X['Occupation'] == ' Priv-house-serv') |
                                (X['Occupation'] == ' Transport-moving')), 
                                'Other', X['Occupation'])
    X['Hours per week'] = np.where(X['Hours per week'] == 40, 'fulltime', (np.where(X['Hours per week'] < 40, 'part-time', 'overtime')))
    
balance_predictors(X_cluster2)
X_cluster2.sample(3)

In [None]:
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country', 'Hours per week']

In [None]:
get_LR_performance(X_cluster2, y_train, numerical_features_list, categorical_features_list)

# Now let's try to find the threshold for optimal recall and precisoin values as the dataset is imbalanced and predicts minority class much worse so far

In [None]:
X_10 = X_train.copy()
y_10 = y_train.copy()
X_10.sample()

In [None]:
# preform data transformation as we used for initial model

categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']
numerical_features_list = ['Age', 'final weight', 'Capital Gain', 'Capital Loss', 'Hours per week']

cluster_categorical(X_10)

column_transformer = ColumnTransformer(transformers = [
    ('ordinal', OrdinalEncoder(categories=[[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                        ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                        ' Bachelors',' Masters',' Prof-school',' Doctorate']]),
        make_column_selector(pattern = 'Education')),
    ('minmax_scaler', MinMaxScaler(), numerical_features_list),
    ('onehot', OneHotEncoder(dtype='int', drop='first'), categorical_features_list)],
    remainder='drop')

X_10 = column_transformer.fit_transform(X_10)

if sps.issparse(X_10):
    X_10 = X_10.toarray()
    
x_columns_names = column_transformer.get_feature_names_out()
X_10 = pd.DataFrame(X_10, columns = x_columns_names)

In [None]:
X_10['ordinal__Education'] = MinMaxScaler().fit_transform(X_10[['ordinal__Education']])
X_10.sample()

In [None]:
y_10 = y_10.replace({' >50K': 1, ' <=50K': 0})
y_10.sample(5)

In [None]:
# Apply Stochastic Gradient Descent to find global optimum  of the cost function

sgd_clf = SGDClassifier(loss = 'modified_huber')
sgd_clf.fit(X_10, y_10)

In [None]:
print(sgd_clf.predict([X_10.iloc[2]]), y_10.iloc[2])

In [None]:
cross_val_score(sgd_clf, X_10, y_10, cv=5, scoring="f1_macro")

In [None]:
y_10_pred_sgd = cross_val_predict(sgd_clf, X_10, y_10, cv=3)
confusion_matrix(y_10, y_10_pred_sgd)

In [None]:
print(classification_report(y_10, y_10_pred_sgd))

In [None]:
# transform test data to check

cluster_categorical(X_test)

column_transformer1 = ColumnTransformer(transformers = [
    ('ordinal', OrdinalEncoder(categories=[[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                        ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                        ' Bachelors',' Masters',' Prof-school',' Doctorate']]),
        make_column_selector(pattern = 'Education')),
    ('minmax_scaler', MinMaxScaler(), numerical_features_list),
    ('onehot', OneHotEncoder(dtype='int', drop='first'), categorical_features_list)],
    remainder='drop')

X_test = column_transformer1.fit_transform(X_test)

if sps.issparse(X_test):
    X_test = X_test.toarray()
    
x_columns_names = column_transformer1.get_feature_names_out()
X_test = pd.DataFrame(X_test, columns = x_columns_names)

y_test = y_test.replace({' >50K': 1, ' <=50K': 0})
y_test.sample(5)

In [None]:
y_test_pred_sgd = cross_val_predict(sgd_clf, X_test, y_test, cv=3)
print(classification_report(y_test, y_test_pred_sgd))

More than 3k of false negatives (minority class in this case), which is pretty bad

In [None]:
y_10_scores_sgd = sgd_clf.decision_function([X_10.iloc[90]])
y_10_scores_sgd

In [None]:
y_10_scores_sgd = cross_val_predict(sgd_clf, X_10, y_10, cv=3, method="decision_function")
precisions_sgd, recalls_sgd, thresholds_sgd = precision_recall_curve(y_10, y_10_scores_sgd)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds): 
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision") 
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall") 
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])


plot_precision_recall_vs_threshold(precisions_sgd, recalls_sgd, thresholds_sgd)
plt.show()

In [None]:
def plot_roc_curve(fpr, tpr, label=None): 
    plt.plot(fpr, tpr, linewidth=2, label=label) 
    plt.plot([0, 1], [0, 1], 'k--') 
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

fpr_sgd, tpr_sgd, thresholds_sgd = roc_curve(y_10, y_10_scores_sgd)
plot_roc_curve(fpr_sgd, tpr_sgd)
plt.show()

## Let's now compare it to Logistic Regression 

In [None]:
lg_clf = LogisticRegression(max_iter=500, C = 100)
lg_clf.fit(X_10, y_10)

In [None]:
print(lg_clf.predict([X_10.iloc[2]]), y_10.iloc[2])

In [None]:
cross_val_score(lg_clf, X_10, y_10, cv=3, scoring="f1_macro")

In [None]:
y_10_pred_lg = cross_val_predict(lg_clf, X_10, y_10, cv=3)
confusion_matrix(y_10, y_10_pred_lg)

In [None]:
print(classification_report(y_10, y_10_pred_lg))

In [None]:
y_10_scores_lg = lg_clf.decision_function([X_10.iloc[90]])
y_10_scores_lg

In [None]:
y_10_scores_lg = cross_val_predict(lg_clf, X_10, y_10, cv=3, method="decision_function")
precisions_lg, recalls_lg, thresholds_lg = precision_recall_curve(y_10, y_10_scores_lg)

In [None]:
plot_precision_recall_vs_threshold(precisions_lg, recalls_lg, thresholds_lg)
plt.show()

In [None]:
fpr_lg, tpr_lg, thresholds_lg = roc_curve(y_10, y_10_scores_lg)

In [None]:
plot_roc_curve(fpr_lg, tpr_lg)
plt.show()

# KNN classifier

In [None]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_10, y_10)

In [None]:
y_10_pred_knn = cross_val_predict(knn_clf, X_10, y_10, cv=3)


In [None]:
confusion_matrix(y_10, y_10_pred_knn)

# Support Vector Machine classifier with polynomial kernel

In [None]:
poly_kernel_svm_clf = SVC(C = 10, gamma = 0.1, kernel = 'rbf')

In [None]:
poly_kernel_svm_clf.fit(X_10, y_10)

In [None]:
y_10_pred_svm = cross_val_predict(poly_kernel_svm_clf, X_10, y_10, cv=3)

In [None]:
confusion_matrix(y_10, y_10_pred_svm)

In [None]:
print(classification_report(y_10, y_10_pred_svm))

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'kernel': ['rbf'], 'gamma': [1, 0.1, 0.01], 'C': [1, 10, 100]},
    {'kernel': ['poly'], 'degree': [7, 14, 28], 'coef0': [0.1, 1,10], 'C': [1, 10, 100]},
]
svm = SVC()
grid_search = GridSearchCV(svm, param_grid, cv=5, refit = True)
grid_search.fit(X_10, y_10)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_

In [None]:
pca = PCA(n_components=37)
X2D = pca.fit_transform(X_10)

In [None]:
y_10_pred_pca = cross_val_predict(LogisticRegression(), X2D, y_10, cv=3)

In [None]:
print(classification_report(y_10, y_10_pred_pca))

In [None]:
print(pca.explained_variance_ratio_)

In [None]:
X_10

In [None]:
from sklearn.model_selection import GridSearchCV 
from sklearn.linear_model import LogisticRegression 
from sklearn.pipeline import Pipeline

from sklearn.decomposition import KernelPCA

clf = Pipeline([
        ("kpca", KernelPCA(n_components=2)),
        ("log_reg", LogisticRegression())
])
param_grid = [{
        "kpca__gamma": np.linspace(0.03, 0.05, 10),
        "kpca__kernel": ["rbf", "sigmoid"]
        }]
grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X_10, y_10)

print(grid_search.best_params_)