In [1]:
import pandas as pd
import numpy as np
from common_functions import cluster_categorical, preprocess_data
from statsmodels.api import Logit
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

Here the results from the 'Adult EDA' file are going to be used.

In [2]:
# Column names for the header-less UCI "adult.data" file, listed in file order.
adult_columns = [
    "Age", "Workclass", "final weight", "Education", "Education-Num",
    "Marital Status", "Occupation", "Relationship", "Ethnic group", "Sex",
    "Capital Gain", "Capital Loss", "Hours per week", "Country", "Income",
]

# Values are matched with their leading space, so the missing-value marker
# that gets turned into NaN is ' ?' (not '?').
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    names=adult_columns,
).replace(' ?', np.nan)

In [3]:
df = df.drop(['Education-Num'], axis = 'columns')

In [4]:
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [5]:
for col in categorical_features_list:
    print(f'{col}\n{df[col].value_counts()}\n')

Workclass
Workclass
 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: count, dtype: int64

Marital Status
Marital Status
 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: count, dtype: int64

Occupation
Occupation
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: count, dtype: int64

Relationship
Relationship
 Hu

As the whole dataset (X) contains only one record with the 'Holand-Netherlands' value in the 'Country' column, we have to process it separately: if it appears only in the test set, the model will not be able to predict the target for such a record. For the initial model, where the data is otherwise left unchanged, this observation is removed.

In [6]:
print(df.shape)
# Keep every row except the single ' Holand-Netherlands' record; rows with a
# missing Country are kept (NaN != label evaluates to True).
df_no_nl = df.loc[df['Country'] != ' Holand-Netherlands'].copy()
print(df_no_nl.shape)

(32561, 14)
(32560, 14)


In [7]:
# Fix the seed: without random_state the split changes on every kernel restart,
# so none of the scores or summaries below could be reproduced.
data_train, data_test = train_test_split(df_no_nl, test_size=0.2, random_state=42)
print(data_train.shape)
data_train.head()

(26048, 14)


Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
451,45,Self-emp-inc,197332,Some-college,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,55,United-States,>50K
29927,42,Private,223548,7th-8th,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,30,Mexico,<=50K
23423,30,Private,19302,Some-college,Married-civ-spouse,Prof-specialty,Husband,White,Male,7688,0,40,United-States,>50K
28837,56,Private,175127,10th,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,United-States,<=50K
12128,45,Self-emp-inc,281911,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,48,United-States,<=50K


In this dataset there is only one feature where the order matters - Education - so it will be transformed using OrdinalEncoder. For the rest of the categorical features the order does not matter, hence we can apply OneHotEncoder() to them.

# 1st model 
### Initial model without changes in data

In [8]:
numerical_features_list = ['Age', 'final weight', 'Capital Gain', 'Capital Loss', 'Hours per week']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [9]:
data_no_transform = data_train.copy()
preprocesseded_data = preprocess_data(data_no_transform, numerical_features_list, categorical_features_list)
preprocesseded_data

Unnamed: 0,Income,ordinal__Education,stand scaler__Age,stand scaler__final weight,stand scaler__Capital Gain,stand scaler__Capital Loss,stand scaler__Hours per week,onehot__Workclass_ Local-gov,onehot__Workclass_ Never-worked,onehot__Workclass_ Private,...,onehot__Country_ Puerto-Rico,onehot__Country_ Scotland,onehot__Country_ South,onehot__Country_ Taiwan,onehot__Country_ Thailand,onehot__Country_ Trinadad&Tobago,onehot__Country_ United-States,onehot__Country_ Vietnam,onehot__Country_ Yugoslavia,onehot__Country_nan
0,1,9.0,0.468761,0.075746,-0.145319,-0.216219,1.175266,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0,3.0,0.248863,0.324574,-0.145319,-0.216219,-0.843694,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,9.0,-0.630730,-1.614023,0.882204,-0.216219,-0.036110,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0,5.0,1.275054,-0.135013,-0.145319,-0.216219,-0.036110,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0,8.0,0.468761,0.878526,-0.145319,-0.216219,0.609957,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,1,11.0,0.908557,0.415702,-0.145319,-0.216219,0.367682,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
26044,0,12.0,0.615360,-0.525663,0.240269,-0.216219,1.982850,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26045,0,9.0,-0.850628,-0.761090,-0.145319,-0.216219,1.579058,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
26046,0,9.0,0.102264,-0.940887,-0.145319,-0.216219,-0.036110,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [10]:
# Fits LogisticRegression on an already preprocessed frame and prints its scores
# (no data preparation happens inside the function itself).


def get_LR_performance(data: pd.DataFrame, target: str = 'Income'):
    """Fit a LogisticRegression on preprocessed data and print its performance.

    Two results are printed:
      * mean and standard deviation of the 5-fold cross-validated macro F1 score;
      * a classification report computed on the same rows the model was fitted
        on, i.e. an in-sample report.

    Args:
        data (pd.DataFrame): preprocessed dataset with the feature columns and the target column
        target (str): name of the target column in ``data``

    Returns:
        None: the results are only printed
    """
    features = data.drop(columns=[target])
    labels = data[target]

    classifier = LogisticRegression(max_iter = 10000)
    # fit() returns the fitted estimator, so the in-sample predictions can be chained.
    in_sample_predictions = classifier.fit(features, labels).predict(features)

    fold_scores = cross_val_score(classifier, features, labels, cv=5, scoring='f1_macro')
    f1_mean_score = round(np.mean(fold_scores), 2)
    f1_std = round(np.std(fold_scores), 2)

    report_text = classification_report(labels, in_sample_predictions)

    print(f'f1 score: mean = {f1_mean_score} | std = {f1_std}')
    print(report_text)


In [11]:
get_LR_performance(preprocesseded_data)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

           0       0.88      0.93      0.91     19806
           1       0.74      0.60      0.66      6242

    accuracy                           0.85     26048
   macro avg       0.81      0.77      0.78     26048
weighted avg       0.85      0.85      0.85     26048



### Let's now understand the significance of features with the help of the Logit() function from statsmodels

In [12]:
def logit_summary(data: pd.DataFrame, target: str = 'Income'):
    """Fit a statsmodels Logit on already preprocessed data and return its summary.

    Args:
        data (pd.DataFrame): preprocessed features together with the target column
        target (str): name of the target column in ``data``

    Returns:
        Summary: summary of the fitted Logit model; its coefficient table is used
                to decide whether a feature is kept, modified or removed
    """
    predictors = data.drop(columns = [target])
    response = data[target]

    # The model is fitted on the columns exactly as passed in (nothing is added
    # here), using fit_regularized() with its default arguments.
    fitted = Logit(response, predictors).fit_regularized()
    return fitted.summary()

In [13]:
summary = logit_summary(preprocesseded_data)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3187854254630295
            Iterations: 610
            Function evaluations: 612
            Gradient evaluations: 610


0,1,2,3
Dep. Variable:,Income,No. Observations:,26048.0
Model:,Logit,Df Residuals:,25964.0
Method:,MLE,Df Model:,83.0
Date:,"Mon, 01 May 2023",Pseudo R-squ.:,0.4211
Time:,18:37:34,Log-Likelihood:,-8303.7
converged:,True,LL-Null:,-14344.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2758,0.010,26.450,0.000,0.255,0.296
stand scaler__Age,0.3575,0.024,14.785,0.000,0.310,0.405
stand scaler__final weight,0.0695,0.020,3.449,0.001,0.030,0.109
stand scaler__Capital Gain,2.3448,0.085,27.448,0.000,2.177,2.512
stand scaler__Capital Loss,0.2637,0.017,15.747,0.000,0.231,0.297
stand scaler__Hours per week,0.3588,0.022,16.369,0.000,0.316,0.402
onehot__Workclass_ Local-gov,-0.7471,0.124,-6.032,0.000,-0.990,-0.504
onehot__Workclass_ Never-worked,-1.6719,2.62e+14,-6.37e-15,1.000,-5.14e+14,5.14e+14
onehot__Workclass_ Private,-0.5410,0.100,-5.414,0.000,-0.737,-0.345


# 2nd model
### Same model, but without 'final weight'

As we remember from the EDA, the **'final weight'** feature did not pass the significance threshold. Let's try to remove it and check the performance.

In [14]:
preprocesseded_data.columns[:5]

Index(['Income', 'ordinal__Education', 'stand scaler__Age',
       'stand scaler__final weight', 'stand scaler__Capital Gain'],
      dtype='object')

In [15]:
# The dataset is the same as before, so instead of re-running the preprocessing
# we drop the scaled 'final weight' column from the already preprocessed frame
# (drop() returns a new frame, the original stays untouched).

preprocesseded_data_no_fw = preprocesseded_data.drop(columns=['stand scaler__final weight'])
get_LR_performance(preprocesseded_data_no_fw)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

           0       0.88      0.93      0.91     19806
           1       0.74      0.60      0.66      6242

    accuracy                           0.85     26048
   macro avg       0.81      0.77      0.78     26048
weighted avg       0.85      0.85      0.85     26048



#### Performance in general has not changed, let's check if features' significances have changed

In [16]:
summary = logit_summary(preprocesseded_data_no_fw)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.31901244002227713
            Iterations: 609
            Function evaluations: 611
            Gradient evaluations: 609


0,1,2,3
Dep. Variable:,Income,No. Observations:,26048.0
Model:,Logit,Df Residuals:,25965.0
Method:,MLE,Df Model:,82.0
Date:,"Mon, 01 May 2023",Pseudo R-squ.:,0.4207
Time:,18:38:27,Log-Likelihood:,-8309.6
converged:,True,LL-Null:,-14344.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2756,0.010,26.912,0.000,0.256,0.296
stand scaler__Age,0.3525,0.024,14.415,0.000,0.305,0.400
stand scaler__Capital Gain,2.3451,0.085,27.490,0.000,2.178,2.512
stand scaler__Capital Loss,0.2631,0.017,15.732,0.000,0.230,0.296
stand scaler__Hours per week,0.3570,0.022,16.279,0.000,0.314,0.400
onehot__Workclass_ Local-gov,-0.7434,0.123,-6.029,0.000,-0.985,-0.502
onehot__Workclass_ Never-worked,-1.7573,,,,,
onehot__Workclass_ Private,-0.5373,0.102,-5.263,0.000,-0.737,-0.337
onehot__Workclass_ Self-emp-inc,-0.3890,0.135,-2.881,0.004,-0.654,-0.124


According to the Logit() results, all of the numerical features are statistically significant. Some categories in a couple of categorical features have to be clustered, as they are insignificant.

Assumption 1. Workclasses representatives, that do not work or work without pay will have less than 50k, so can become one cluster.

Assumption 2. Single people tend to earn more, as they have more free time for career development; so the values of the Marital Status feature can be clustered into Single and Married.

Assumption 3. Occupation has no impact on Income, as all of its categories are insignificant, so it could be removed from the model. For now it is left as is, because the EDA showed that this feature is significant.

Assumption 4. All categories of Relationship and Sex features are significant.

Assumption 5. Most of the Ethnic Groups have no impact on the target; it's possible to cluster them according to their imbalance: White and others.

# 3rd model
### Clustering categories of features


In [17]:
data_clustered = data_train.copy()

data_clustered = cluster_categorical(data_clustered)

In [18]:
data_clustered.head()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
451,45,Self-emp-inc,197332,Some-college,Married,Craft-repair,Family,White,Male,0,0,55,Developed,>50K
29927,42,Private,223548,7th-8th,Married,Handlers-cleaners,Family,White,Male,0,0,30,Developing,<=50K
23423,30,Private,19302,Some-college,Married,Prof-specialty,Family,White,Male,7688,0,40,Developed,>50K
28837,56,Private,175127,10th,Married,Handlers-cleaners,Family,White,Male,0,0,40,Developed,<=50K
12128,45,Self-emp-inc,281911,HS-grad,Married,Craft-repair,Family,White,Male,0,0,48,Developed,<=50K


Let's now apply the pipeline to updated dataset

In [19]:
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss', 'Hours per week']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [20]:
data_clustered_and_preprocessed = preprocess_data(data_clustered, numerical_features_list, categorical_features_list)

get_LR_performance(data_clustered_and_preprocessed)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

           0       0.88      0.93      0.91     19806
           1       0.73      0.59      0.66      6242

    accuracy                           0.85     26048
   macro avg       0.81      0.76      0.78     26048
weighted avg       0.84      0.85      0.85     26048



In [21]:
summary = logit_summary(data_clustered_and_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.32691242220708294
            Iterations: 262
            Function evaluations: 264
            Gradient evaluations: 262


0,1,2,3
Dep. Variable:,Income,No. Observations:,26048.0
Model:,Logit,Df Residuals:,26013.0
Method:,MLE,Df Model:,34.0
Date:,"Mon, 01 May 2023",Pseudo R-squ.:,0.4063
Time:,18:38:57,Log-Likelihood:,-8515.4
converged:,True,LL-Null:,-14344.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2362,0.009,25.551,0.000,0.218,0.254
stand scaler__Age,0.3649,0.023,15.939,0.000,0.320,0.410
stand scaler__Capital Gain,2.3779,0.085,27.878,0.000,2.211,2.545
stand scaler__Capital Loss,0.2684,0.017,16.208,0.000,0.236,0.301
stand scaler__Hours per week,0.3617,0.022,16.738,0.000,0.319,0.404
onehot__Workclass_ Local-gov,-1.1192,0.115,-9.739,0.000,-1.344,-0.894
onehot__Workclass_ Private,-0.9125,0.093,-9.845,0.000,-1.094,-0.731
onehot__Workclass_ Self-emp-inc,-0.7213,0.129,-5.587,0.000,-0.974,-0.468
onehot__Workclass_ Self-emp-not-inc,-1.3485,0.113,-11.985,0.000,-1.569,-1.128


1. Workclass 'Without pay' is still insignificant; we will try to remove these instances (there is only a small number of them)
2. Some Occupations are insignificant
3. Relationships became significant
4. Sex became insignificant
5. All NaNs are insignificant

Let's check the unique values of Occupation:

In [22]:
data_clustered['Occupation'].value_counts()

Occupation
 Prof-specialty       3350
 Craft-repair         3289
 Exec-managerial      3212
 Adm-clerical         3016
 Sales                2891
 Other-service        2646
 Machine-op-inspct    1590
 Transport-moving     1290
 Handlers-cleaners    1105
 Farming-fishing       795
 Tech-support          708
 Protective-serv       540
 Priv-house-serv       121
 Armed-Forces            8
Name: count, dtype: int64

Armed Forces is the smallest group, and it is insignificant in all configurations of the model. Let's try to remove it later.

# 4th model

Let's try to remove missing data

In [23]:
data_no_nan = data_train.copy()
data_no_nan = data_no_nan.dropna(how='any')

In [24]:
data_no_nan.shape

(24112, 14)

In [26]:
data_no_nan_clustered = cluster_categorical(data_no_nan)
data_no_nan_clustered.head()


Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
451,45,Self-emp-inc,197332,Some-college,Married,Craft-repair,Family,White,Male,0,0,55,Developed,>50K
29927,42,Private,223548,7th-8th,Married,Handlers-cleaners,Family,White,Male,0,0,30,Developing,<=50K
23423,30,Private,19302,Some-college,Married,Prof-specialty,Family,White,Male,7688,0,40,Developed,>50K
28837,56,Private,175127,10th,Married,Handlers-cleaners,Family,White,Male,0,0,40,Developed,<=50K
12128,45,Self-emp-inc,281911,HS-grad,Married,Craft-repair,Family,White,Male,0,0,48,Developed,<=50K


In [27]:
numerical_features_list = ['Age', 'final weight', 'Capital Gain', 'Capital Loss', 'Hours per week']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [28]:
data_no_nan_clustered_and_preprocessed = preprocess_data(data_no_nan_clustered, numerical_features_list, categorical_features_list)
get_LR_performance(data_no_nan_clustered_and_preprocessed)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     18150
           1       0.74      0.60      0.66      5962

    accuracy                           0.85     24112
   macro avg       0.81      0.76      0.78     24112
weighted avg       0.84      0.85      0.84     24112



In [29]:
summary = logit_summary(data_no_nan_clustered_and_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3332187380378785
            Iterations: 248
            Function evaluations: 250
            Gradient evaluations: 248


0,1,2,3
Dep. Variable:,Income,No. Observations:,24112.0
Model:,Logit,Df Residuals:,24079.0
Method:,MLE,Df Model:,32.0
Date:,"Mon, 01 May 2023",Pseudo R-squ.:,0.4042
Time:,18:39:51,Log-Likelihood:,-8034.6
converged:,True,LL-Null:,-13486.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2379,0.010,24.517,0.000,0.219,0.257
stand scaler__Age,0.3635,0.023,15.785,0.000,0.318,0.409
stand scaler__final weight,0.0891,0.020,4.389,0.000,0.049,0.129
stand scaler__Capital Gain,2.4195,0.089,27.041,0.000,2.244,2.595
stand scaler__Capital Loss,0.2651,0.017,15.423,0.000,0.231,0.299
stand scaler__Hours per week,0.3553,0.022,16.060,0.000,0.312,0.399
onehot__Workclass_ Local-gov,-1.1198,0.117,-9.608,0.000,-1.348,-0.891
onehot__Workclass_ Private,-0.9014,0.094,-9.558,0.000,-1.086,-0.717
onehot__Workclass_ Self-emp-inc,-0.6936,0.132,-5.269,0.000,-0.952,-0.436


This approach to data preprocessing gave us the best result so far: we saved computation by reducing the dimensionality, while the performance stayed the same. However, the Sex feature became insignificant, so it's still not a good model.
# 5th model
### Let's try to apply the ln() function to the 'Age', 'Capital Gain' and 'Capital Loss' features (as they are heavy-tailed) before the Standard Scaler to normalize them

In [30]:
data_logged = data_train.copy()
data_logged = data_logged.dropna(how='any')

In [31]:
# Capital Gain / Capital Loss contain zeros, so 1 is added before taking the
# logarithm to keep those entries finite (log(1) = 0).
capital_columns = ['Capital Gain', 'Capital Loss']
data_logged[capital_columns] = np.log(1 + data_logged[capital_columns])
# Age gets a plain natural logarithm.
data_logged['Age'] = np.log(data_logged['Age'])

data_logged = cluster_categorical(data_logged)

In [32]:
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss', 'Hours per week']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [33]:
data_logged_and_preprocessed = preprocess_data(data_logged, numerical_features_list, categorical_features_list)
get_LR_performance(data_logged_and_preprocessed)

f1 score: mean = 0.77 | std = 0.01
              precision    recall  f1-score   support

           0       0.87      0.93      0.90     18150
           1       0.72      0.59      0.65      5962

    accuracy                           0.84     24112
   macro avg       0.80      0.76      0.77     24112
weighted avg       0.84      0.84      0.84     24112



In [35]:
summary = logit_summary(data_logged_and_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3437267129717219
            Iterations: 239
            Function evaluations: 241
            Gradient evaluations: 239


0,1,2,3
Dep. Variable:,Income,No. Observations:,24112.0
Model:,Logit,Df Residuals:,24080.0
Method:,MLE,Df Model:,31.0
Date:,"Mon, 01 May 2023",Pseudo R-squ.:,0.3854
Time:,18:43:13,Log-Likelihood:,-8287.9
converged:,True,LL-Null:,-13486.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2356,0.009,24.833,0.000,0.217,0.254
stand scaler__Age,0.4765,0.025,18.793,0.000,0.427,0.526
stand scaler__Capital Gain,0.5190,0.018,28.457,0.000,0.483,0.555
stand scaler__Capital Loss,0.2424,0.017,14.198,0.000,0.209,0.276
stand scaler__Hours per week,0.3491,0.022,16.122,0.000,0.307,0.392
onehot__Workclass_ Local-gov,-1.1816,0.115,-10.315,0.000,-1.406,-0.957
onehot__Workclass_ Private,-0.9037,0.093,-9.769,0.000,-1.085,-0.722
onehot__Workclass_ Self-emp-inc,-0.6883,0.129,-5.352,0.000,-0.940,-0.436
onehot__Workclass_ Self-emp-not-inc,-1.3308,0.112,-11.890,0.000,-1.550,-1.111


Not better either.

# 6th model
### Another try is to cluster 'Hours per week' feature to part-time, fulltime and overtime workers with fulltime value for 40 hours

In [36]:
data_hpw = data_train.copy().dropna(how='any')
# Bin the weekly hours: below 40 -> 'part-time', exactly 40 -> 'fulltime',
# everything else -> 'overtime'.
data_hpw['Hours per week'] = np.where(
    data_hpw['Hours per week'] < 40,
    'part-time',
    np.where(data_hpw['Hours per week'] == 40, 'fulltime', 'overtime'),
)

data_hpw = cluster_categorical(data_hpw)

In [37]:
data_hpw.head()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
451,45,Self-emp-inc,197332,Some-college,Married,Craft-repair,Family,White,Male,0,0,overtime,Developed,>50K
29927,42,Private,223548,7th-8th,Married,Handlers-cleaners,Family,White,Male,0,0,part-time,Developing,<=50K
23423,30,Private,19302,Some-college,Married,Prof-specialty,Family,White,Male,7688,0,fulltime,Developed,>50K
28837,56,Private,175127,10th,Married,Handlers-cleaners,Family,White,Male,0,0,fulltime,Developed,<=50K
12128,45,Self-emp-inc,281911,HS-grad,Married,Craft-repair,Family,White,Male,0,0,overtime,Developed,<=50K


In [38]:
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss']
categorical_features_list = ['Hours per week', 'Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [39]:
data_hpw_preprocessed = preprocess_data(data_hpw, numerical_features_list, categorical_features_list)
get_LR_performance(data_hpw_preprocessed)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     18150
           1       0.74      0.60      0.66      5962

    accuracy                           0.85     24112
   macro avg       0.81      0.76      0.78     24112
weighted avg       0.84      0.85      0.84     24112



In [40]:
summary = logit_summary(data_hpw_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3318917899341232
            Iterations: 254
            Function evaluations: 256
            Gradient evaluations: 254


0,1,2,3
Dep. Variable:,Income,No. Observations:,24112.0
Model:,Logit,Df Residuals:,24079.0
Method:,MLE,Df Model:,32.0
Date:,"Mon, 01 May 2023",Pseudo R-squ.:,0.4066
Time:,18:43:29,Log-Likelihood:,-8002.6
converged:,True,LL-Null:,-13486.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2346,0.010,24.104,0.000,0.216,0.254
stand scaler__Age,0.3653,0.023,15.820,0.000,0.320,0.411
stand scaler__Capital Gain,2.4140,0.090,26.931,0.000,2.238,2.590
stand scaler__Capital Loss,0.2638,0.017,15.344,0.000,0.230,0.297
onehot__Hours per week_overtime,0.4894,0.044,11.000,0.000,0.402,0.577
onehot__Hours per week_part-time,-0.7307,0.068,-10.697,0.000,-0.865,-0.597
onehot__Workclass_ Local-gov,-1.0894,0.117,-9.319,0.000,-1.319,-0.860
onehot__Workclass_ Private,-0.8897,0.095,-9.408,0.000,-1.075,-0.704
onehot__Workclass_ Self-emp-inc,-0.6272,0.132,-4.751,0.000,-0.886,-0.368


Hours per week works great like this: both of its categories are significant. However, it did not affect the model's accuracy, and the Sex feature became even more insignificant compared to the previous model.

# 7th model
### Let's now try to cluster all minority categories of imbalanced features together

In [41]:
data_cluster_imbalanced = data_train.copy()

def balance_predictors(data):
    """Collapse minority categories of the imbalanced predictors, modifying ``data`` in place.

    Applied in this order:
      * Workclass: everything except ' Private' becomes 'Other'
      * Marital Status: widowed / separated / spouse-absent / never-married /
        divorced become 'Single' (the remaining values are left untouched)
      * Occupation: four service-like occupations become 'Services'
      * Relationship: regrouped into 'Family' and 'Not-in-Family'
      * Ethnic group: everything except ' White' becomes 'Other'
      * Hours per week: 40 -> 'fulltime', below 40 -> 'part-time', otherwise 'overtime'
      * Country: everything except ' United-States' becomes 'Other'

    Args:
        data: DataFrame holding the raw columns listed above

    Returns:
        None: ``data`` itself is changed
    """

    def keep_or_other(column, majority):
        # Anything that does not equal the majority label turns into 'Other';
        # this includes missing values, because NaN never compares equal.
        current = data[column]
        data[column] = np.where(current == majority, current, 'Other')

    def merge_into(column, members, label):
        # Only the listed members are relabelled; other values stay as they are.
        data.loc[data[column].isin(members), column] = label

    keep_or_other('Workclass', ' Private')

    merge_into(
        "Marital Status",
        [' Widowed', ' Separated', ' Married-spouse-absent', ' Never-married', ' Divorced'],
        "Single",
    )
    merge_into(
        "Occupation",
        [' Craft-repair', ' Other-service', ' Priv-house-serv', ' Protective-serv'],
        "Services",
    )
    merge_into("Relationship", [' Husband', ' Wife', ' Own-child'], "Family")
    merge_into("Relationship", [' Not-in-family', ' Unmarried', ' Other-relative'], "Not-in-Family")

    keep_or_other('Ethnic group', ' White')

    weekly_hours = data['Hours per week']
    below_or_above = np.where(weekly_hours < 40, 'part-time', 'overtime')
    data['Hours per week'] = np.where(weekly_hours == 40, 'fulltime', below_or_above)

    keep_or_other('Country', ' United-States')
    
balance_predictors(data_cluster_imbalanced)
data_cluster_imbalanced.sample(3)

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
30240,18,Private,189924,HS-grad,Single,Sales,Family,White,Female,0,0,part-time,United-States,<=50K
23388,62,Other,113234,Masters,Married-civ-spouse,,Family,White,Female,0,0,fulltime,United-States,<=50K
16112,58,Private,147653,Some-college,Married-civ-spouse,Adm-clerical,Family,White,Female,0,0,part-time,United-States,<=50K


In [42]:
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex',  'Hours per week', 'Country']

In [43]:
data_cluster_imbalanced_preprocessed = preprocess_data(data_cluster_imbalanced, numerical_features_list, categorical_features_list)
get_LR_performance(data_cluster_imbalanced_preprocessed)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     19806
           1       0.73      0.58      0.65      6242

    accuracy                           0.85     26048
   macro avg       0.80      0.76      0.78     26048
weighted avg       0.84      0.85      0.84     26048



In [44]:
summary = logit_summary(data_cluster_imbalanced_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.32568536303427814
            Iterations: 176
            Function evaluations: 179
            Gradient evaluations: 176


0,1,2,3
Dep. Variable:,Income,No. Observations:,26048.0
Model:,Logit,Df Residuals:,26024.0
Method:,MLE,Df Model:,23.0
Date:,"Mon, 01 May 2023",Pseudo R-squ.:,0.4086
Time:,18:44:19,Log-Likelihood:,-8483.5
converged:,True,LL-Null:,-14344.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2794,0.010,28.142,0.000,0.260,0.299
stand scaler__Age,0.3879,0.023,16.858,0.000,0.343,0.433
stand scaler__Capital Gain,2.3494,0.085,27.785,0.000,2.184,2.515
stand scaler__Capital Loss,0.2664,0.017,16.064,0.000,0.234,0.299
onehot__Workclass_Other,-0.0624,0.044,-1.408,0.159,-0.149,0.024
onehot__Marital Status_ Married-civ-spouse,-2.9363,0.119,-24.761,0.000,-3.169,-2.704
onehot__Marital Status_Single,-5.9892,0.166,-36.078,0.000,-6.315,-5.664
onehot__Occupation_ Armed-Forces,-1.0821,1.410,-0.767,0.443,-3.846,1.682
onehot__Occupation_ Exec-managerial,0.5879,0.082,7.169,0.000,0.427,0.749


**So far, we have checked 7 models, none of them gave any performance improvement, although we reached faster convergence. Let's now try to build the last model, that would contain all best ideas of 7 models above:**

# Final Model

In [45]:
data_final = data_train.copy()
data_final = data_final.dropna(how='any')

In [46]:
data_final['Occupation'].value_counts()

Occupation
 Prof-specialty       3266
 Craft-repair         3234
 Exec-managerial      3155
 Adm-clerical         2978
 Sales                2836
 Other-service        2578
 Machine-op-inspct    1562
 Transport-moving     1269
 Handlers-cleaners    1085
 Farming-fishing       791
 Tech-support          697
 Protective-serv       537
 Priv-house-serv       116
 Armed-Forces            8
Name: count, dtype: int64

In [47]:
data_final['Hours per week'] = np.where(data_final['Hours per week'] == 40, 'fulltime', 
                                   (np.where(data_final['Hours per week'] < 40, 'part-time', 'overtime')))

In [48]:
data_final = cluster_categorical(data_final)

# Remove the poorly represented classes in one pass. The explicit .copy() makes
# the filtered frame independent, so the .loc assignment below does not operate
# on a slice of another frame (which triggers pandas' SettingWithCopyWarning).
rare_workclass = data_final['Workclass'].isin([' Never-worked', ' Without-pay'])
armed_forces = data_final['Occupation'].isin([' Armed-Forces'])
data_final = data_final[~(rare_workclass | armed_forces)].copy()

# Merge the service-like occupations into a single 'Services' category.
data_final.loc[
    data_final["Occupation"].isin([' Craft-repair', ' Other-service', ' Priv-house-serv', ' Protective-serv']),
    "Occupation",
] = "Services"

data_final.head()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
451,45,Self-emp-inc,197332,Some-college,Married,Services,Family,White,Male,0,0,overtime,Developed,>50K
29927,42,Private,223548,7th-8th,Married,Handlers-cleaners,Family,White,Male,0,0,part-time,Developing,<=50K
23423,30,Private,19302,Some-college,Married,Prof-specialty,Family,White,Male,7688,0,fulltime,Developed,>50K
28837,56,Private,175127,10th,Married,Handlers-cleaners,Family,White,Male,0,0,fulltime,Developed,<=50K
12128,45,Self-emp-inc,281911,HS-grad,Married,Services,Family,White,Male,0,0,overtime,Developed,<=50K


In [49]:
numerical_features_list = ['Age', 'final weight', 'Capital Gain', 'Capital Loss']
categorical_features_list = ['Workclass', 'Marital Status',  'Occupation',
                             'Relationship', 'Ethnic group', 'Hours per week', 'Country']

In [50]:
data_final_preprocessed = preprocess_data(data_final, numerical_features_list, categorical_features_list)
get_LR_performance(data_final_preprocessed)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

           0       0.87      0.93      0.90     18132
           1       0.74      0.59      0.66      5961

    accuracy                           0.85     24093
   macro avg       0.81      0.76      0.78     24093
weighted avg       0.84      0.85      0.84     24093



In [51]:
summary = logit_summary(data_final_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.33344754684095257
            Iterations: 168
            Function evaluations: 170
            Gradient evaluations: 168


0,1,2,3
Dep. Variable:,Income,No. Observations:,24093.0
Model:,Logit,Df Residuals:,24065.0
Method:,MLE,Df Model:,27.0
Date:,"Mon, 01 May 2023",Pseudo R-squ.:,0.404
Time:,18:44:29,Log-Likelihood:,-8033.8
converged:,True,LL-Null:,-13480.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2380,0.010,24.660,0.000,0.219,0.257
stand scaler__Age,0.3663,0.023,15.930,0.000,0.321,0.411
stand scaler__final weight,0.0893,0.020,4.406,0.000,0.050,0.129
stand scaler__Capital Gain,2.4163,0.090,26.913,0.000,2.240,2.592
stand scaler__Capital Loss,0.2644,0.017,15.385,0.000,0.231,0.298
onehot__Workclass_ Local-gov,-0.9771,0.115,-8.470,0.000,-1.203,-0.751
onehot__Workclass_ Private,-0.9227,0.095,-9.747,0.000,-1.108,-0.737
onehot__Workclass_ Self-emp-inc,-0.6481,0.132,-4.906,0.000,-0.907,-0.389
onehot__Workclass_ Self-emp-not-inc,-1.2821,0.115,-11.104,0.000,-1.508,-1.056


#### Even though the model performance has not improved compared to the initial model, we were able to cut the number of iterations until convergence by more than half (from 610 to 168). We have also built a model where all features are statistically significant

### List of changes to initial data:
##### 1. Remove all missing data
##### 2. Remove classes that are poorly represented, including Workclass: 'Never worked' and 'Without pay'; Occupation: 'Armed Forces'
##### 3. Clustered most of the categorical features to reduce dimensionality (from **83** in the initial model to **28** in the final model)