In [1]:
import pandas as pd
import numpy as np
from common_functions import cluster_categorical, preprocess_data
from statsmodels.api import Logit
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

Here the results from the 'Adult EDA' notebook are going to be used.

In [2]:
# Column names for the header-less adult.data file, in file order.
adult_columns = [
    "Age",
    "Workclass",
    "final weight",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Ethnic group",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "Income",
]

# Read the raw file and, in the same chain, turn the ' ?' placeholder
# (note the leading space) into NaN.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    names=adult_columns,
).replace(to_replace=" ?", value=np.nan)

In [3]:
# 'Education-Num' is removed; the 'Education' column itself is kept.
df = df.drop(columns=['Education-Num'])

In [4]:
# Categorical columns whose value counts are printed in the next cell.
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [5]:
for col in categorical_features_list:
    print(f'{col}\n{df[col].value_counts()}\n')

Workclass
Workclass
 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: count, dtype: int64

Marital Status
Marital Status
 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: count, dtype: int64

Occupation
Occupation
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: count, dtype: int64

Relationship
Relationship
 Hu

Since the whole dataset (X) contains only one record with the 'Holand-Netherlands' value in the 'Country' column, it has to be handled separately: if that record ended up in the test set, the model would not be able to predict the target for it. For the initial model, where the data are otherwise left unchanged, this observation is removed.

In [6]:
# Remove the ' Holand-Netherlands' record(s) with a single boolean filter.
# The original dropped rows in place on `df_no_nl` using a mask computed on
# `df`; filtering `df` directly keeps mask and frame in sync and avoids
# `inplace=True`. Rows with a missing Country compare as "not equal" and are
# therefore kept, exactly as before.
print(df.shape)
df_no_nl = df[df['Country'] != ' Holand-Netherlands'].copy()
print(df_no_nl.shape)

(32561, 14)
(32560, 14)


In [7]:
# Fixed seed so that Restart & Run All reproduces the same split and hence the
# same metrics in every model below. Outputs saved from earlier, unseeded runs
# may show different rows and slightly different numbers.
RANDOM_STATE = 42

data_train, data_test = train_test_split(
    df_no_nl, test_size=0.2, random_state=RANDOM_STATE
)
print(data_train.shape)
data_train.head()

(26048, 14)


Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
6491,46,State-gov,353012,HS-grad,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K
22605,42,State-gov,136996,Some-college,Married-civ-spouse,Protective-serv,Husband,Black,Male,0,0,48,United-States,<=50K
297,39,,157443,Masters,Married-civ-spouse,,Wife,Asian-Pac-Islander,Female,3464,0,40,,<=50K
16728,44,Self-emp-not-inc,90021,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,30,United-States,<=50K
29334,27,Private,89598,Some-college,Never-married,Adm-clerical,Unmarried,White,Female,0,0,60,United-States,<=50K


In this dataset there is only one feature where the order matters - Education - so it will be transformed using OrdinalEncoder. For the rest of the categorical features the order does not matter, hence we can apply OneHotEncoder() to them.

# 1st model 
### Initial model without changes in data

In [8]:
# Feature groups handed to preprocess_data for the baseline model.
numerical_features_list = [
    'Age',
    'final weight',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [9]:
data_no_transform = data_train.copy()
preprocesseded_data = preprocess_data(data_no_transform, numerical_features_list, categorical_features_list)
preprocesseded_data

Unnamed: 0,Income_ >50K,ordinal__Education,stand scaler__Age,stand scaler__final weight,stand scaler__Capital Gain,stand scaler__Capital Loss,stand scaler__Hours per week,onehot__Workclass_ Local-gov,onehot__Workclass_ Never-worked,onehot__Workclass_ Private,...,onehot__Country_ Puerto-Rico,onehot__Country_ Scotland,onehot__Country_ South,onehot__Country_ Taiwan,onehot__Country_ Thailand,onehot__Country_ Trinadad&Tobago,onehot__Country_ United-States,onehot__Country_ Vietnam,onehot__Country_ Yugoslavia,onehot__Country_nan
0,0,8.0,0.547112,1.545789,-0.144769,-0.216052,-0.035457,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0,9.0,0.254053,-0.498595,-0.144769,-0.216052,0.611995,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0,13.0,0.034258,-0.305084,0.332084,-0.216052,-0.035457,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,8.0,0.400582,-0.943168,-0.144769,-0.216052,-0.844772,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0,9.0,-0.844919,-0.947171,-0.144769,-0.216052,1.583173,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,0,8.0,0.693642,-0.422457,-0.144769,-0.216052,-0.035457,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26044,0,8.0,2.085673,-0.823240,0.004730,-0.216052,-0.440115,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
26045,0,8.0,-1.064713,-0.667433,-0.144769,-0.216052,-0.035457,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
26046,0,9.0,-0.771654,0.299423,-0.144769,-0.216052,-0.035457,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Scores a LogisticRegression on an already preprocessed frame and prints the results.


def get_LR_performance(data, cv=5, max_iter=10000):
    """Fit a LogisticRegression on preprocessed data and print its performance.

    No preprocessing happens here: ``data`` is used as given, with the first
    column taken as the target and all remaining columns as features.

    Two results are printed:
      * mean and standard deviation of the cross-validated macro F1 score,
        each rounded to two decimals;
      * a classification report for predictions made on the very same rows
        the model was fitted on (an in-sample report, so it is an optimistic
        view compared with the cross-validated score).

    Args:
        data (pd.DataFrame): preprocessed dataset, target in column 0 and
            features in columns 1 onwards.
        cv (int): ``cv`` argument forwarded to ``cross_val_score``.
            Defaults to 5, the value that used to be hard-coded.
        max_iter (int): ``max_iter`` argument of ``LogisticRegression``.
            Defaults to 10000, the value that used to be hard-coded.

    Returns:
        None. The results are only printed.
    """
    X = data.iloc[:, 1:]
    y = data.iloc[:, 0]

    model = LogisticRegression(max_iter=max_iter)

    # Cross-validated estimate of generalisation performance.
    scores = cross_val_score(model, X, y, cv=cv, scoring='f1_macro')
    f1_mean_score = round(np.mean(scores), 2)
    f1_std = round(np.std(scores), 2)

    # In-sample report: fit on all rows, then predict those same rows.
    model.fit(X, y)
    y_pred = model.predict(X)
    report = classification_report(y, y_pred)

    print(f'f1 score: mean = {f1_mean_score} | std = {f1_std}')
    print(report)


In [11]:
get_LR_performance(preprocesseded_data)

f1 score: mean = 0.78 | std = 0.0
              precision    recall  f1-score   support

           0       0.88      0.93      0.91     19871
           1       0.74      0.60      0.67      6177

    accuracy                           0.86     26048
   macro avg       0.81      0.77      0.79     26048
weighted avg       0.85      0.86      0.85     26048



### Let's now understand the significance of the features with the help of the Logit() function from statsmodels

In [12]:
def logit_summary(data):
    """Fit a statsmodels Logit on already preprocessed data and return its summary.

    The first column of ``data`` is used as the target and every remaining
    column as a feature.

    Args:
        data (pd.DataFrame): DataFrame holding the target (column 0) and the
            features (columns 1 onwards).

    Returns:
        Summary: summary of the fitted Logit model; its per-feature
            significance values support the decision to keep, modify or
            remove a feature.
    """
    # NOTE(review): the feature matrix is passed as-is, no constant column is
    # added here - confirm whether an intercept is expected.
    target, features = data.iloc[:, 0], data.iloc[:, 1:]
    return Logit(target, features).fit_regularized().summary()

In [13]:
summary = logit_summary(preprocesseded_data)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3131188837432131
            Iterations: 631
            Function evaluations: 633
            Gradient evaluations: 631


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,26048.0
Model:,Logit,Df Residuals:,25964.0
Method:,MLE,Df Model:,83.0
Date:,"Sat, 29 Apr 2023",Pseudo R-squ.:,0.4284
Time:,14:08:28,Log-Likelihood:,-8156.1
converged:,True,LL-Null:,-14268.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2777,0.010,26.730,0.000,0.257,0.298
stand scaler__Age,0.3320,0.025,13.286,0.000,0.283,0.381
stand scaler__final weight,0.0747,0.021,3.636,0.000,0.034,0.115
stand scaler__Capital Gain,2.2976,0.084,27.330,0.000,2.133,2.462
stand scaler__Capital Loss,0.2649,0.017,15.864,0.000,0.232,0.298
stand scaler__Hours per week,0.3790,0.022,16.917,0.000,0.335,0.423
onehot__Workclass_ Local-gov,-0.7600,0.125,-6.066,0.000,-1.006,-0.514
onehot__Workclass_ Never-worked,-0.2510,5e+06,-5.02e-08,1.000,-9.8e+06,9.8e+06
onehot__Workclass_ Private,-0.5823,0.104,-5.583,0.000,-0.787,-0.378


# 2nd model
### Same model, but without 'final weight'

As we remember from the EDA, the **'final weight'** feature did not pass the significance threshold. Let's try to remove it and check the performance.

In [14]:
preprocesseded_data.columns[:5]

Index(['Income_ >50K', 'ordinal__Education', 'stand scaler__Age',
       'stand scaler__final weight', 'stand scaler__Capital Gain'],
      dtype='object')

In [15]:
# The dataset is the same as before, so instead of running the preprocessing
# again the scaled 'final weight' column is dropped from the preprocessed frame.
preprocesseded_data_no_fw = preprocesseded_data.drop(columns=['stand scaler__final weight'])
get_LR_performance(preprocesseded_data_no_fw)

f1 score: mean = 0.78 | std = 0.0
              precision    recall  f1-score   support

           0       0.88      0.93      0.91     19871
           1       0.74      0.60      0.66      6177

    accuracy                           0.85     26048
   macro avg       0.81      0.77      0.78     26048
weighted avg       0.85      0.85      0.85     26048



#### Performance in general has not changed, let's check if features' significances have changed

In [16]:
summary = logit_summary(preprocesseded_data_no_fw)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3133705822203545
            Iterations: 609
            Function evaluations: 611
            Gradient evaluations: 609


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,26048.0
Model:,Logit,Df Residuals:,25965.0
Method:,MLE,Df Model:,82.0
Date:,"Sat, 29 Apr 2023",Pseudo R-squ.:,0.4279
Time:,14:08:34,Log-Likelihood:,-8162.7
converged:,True,LL-Null:,-14268.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2771,0.010,26.698,0.000,0.257,0.297
stand scaler__Age,0.3267,0.025,13.109,0.000,0.278,0.376
stand scaler__Capital Gain,2.2961,0.084,27.341,0.000,2.132,2.461
stand scaler__Capital Loss,0.2645,0.017,15.861,0.000,0.232,0.297
stand scaler__Hours per week,0.3772,0.022,16.854,0.000,0.333,0.421
onehot__Workclass_ Local-gov,-0.7610,0.125,-6.079,0.000,-1.006,-0.516
onehot__Workclass_ Never-worked,-0.2470,,,,,
onehot__Workclass_ Private,-0.5799,0.104,-5.566,0.000,-0.784,-0.376
onehot__Workclass_ Self-emp-inc,-0.4011,0.138,-2.915,0.004,-0.671,-0.131


According to the Logit() results, all of the numerical features are statistically significant. Some categories in a couple of categorical features have to be clustered, as they are insignificant.

Assumption 1. People whose Workclass says that they do not work or that they work without pay will earn less than 50k, so these classes can become one cluster.

Assumption 2. Single people tend to earn more, as they have more free time for career development, so the values of the Marital Status feature can be clustered into Single and Married.

Assumption 3. Occupation appears to have no impact on Income, as all of its categories are insignificant, so it could be removed from the model. For now, however, it is kept as it is, because the EDA showed that this feature is significant.

Assumption 4. All categories of Relationship and Sex features are significant.

Assumption 5. Most of the Ethnic Groups have no impact on the target; it's possible to cluster them according to their imbalance: white and others.

# 3rd model
### Clustering categories of features


In [17]:
data_clustered = data_train.copy()

data_clustered = cluster_categorical(data_clustered)

In [18]:
data_clustered.head()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
6491,46,State-gov,353012,HS-grad,Married,Transport-moving,Family,White,Male,0,0,40,Developed,<=50K
22605,42,State-gov,136996,Some-college,Married,Protective-serv,Family,Black,Male,0,0,48,Developed,<=50K
297,39,,157443,Masters,Married,,Family,Asian-Pac-Islander,Female,3464,0,40,,<=50K
16728,44,Self-emp-not-inc,90021,HS-grad,Married,Craft-repair,Family,White,Male,0,0,30,Developed,<=50K
29334,27,Private,89598,Some-college,Single,Adm-clerical,Not-in-Family,White,Female,0,0,60,Developed,<=50K


Let's now apply the pipeline to updated dataset

In [19]:
# Feature groups for the clustered model; 'final weight' is not among the numerical features.
numerical_features_list = [
    'Age',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [20]:
data_clustered_and_preprocessed = preprocess_data(data_clustered, numerical_features_list, categorical_features_list)

get_LR_performance(data_clustered_and_preprocessed)

f1 score: mean = 0.78 | std = 0.0
              precision    recall  f1-score   support

           0       0.88      0.93      0.91     19871
           1       0.74      0.60      0.66      6177

    accuracy                           0.85     26048
   macro avg       0.81      0.77      0.78     26048
weighted avg       0.85      0.85      0.85     26048



In [21]:
summary = logit_summary(data_clustered_and_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3208294669899564
            Iterations: 265
            Function evaluations: 267
            Gradient evaluations: 265


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,26048.0
Model:,Logit,Df Residuals:,26013.0
Method:,MLE,Df Model:,34.0
Date:,"Sat, 29 Apr 2023",Pseudo R-squ.:,0.4143
Time:,14:08:36,Log-Likelihood:,-8357.0
converged:,True,LL-Null:,-14268.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2367,0.009,25.298,0.000,0.218,0.255
stand scaler__Age,0.3420,0.023,14.698,0.000,0.296,0.388
stand scaler__Capital Gain,2.3373,0.084,27.984,0.000,2.174,2.501
stand scaler__Capital Loss,0.2717,0.017,16.431,0.000,0.239,0.304
stand scaler__Hours per week,0.3811,0.022,17.300,0.000,0.338,0.424
onehot__Workclass_ Local-gov,-1.1474,0.117,-9.823,0.000,-1.376,-0.918
onehot__Workclass_ Private,-0.9800,0.095,-10.363,0.000,-1.165,-0.795
onehot__Workclass_ Self-emp-inc,-0.7700,0.131,-5.870,0.000,-1.027,-0.513
onehot__Workclass_ Self-emp-not-inc,-1.4258,0.114,-12.460,0.000,-1.650,-1.201


1. Workclass 'Without pay' is still insignificant; we will try to remove these instances (there is only a small number of them)
2. Some Occupations are insignificant
3. Relationships became significant
4. Sex became insignificant
5. All NaNs are insignificant

Let's check the unique values of Occupation:

In [22]:
data_clustered['Occupation'].value_counts()

Occupation
 Exec-managerial      3285
 Craft-repair         3260
 Prof-specialty       3239
 Adm-clerical         3067
 Sales                2892
 Other-service        2663
 Machine-op-inspct    1616
 Transport-moving     1288
 Handlers-cleaners    1108
 Farming-fishing       783
 Tech-support          727
 Protective-serv       518
 Priv-house-serv       122
 Armed-Forces            7
Name: count, dtype: int64

Armed Forces is the smallest group, and it is insignificant in all configurations of the model. Let's try to remove it later.

# 4th model

Let's try to remove missing data

In [23]:
# Work on a copy of the training split and keep only rows without any missing value.
data_no_nan = data_train.copy().dropna(axis='index', how='any')

In [24]:
data_no_nan.shape

(24126, 14)

In [25]:
data_no_nan_clustered = cluster_categorical(data_no_nan)
data_no_nan_clustered.head()


Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
6491,46,State-gov,353012,HS-grad,Married,Transport-moving,Family,White,Male,0,0,40,Developed,<=50K
22605,42,State-gov,136996,Some-college,Married,Protective-serv,Family,Black,Male,0,0,48,Developed,<=50K
16728,44,Self-emp-not-inc,90021,HS-grad,Married,Craft-repair,Family,White,Male,0,0,30,Developed,<=50K
29334,27,Private,89598,Some-college,Single,Adm-clerical,Not-in-Family,White,Female,0,0,60,Developed,<=50K
12217,57,Private,217886,HS-grad,Single,Adm-clerical,Not-in-Family,White,Female,0,0,36,Developed,<=50K


In [26]:
# Feature groups for the model without missing data; 'final weight' is among the numerical features here.
numerical_features_list = [
    'Age',
    'final weight',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [27]:
data_no_nan_clustered_and_preprocessed = preprocess_data(data_no_nan_clustered, numerical_features_list, categorical_features_list)
get_LR_performance(data_no_nan_clustered_and_preprocessed)

f1 score: mean = 0.78 | std = 0.0
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     18210
           1       0.74      0.61      0.67      5916

    accuracy                           0.85     24126
   macro avg       0.81      0.77      0.79     24126
weighted avg       0.84      0.85      0.85     24126



In [28]:
summary = logit_summary(data_no_nan_clustered_and_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.32790365332726595
            Iterations: 249
            Function evaluations: 251
            Gradient evaluations: 249


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,24126.0
Model:,Logit,Df Residuals:,24093.0
Method:,MLE,Df Model:,32.0
Date:,"Sat, 29 Apr 2023",Pseudo R-squ.:,0.4113
Time:,14:08:38,Log-Likelihood:,-7911.0
converged:,True,LL-Null:,-13439.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2374,0.010,24.268,0.000,0.218,0.257
stand scaler__Age,0.3386,0.023,14.510,0.000,0.293,0.384
stand scaler__final weight,0.0880,0.021,4.262,0.000,0.048,0.129
stand scaler__Capital Gain,2.3838,0.088,27.215,0.000,2.212,2.556
stand scaler__Capital Loss,0.2683,0.017,15.644,0.000,0.235,0.302
stand scaler__Hours per week,0.3776,0.022,16.820,0.000,0.334,0.422
onehot__Workclass_ Local-gov,-1.1310,0.118,-9.556,0.000,-1.363,-0.899
onehot__Workclass_ Private,-0.9605,0.096,-9.991,0.000,-1.149,-0.772
onehot__Workclass_ Self-emp-inc,-0.7307,0.134,-5.466,0.000,-0.993,-0.469


This approach to data preprocessing gave us the best result so far. We reduced the computational cost by reducing the dimensionality, while the performance stayed the same. However, the Sex feature became insignificant. It's still not a good model though.
# 5th model
### Let's try to apply the ln() function to the 'Age', 'Capital Gain' and 'Capital Loss' features (as they are heavy-tailed) before the Standard Scaler to normalize them

In [29]:
# Start from a copy of the training split with every incomplete row removed.
data_logged = data_train.copy().dropna(axis='index', how='any')

In [30]:
# The capital columns are shifted by one before taking the logarithm.
for capital_column in ('Capital Gain', 'Capital Loss'):
    data_logged[capital_column] = np.log(1 + data_logged[capital_column])

# Age is log-transformed without the shift.
data_logged['Age'] = np.log(data_logged['Age'])

data_logged = cluster_categorical(data_logged)

In [31]:
# Feature groups for the log-transformed model.
numerical_features_list = [
    'Age',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [32]:
data_logged_and_preprocessed = preprocess_data(data_logged, numerical_features_list, categorical_features_list)
get_LR_performance(data_logged_and_preprocessed)

f1 score: mean = 0.78 | std = 0.0
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     18210
           1       0.72      0.60      0.65      5916

    accuracy                           0.85     24126
   macro avg       0.80      0.76      0.78     24126
weighted avg       0.84      0.85      0.84     24126



In [33]:
summary = logit_summary(data_logged_and_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.33855214545296564
            Iterations: 241
            Function evaluations: 244
            Gradient evaluations: 241


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,24126.0
Model:,Logit,Df Residuals:,24094.0
Method:,MLE,Df Model:,31.0
Date:,"Sat, 29 Apr 2023",Pseudo R-squ.:,0.3922
Time:,14:08:40,Log-Likelihood:,-8167.9
converged:,True,LL-Null:,-13439.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2350,0.010,24.573,0.000,0.216,0.254
stand scaler__Age,0.4514,0.026,17.613,0.000,0.401,0.502
stand scaler__Capital Gain,0.5068,0.018,27.764,0.000,0.471,0.543
stand scaler__Capital Loss,0.2446,0.017,14.345,0.000,0.211,0.278
stand scaler__Hours per week,0.3700,0.022,16.824,0.000,0.327,0.413
onehot__Workclass_ Local-gov,-1.1962,0.116,-10.296,0.000,-1.424,-0.968
onehot__Workclass_ Private,-0.9614,0.094,-10.197,0.000,-1.146,-0.777
onehot__Workclass_ Self-emp-inc,-0.7418,0.131,-5.677,0.000,-0.998,-0.486
onehot__Workclass_ Self-emp-not-inc,-1.4083,0.114,-12.386,0.000,-1.631,-1.185


Not better either.

# 6th model
### Another try is to cluster 'Hours per week' feature to part-time, fulltime and overtime workers with fulltime value for 40 hours

In [34]:
# Copy of the training split without incomplete rows.
data_hpw = data_train.copy().dropna(how='any')

# Bucket the weekly hours: exactly 40 -> 'fulltime', below 40 -> 'part-time',
# everything else -> 'overtime'. Conditions are checked in order.
data_hpw['Hours per week'] = np.select(
    [data_hpw['Hours per week'] == 40, data_hpw['Hours per week'] < 40],
    ['fulltime', 'part-time'],
    default='overtime',
)

data_hpw = cluster_categorical(data_hpw)

In [35]:
data_hpw.head()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
6491,46,State-gov,353012,HS-grad,Married,Transport-moving,Family,White,Male,0,0,fulltime,Developed,<=50K
22605,42,State-gov,136996,Some-college,Married,Protective-serv,Family,Black,Male,0,0,overtime,Developed,<=50K
16728,44,Self-emp-not-inc,90021,HS-grad,Married,Craft-repair,Family,White,Male,0,0,part-time,Developed,<=50K
29334,27,Private,89598,Some-college,Single,Adm-clerical,Not-in-Family,White,Female,0,0,overtime,Developed,<=50K
12217,57,Private,217886,HS-grad,Single,Adm-clerical,Not-in-Family,White,Female,0,0,part-time,Developed,<=50K


In [36]:
# 'Hours per week' now holds category labels, so it is listed with the categorical features.
numerical_features_list = [
    'Age',
    'Capital Gain',
    'Capital Loss',
]
categorical_features_list = [
    'Hours per week',
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [37]:
data_hpw_preprocessed = preprocess_data(data_hpw, numerical_features_list, categorical_features_list)
get_LR_performance(data_hpw_preprocessed)

f1 score: mean = 0.78 | std = 0.0
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     18210
           1       0.74      0.60      0.67      5916

    accuracy                           0.85     24126
   macro avg       0.81      0.77      0.78     24126
weighted avg       0.85      0.85      0.85     24126



In [38]:
summary = logit_summary(data_hpw_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3268951896631991
            Iterations: 253
            Function evaluations: 255
            Gradient evaluations: 253


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,24126.0
Model:,Logit,Df Residuals:,24093.0
Method:,MLE,Df Model:,32.0
Date:,"Sat, 29 Apr 2023",Pseudo R-squ.:,0.4131
Time:,14:08:42,Log-Likelihood:,-7886.7
converged:,True,LL-Null:,-13439.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2333,0.010,23.777,0.000,0.214,0.253
stand scaler__Age,0.3398,0.023,14.528,0.000,0.294,0.386
stand scaler__Capital Gain,2.3737,0.088,27.041,0.000,2.202,2.546
stand scaler__Capital Loss,0.2681,0.017,15.622,0.000,0.234,0.302
onehot__Hours per week_overtime,0.4939,0.045,11.020,0.000,0.406,0.582
onehot__Hours per week_part-time,-0.7596,0.069,-10.992,0.000,-0.895,-0.624
onehot__Workclass_ Local-gov,-1.0998,0.119,-9.269,0.000,-1.332,-0.867
onehot__Workclass_ Private,-0.9517,0.096,-9.863,0.000,-1.141,-0.763
onehot__Workclass_ Self-emp-inc,-0.6467,0.134,-4.828,0.000,-0.909,-0.384


Hours per week works great like this: both of its categories are significant. However, it did not affect the model's accuracy, and the Sex feature became even more insignificant compared to the previous model.

# 7th model
### Let's now try to cluster all minority categories of imbalanced features together

In [39]:
data_cluster_imbalanced = data_train.copy()

def balance_predictors(data):
    """Merge minority categories of several features, modifying ``data`` in place.

    * 'Workclass', 'Ethnic group', 'Country': every value different from
      ' Private' / ' White' / ' United-States' respectively (missing values
      included) becomes 'Other'.
    * 'Marital Status', 'Occupation', 'Relationship': the listed categories
      are merged under 'Single', 'Services', 'Family' and 'Not-in-Family'.
    * 'Hours per week': 40 becomes 'fulltime', values below 40 'part-time',
      everything else 'overtime'.

    Args:
        data (pd.DataFrame): frame containing the columns named above; it is
            changed in place.

    Returns:
        None
    """
    # Features reduced to "dominant value vs. 'Other'".
    dominant_values = (
        ('Workclass', ' Private'),
        ('Ethnic group', ' White'),
        ('Country', ' United-States'),
    )
    for column, dominant in dominant_values:
        data[column] = data[column].mask(data[column] != dominant, 'Other')

    # (column, categories to merge, label of the merged category)
    merged_groups = (
        ('Marital Status',
         [' Widowed', ' Separated', ' Married-spouse-absent', ' Never-married', ' Divorced'],
         "Single"),
        ('Occupation',
         [' Craft-repair', ' Other-service', ' Priv-house-serv', ' Protective-serv'],
         "Services"),
        ('Relationship',
         [' Husband', ' Wife', ' Own-child'],
         "Family"),
        ('Relationship',
         [' Not-in-family', ' Unmarried', ' Other-relative'],
         "Not-in-Family"),
    )
    for column, members, label in merged_groups:
        data.loc[data[column].isin(members), column] = label

    # Conditions are evaluated in order; anything matching neither is 'overtime'.
    data['Hours per week'] = np.select(
        [data['Hours per week'] == 40, data['Hours per week'] < 40],
        ['fulltime', 'part-time'],
        default='overtime',
    )
    
balance_predictors(data_cluster_imbalanced)
data_cluster_imbalanced.sample(3)

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
19403,21,Private,175374,HS-grad,Single,Adm-clerical,Family,White,Male,0,0,fulltime,United-States,<=50K
30797,33,Private,138142,Some-college,Single,Services,Not-in-Family,Other,Female,0,0,part-time,United-States,<=50K
11967,76,Private,239880,HS-grad,Single,Adm-clerical,Not-in-Family,White,Female,0,0,part-time,United-States,<=50K


In [40]:
# Feature groups for the model with merged minority categories.
numerical_features_list = [
    'Age',
    'Capital Gain',
    'Capital Loss',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Hours per week',
    'Country',
]

In [41]:
data_cluster_imbalanced_preprocessed = preprocess_data(data_cluster_imbalanced, numerical_features_list, categorical_features_list)
get_LR_performance(data_cluster_imbalanced_preprocessed)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

           0       0.88      0.93      0.91     19871
           1       0.73      0.59      0.65      6177

    accuracy                           0.85     26048
   macro avg       0.81      0.76      0.78     26048
weighted avg       0.84      0.85      0.85     26048



In [42]:
summary = logit_summary(data_cluster_imbalanced_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3206303522992818
            Iterations: 176
            Function evaluations: 178
            Gradient evaluations: 176


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,26048.0
Model:,Logit,Df Residuals:,26024.0
Method:,MLE,Df Model:,23.0
Date:,"Sat, 29 Apr 2023",Pseudo R-squ.:,0.4147
Time:,14:08:43,Log-Likelihood:,-8351.8
converged:,True,LL-Null:,-14268.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2774,0.010,27.632,0.000,0.258,0.297
stand scaler__Age,0.3624,0.023,15.526,0.000,0.317,0.408
stand scaler__Capital Gain,2.3059,0.083,27.793,0.000,2.143,2.468
stand scaler__Capital Loss,0.2694,0.017,16.282,0.000,0.237,0.302
onehot__Workclass_Other,-0.0455,0.044,-1.023,0.306,-0.133,0.042
onehot__Marital Status_ Married-civ-spouse,-2.9694,0.121,-24.600,0.000,-3.206,-2.733
onehot__Marital Status_Single,-5.9693,0.165,-36.068,0.000,-6.294,-5.645
onehot__Occupation_ Armed-Forces,-0.9724,1.444,-0.674,0.501,-3.802,1.857
onehot__Occupation_ Exec-managerial,0.6390,0.083,7.743,0.000,0.477,0.801


**So far, we have checked 7 models, none of them gave any performance improvement, although we reached faster convergence. Let's now try to build the last model, that would contain all best ideas of 7 models above:**

# Final Model

In [43]:
# The final model starts from a copy of the training split without incomplete rows.
data_final = data_train.copy().dropna(axis='index', how='any')

In [44]:
data_final['Occupation'].value_counts()

Occupation
 Exec-managerial      3226
 Craft-repair         3209
 Prof-specialty       3162
 Adm-clerical         3029
 Sales                2841
 Other-service        2591
 Machine-op-inspct    1585
 Transport-moving     1265
 Handlers-cleaners    1091
 Farming-fishing       779
 Tech-support          711
 Protective-serv       513
 Priv-house-serv       117
 Armed-Forces            7
Name: count, dtype: int64

In [45]:
# Bucket the weekly hours: exactly 40 -> 'fulltime', below 40 -> 'part-time',
# everything else -> 'overtime'. Conditions are checked in order.
data_final['Hours per week'] = np.select(
    [data_final['Hours per week'] == 40, data_final['Hours per week'] < 40],
    ['fulltime', 'part-time'],
    default='overtime',
)

In [46]:
data_final = cluster_categorical(data_final)

# Remove the rows of the listed Workclass and Occupation categories in one pass.
data_final = data_final[
    ~(
        data_final['Workclass'].isin([' Never-worked', ' Without-pay'])
        | data_final['Occupation'].isin([' Armed-Forces'])
    )
]

# Merge the listed occupations into a single 'Services' category.
data_final.loc[
    data_final["Occupation"].isin(
        [' Craft-repair', ' Other-service', ' Priv-house-serv', ' Protective-serv']
    ),
    "Occupation",
] = "Services"

data_final.head()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
6491,46,State-gov,353012,HS-grad,Married,Transport-moving,Family,White,Male,0,0,fulltime,Developed,<=50K
22605,42,State-gov,136996,Some-college,Married,Services,Family,Black,Male,0,0,overtime,Developed,<=50K
16728,44,Self-emp-not-inc,90021,HS-grad,Married,Services,Family,White,Male,0,0,part-time,Developed,<=50K
29334,27,Private,89598,Some-college,Single,Adm-clerical,Not-in-Family,White,Female,0,0,overtime,Developed,<=50K
12217,57,Private,217886,HS-grad,Single,Adm-clerical,Not-in-Family,White,Female,0,0,part-time,Developed,<=50K


In [47]:
# Feature groups for the final model; 'Sex' is not listed and 'Hours per week' is categorical.
numerical_features_list = [
    'Age',
    'final weight',
    'Capital Gain',
    'Capital Loss',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Hours per week',
    'Country',
]

In [48]:
data_final_preprocessed = preprocess_data(data_final, numerical_features_list, categorical_features_list)
get_LR_performance(data_final_preprocessed)

f1 score: mean = 0.78 | std = 0.0
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     18192
           1       0.74      0.60      0.66      5915

    accuracy                           0.85     24107
   macro avg       0.81      0.77      0.78     24107
weighted avg       0.84      0.85      0.85     24107



In [49]:
summary = logit_summary(data_final_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3287103752724679
            Iterations: 169
            Function evaluations: 171
            Gradient evaluations: 169


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,24107.0
Model:,Logit,Df Residuals:,24079.0
Method:,MLE,Df Model:,27.0
Date:,"Sat, 29 Apr 2023",Pseudo R-squ.:,0.4101
Time:,14:08:44,Log-Likelihood:,-7924.2
converged:,True,LL-Null:,-13432.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2361,0.010,24.278,0.000,0.217,0.255
stand scaler__Age,0.3414,0.023,14.662,0.000,0.296,0.387
stand scaler__final weight,0.0919,0.021,4.458,0.000,0.052,0.132
stand scaler__Capital Gain,2.3798,0.088,27.087,0.000,2.208,2.552
stand scaler__Capital Loss,0.2691,0.017,15.693,0.000,0.236,0.303
onehot__Workclass_ Local-gov,-1.0061,0.117,-8.587,0.000,-1.236,-0.776
onehot__Workclass_ Private,-0.9842,0.097,-10.197,0.000,-1.173,-0.795
onehot__Workclass_ Self-emp-inc,-0.6635,0.134,-4.950,0.000,-0.926,-0.401
onehot__Workclass_ Self-emp-not-inc,-1.3417,0.117,-11.458,0.000,-1.571,-1.112


#### Even though the model performance has not improved compared to the initial model, we were able to reduce the number of iterations until convergence by more than half. We have also built a model where all features are statistically significant

### List of changes to initial data:
##### 1. Remove all missing data
##### 2. Remove classes that are poorly represented, including Workclass: 'Never worked' and 'Without pay'; Occupation: 'Armed Forces'
##### 3. Clustered most of the categorical features to reduce dimensionality (from **83** in the initial model to **28** in the final model)