In [1]:
import pandas as pd
import numpy as np
from common_functions import cluster_categorical, preprocess_data
from statsmodels.api import Logit
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

Here the results from 'Adult EDA" file are going to be used

In [2]:
adult_columns = [
    "Age",
    "Workclass",
    "final weight",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Ethnic group",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "Income",
]

df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", 
                 header = None, names = adult_columns)
df = df.replace(to_replace= ' ?', value = np.nan)

In [3]:
df = df.drop(['Education-Num'], axis = 'columns')

In [4]:
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [5]:
for col in categorical_features_list:
    print(f'{col}\n{df[col].value_counts()}\n')

Workclass
Workclass
 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: count, dtype: int64

Marital Status
Marital Status
 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: count, dtype: int64

Occupation
Occupation
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: count, dtype: int64

Relationship
Relationship
 Hu

As in the whole dataset (X) we have only one 'Holand-Netherlands' value in 'Country' column, we have to process it separately, because in case if it appears in the test set, model will not be able to predict target for such a record. For the initial model, where there are no changes in data, this observation will be removed

In [6]:
print(df.shape)
df_no_nl = df.copy()
df_no_nl.drop(df_no_nl.loc[df['Country']==' Holand-Netherlands'].index, inplace=True)
print(df_no_nl.shape)

(32561, 14)
(32560, 14)


In [7]:
data_train, data_test = train_test_split(df_no_nl, test_size = 0.2)
print(data_train.shape)
data_train.head()

(26048, 14)


Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
25687,70,Private,195739,10th,Widowed,Craft-repair,Unmarried,White,Male,0,0,45,United-States,<=50K
6391,41,State-gov,47170,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Amer-Indian-Eskimo,Female,0,0,48,United-States,>50K
8386,29,Private,204862,Assoc-acdm,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
16224,28,Self-emp-not-inc,54683,Some-college,Never-married,Craft-repair,Not-in-family,White,Male,0,1590,40,United-States,<=50K
18011,44,Local-gov,387770,Some-college,Widowed,Adm-clerical,Unmarried,White,Female,0,0,15,United-States,<=50K


In this dataset we have only one feature, where the order matters - Education, so it will be transformed with using OrdinalEncoder. For all the rest of categorical features the order does not matter, hense we can apply OneHotEncoder() to them.

# 1st model 
### Inital model without changes in data

In [8]:
numerical_features_list = ['Age', 'final weight', 'Capital Gain', 'Capital Loss', 'Hours per week']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [9]:
data_no_transform = data_train.copy()
TARGET = 'Income'
preprocesseded_data = preprocess_data(data_no_transform, 
                                        TARGET, 
                                        numerical_features_list, 
                                        categorical_features_list, 
                                        ordinal_feature= 'Education', 
                                        order_of_categories=[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                            ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                            ' Bachelors',' Masters',' Prof-school',' Doctorate'])
preprocesseded_data.sample()

Unnamed: 0,Income,ordinal__Education,stand scaler__Age,stand scaler__final weight,stand scaler__Capital Gain,stand scaler__Capital Loss,stand scaler__Hours per week,onehot__Workclass_ Local-gov,onehot__Workclass_ Never-worked,onehot__Workclass_ Private,...,onehot__Country_ Puerto-Rico,onehot__Country_ Scotland,onehot__Country_ South,onehot__Country_ Taiwan,onehot__Country_ Thailand,onehot__Country_ Trinadad&Tobago,onehot__Country_ United-States,onehot__Country_ Vietnam,onehot__Country_ Yugoslavia,onehot__Country_nan
10304,1,12.0,0.318495,0.984841,-0.145094,4.492499,-0.033665,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [10]:
# A function, that performs all needed data preparation and feeds it to LogisticRegression


def get_LR_performance(data: pd.DataFrame, target: str = 'Income'):
    """The function performs data preprocessing, creates pipeline with LogisticRegression model, 
        and prints it's performance out

    Args:
        data: full dataset with features and target
        numerical_features_list (list): list of features, that have to be processed by Standard scaler
        categorical_features_list (list): list of features, that have to be processed by OneHotEncoder
    """
    X = data.drop(columns=[target])
    y = data[target]

    model = LogisticRegression(max_iter = 10000)
    model.fit(X, y)
    y_pred = model.predict(X)
    
    scores = cross_val_score(model, X, y, cv=5, scoring='f1_macro')
    f1_mean_score = round(np.mean(scores),2)
    f1_std = round(np.std(scores),2)
    
    report = classification_report(y, y_pred)
   
    print(f'f1 score: mean = {f1_mean_score} | std = {f1_std}')
    print(report)


In [11]:
get_LR_performance(preprocesseded_data)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

           0       0.88      0.93      0.91     19777
           1       0.74      0.61      0.67      6271

    accuracy                           0.85     26048
   macro avg       0.81      0.77      0.79     26048
weighted avg       0.85      0.85      0.85     26048



### Let's now understand significance of features with the help of Logit() function from statsmodel

In [12]:
def logit_summary(data: pd.DataFrame, target: str = 'Income'):
    """ Function applies Logit() function to already preprocessed datan. After that retuns summary which contains featues significances

    Args:
        data (pd.DataFrame): DataFrame of features and target

    Returns:
        Summary: summary of statsmodel Logit() model with the help of which the decision about 
                keeping or modifying/removing a feature can be made
    """

    X = data.drop(columns = [target])
    y = data[target]

    model = Logit(y, X).fit_regularized()
    summary = model.summary()
    
    return summary

In [13]:
summary = logit_summary(preprocesseded_data)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3170621757556953
            Iterations: 586
            Function evaluations: 589
            Gradient evaluations: 586


0,1,2,3
Dep. Variable:,Income,No. Observations:,26048.0
Model:,Logit,Df Residuals:,25964.0
Method:,MLE,Df Model:,83.0
Date:,"Sun, 16 Jul 2023",Pseudo R-squ.:,0.4256
Time:,18:37:53,Log-Likelihood:,-8258.8
converged:,True,LL-Null:,-14377.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2831,0.010,27.484,0.000,0.263,0.303
stand scaler__Age,0.3589,0.025,14.527,0.000,0.311,0.407
stand scaler__final weight,0.0828,0.020,4.073,0.000,0.043,0.123
stand scaler__Capital Gain,2.3542,0.085,27.644,0.000,2.187,2.521
stand scaler__Capital Loss,0.2526,0.017,15.138,0.000,0.220,0.285
stand scaler__Hours per week,0.3761,0.022,16.898,0.000,0.332,0.420
onehot__Workclass_ Local-gov,-0.7234,0.125,-5.768,0.000,-0.969,-0.478
onehot__Workclass_ Never-worked,-0.2259,8.39e+06,-2.69e-08,1.000,-1.64e+07,1.64e+07
onehot__Workclass_ Private,-0.5762,0.104,-5.515,0.000,-0.781,-0.371


# 2nd model
### Same model, but without 'final weight'

As we remember from EDA, **'final weight'** feature did not pass the significance border. Let's try to remove it and check the performance

In [14]:
preprocesseded_data.columns[:5]

Index(['Income', 'ordinal__Education', 'stand scaler__Age',
       'stand scaler__final weight', 'stand scaler__Capital Gain'],
      dtype='object')

In [15]:
# to save some computational complexity and because we use the same dataset, we can just remove the 'final weight' feature right from the prepdossed_data df:

preprocesseded_data_no_fw = preprocesseded_data.copy().drop(['stand scaler__final weight'], axis = 'columns')
get_LR_performance(preprocesseded_data_no_fw)

f1 score: mean = 0.79 | std = 0.01
              precision    recall  f1-score   support

           0       0.88      0.93      0.91     19777
           1       0.74      0.61      0.67      6271

    accuracy                           0.85     26048
   macro avg       0.81      0.77      0.79     26048
weighted avg       0.85      0.85      0.85     26048



#### Performance in general has not changed, let's check if features' significances have changed

In [16]:
summary = logit_summary(preprocesseded_data_no_fw)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3173787870166672
            Iterations: 596
            Function evaluations: 598
            Gradient evaluations: 596


0,1,2,3
Dep. Variable:,Income,No. Observations:,26048.0
Model:,Logit,Df Residuals:,25965.0
Method:,MLE,Df Model:,82.0
Date:,"Sun, 16 Jul 2023",Pseudo R-squ.:,0.425
Time:,18:38:11,Log-Likelihood:,-8267.1
converged:,True,LL-Null:,-14377.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2830,0.010,27.482,0.000,0.263,0.303
stand scaler__Age,0.3531,0.025,14.328,0.000,0.305,0.401
stand scaler__Capital Gain,2.3564,0.085,27.678,0.000,2.190,2.523
stand scaler__Capital Loss,0.2520,0.017,15.124,0.000,0.219,0.285
stand scaler__Hours per week,0.3737,0.022,16.804,0.000,0.330,0.417
onehot__Workclass_ Local-gov,-0.7195,0.125,-5.742,0.000,-0.965,-0.474
onehot__Workclass_ Never-worked,-0.2172,5.66e+06,-3.83e-08,1.000,-1.11e+07,1.11e+07
onehot__Workclass_ Private,-0.5701,0.104,-5.463,0.000,-0.775,-0.366
onehot__Workclass_ Self-emp-inc,-0.4101,0.137,-2.994,0.003,-0.679,-0.142


According to Logit() results, all of numerical features are statistically significant. Some categoties in a couple of categotical features have to be clustered as they are insignificant. 

Assumption 1. Workclasses representatives, that do not work or work without pay will have less than 50k, so can become one cluster.

Assumption 2. Single people tend to earn more, as they have more free time for career development; so values of Marital Status feature can be clustered to Sigle and Married 

Assumption 3. Occupation has no impact on Income, as all categories are insignificant, so could be removed from the model. But before, they will be left like this, as from the EDA we saw that this feature is significant

Assumption 4. All categories of Relationship and Sex features are significant.

Assumption 5. Most of Ethnic Groups have no impact on target, it's possible to cluster them according to their inbalancy: white and others

# 3rd model
### Clustering categories of features


In [17]:
data_clustered = data_train.copy()

data_clustered = cluster_categorical(data_clustered)

In [18]:
data_clustered.head()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
25687,70,Private,195739,10th,Single,Craft-repair,Not-in-Family,White,Male,0,0,45,Developing,<=50K
6391,41,State-gov,47170,Bachelors,Single,Prof-specialty,Not-in-Family,Amer-Indian-Eskimo,Female,0,0,48,Developing,>50K
8386,29,Private,204862,Assoc-acdm,Single,Prof-specialty,Not-in-Family,White,Female,0,0,36,Developing,<=50K
16224,28,Self-emp-not-inc,54683,Some-college,Single,Craft-repair,Not-in-Family,White,Male,0,1590,40,Developing,<=50K
18011,44,Local-gov,387770,Some-college,Single,Adm-clerical,Not-in-Family,White,Female,0,0,15,Developing,<=50K


Let's now apply the pipeline to updated dataset

In [19]:
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss', 'Hours per week']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [20]:
data_clustered_and_preprocessed = preprocess_data(data_clustered, 
                                                TARGET, 
                                                numerical_features_list, 
                                                categorical_features_list, 
                                                ordinal_feature= 'Education', 
                                                order_of_categories=[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                                    ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                                    ' Bachelors',' Masters',' Prof-school',' Doctorate'])

get_LR_performance(data_clustered_and_preprocessed)

f1 score: mean = 0.74 | std = 0.01
              precision    recall  f1-score   support

           0       0.85      0.94      0.90     19777
           1       0.73      0.49      0.58      6271

    accuracy                           0.83     26048
   macro avg       0.79      0.71      0.74     26048
weighted avg       0.82      0.83      0.82     26048



In [21]:
summary = logit_summary(data_clustered_and_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3840195715946806
            Iterations: 259
            Function evaluations: 262
            Gradient evaluations: 259


0,1,2,3
Dep. Variable:,Income,No. Observations:,26048.0
Model:,Logit,Df Residuals:,26016.0
Method:,MLE,Df Model:,31.0
Date:,"Sun, 16 Jul 2023",Pseudo R-squ.:,0.3042
Time:,18:38:18,Log-Likelihood:,-10003.0
converged:,True,LL-Null:,-14377.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.1845,0.008,22.213,0.000,0.168,0.201
stand scaler__Age,0.5520,0.020,27.563,0.000,0.513,0.591
stand scaler__Capital Gain,2.4177,0.084,28.782,0.000,2.253,2.582
stand scaler__Capital Loss,0.2671,0.015,17.749,0.000,0.238,0.297
stand scaler__Hours per week,0.3965,0.020,19.932,0.000,0.358,0.435
onehot__Workclass_ Local-gov,-1.2755,0.105,-12.186,0.000,-1.481,-1.070
onehot__Workclass_ Never-worked,-10.8128,2.06e+07,-5.26e-07,1.000,-4.03e+07,4.03e+07
onehot__Workclass_ Private,-1.2237,0.084,-14.562,0.000,-1.388,-1.059
onehot__Workclass_ Self-emp-inc,-0.9087,0.118,-7.679,0.000,-1.141,-0.677


1. Workclass 'Without pay' is still innsignificant, will try to remove these instances (there is a small amount of them)
2. Some Occupations are insignifficant
3. Relationships became signifficant
4. Sex became insignifficant
5. All NaNs are insignifficant

Let's check unique values of Ocupation:

In [22]:
data_clustered['Occupation'].value_counts()

Occupation
 Prof-specialty       3313
 Craft-repair         3271
 Exec-managerial      3265
 Adm-clerical         2997
 Sales                2909
 Other-service        2634
 Machine-op-inspct    1617
 Transport-moving     1279
 Handlers-cleaners    1095
 Farming-fishing       783
 Tech-support          748
 Protective-serv       518
 Priv-house-serv       126
 Armed-Forces            7
Name: count, dtype: int64

Armed Forces is the smallest group, but is insignifficant in all configurations of the model. Let's try to remove it later

# 4th model

Let's try to remove missing data

In [23]:
data_no_nan = data_train.copy()
data_no_nan = data_no_nan.dropna(how='any')

In [24]:
data_no_nan.shape

(24106, 14)

In [25]:
data_no_nan_clustered = cluster_categorical(data_no_nan)
data_no_nan_clustered.head()


Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
25687,70,Private,195739,10th,Single,Craft-repair,Not-in-Family,White,Male,0,0,45,Developing,<=50K
6391,41,State-gov,47170,Bachelors,Single,Prof-specialty,Not-in-Family,Amer-Indian-Eskimo,Female,0,0,48,Developing,>50K
8386,29,Private,204862,Assoc-acdm,Single,Prof-specialty,Not-in-Family,White,Female,0,0,36,Developing,<=50K
16224,28,Self-emp-not-inc,54683,Some-college,Single,Craft-repair,Not-in-Family,White,Male,0,1590,40,Developing,<=50K
18011,44,Local-gov,387770,Some-college,Single,Adm-clerical,Not-in-Family,White,Female,0,0,15,Developing,<=50K


In [26]:
numerical_features_list = ['Age', 'final weight', 'Capital Gain', 'Capital Loss', 'Hours per week']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [27]:
data_no_nan_clustered_and_preprocessed = preprocess_data(data_no_nan_clustered, 
                                                        TARGET, 
                                                        numerical_features_list, 
                                                        categorical_features_list, 
                                                        ordinal_feature= 'Education', 
                                                        order_of_categories=[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                                        ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                                        ' Bachelors',' Masters',' Prof-school',' Doctorate']
                                                        )
get_LR_performance(data_no_nan_clustered_and_preprocessed)

f1 score: mean = 0.74 | std = 0.01
              precision    recall  f1-score   support

           0       0.85      0.94      0.89     18098
           1       0.73      0.49      0.59      6008

    accuracy                           0.83     24106
   macro avg       0.79      0.72      0.74     24106
weighted avg       0.82      0.83      0.82     24106



In [28]:
summary = logit_summary(data_no_nan_clustered_and_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.39219190439107693
            Iterations: 227
            Function evaluations: 230
            Gradient evaluations: 227


0,1,2,3
Dep. Variable:,Income,No. Observations:,24106.0
Model:,Logit,Df Residuals:,24076.0
Method:,MLE,Df Model:,29.0
Date:,"Sun, 16 Jul 2023",Pseudo R-squ.:,0.3015
Time:,18:38:32,Log-Likelihood:,-9454.2
converged:,True,LL-Null:,-13535.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.1819,0.009,20.961,0.000,0.165,0.199
stand scaler__Age,0.5377,0.020,26.582,0.000,0.498,0.577
stand scaler__final weight,0.0784,0.019,4.203,0.000,0.042,0.115
stand scaler__Capital Gain,2.4706,0.089,27.823,0.000,2.297,2.645
stand scaler__Capital Loss,0.2671,0.016,17.103,0.000,0.236,0.298
stand scaler__Hours per week,0.3940,0.020,19.398,0.000,0.354,0.434
onehot__Workclass_ Local-gov,-1.2216,0.106,-11.545,0.000,-1.429,-1.014
onehot__Workclass_ Private,-1.1765,0.085,-13.806,0.000,-1.344,-1.010
onehot__Workclass_ Self-emp-inc,-0.8599,0.121,-7.130,0.000,-1.096,-0.624


This approach to data preprocessing gave us the best result so far. We saved computational complexity while redused the dimentionality, but the performance stayed. However, Sex feature became insignifficant. It's still not a good model though
# 5th model
### Let's try to apply ln() function to 'Age', 'Capital Gain' and 'Capital Loss' festures (as they are heavy tailed) before Standard Scaler to normalize it

In [29]:
data_logged = data_train.copy()
data_logged = data_logged.dropna(how='any')

In [30]:
data_logged['Capital Gain'] = np.log(1+ data_logged['Capital Gain'])
data_logged['Capital Loss'] = np.log(1+ data_logged['Capital Loss'])
data_logged['Age'] = np.log(data_logged['Age'])

data_logged = cluster_categorical(data_logged)

In [31]:
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss', 'Hours per week']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [32]:
data_logged_and_preprocessed = preprocess_data(data_logged,
                                               TARGET, 
                                               numerical_features_list, 
                                               categorical_features_list, 
                                               ordinal_feature= 'Education', 
                                               order_of_categories=[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                               ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                               ' Bachelors',' Masters',' Prof-school',' Doctorate'])
get_LR_performance(data_logged_and_preprocessed)

f1 score: mean = 0.73 | std = 0.0
              precision    recall  f1-score   support

           0       0.85      0.93      0.89     18098
           1       0.71      0.50      0.58      6008

    accuracy                           0.82     24106
   macro avg       0.78      0.71      0.74     24106
weighted avg       0.81      0.82      0.81     24106



In [33]:
summary = logit_summary(data_logged_and_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.4002461758031343
            Iterations: 239
            Function evaluations: 242
            Gradient evaluations: 239


0,1,2,3
Dep. Variable:,Income,No. Observations:,24106.0
Model:,Logit,Df Residuals:,24077.0
Method:,MLE,Df Model:,28.0
Date:,"Sun, 16 Jul 2023",Pseudo R-squ.:,0.2872
Time:,18:38:46,Log-Likelihood:,-9648.3
converged:,True,LL-Null:,-13535.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.1799,0.009,21.013,0.000,0.163,0.197
stand scaler__Age,0.6745,0.022,30.346,0.000,0.631,0.718
stand scaler__Capital Gain,0.4968,0.017,29.704,0.000,0.464,0.530
stand scaler__Capital Loss,0.2448,0.016,15.788,0.000,0.214,0.275
stand scaler__Hours per week,0.3751,0.020,18.695,0.000,0.336,0.414
onehot__Workclass_ Local-gov,-1.2802,0.106,-12.128,0.000,-1.487,-1.073
onehot__Workclass_ Private,-1.1767,0.085,-13.800,0.000,-1.344,-1.010
onehot__Workclass_ Self-emp-inc,-0.8548,0.119,-7.169,0.000,-1.089,-0.621
onehot__Workclass_ Self-emp-not-inc,-1.5034,0.105,-14.369,0.000,-1.708,-1.298


Not better either.

# 6th model
### Another try is to cluster 'Hours per week' feature to part-time, fulltime and overtime workers with fulltime value for 40 hours

In [34]:
data_hpw = data_train.copy()
data_hpw = data_hpw.dropna(how='any')
data_hpw['Hours per week'] = np.where(data_hpw['Hours per week'] == 40, 'fulltime', 
                                   (np.where(data_hpw['Hours per week'] < 40, 'part-time', 'overtime')))

data_hpw = cluster_categorical(data_hpw)

In [35]:
data_hpw.head()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
25687,70,Private,195739,10th,Single,Craft-repair,Not-in-Family,White,Male,0,0,overtime,Developing,<=50K
6391,41,State-gov,47170,Bachelors,Single,Prof-specialty,Not-in-Family,Amer-Indian-Eskimo,Female,0,0,overtime,Developing,>50K
8386,29,Private,204862,Assoc-acdm,Single,Prof-specialty,Not-in-Family,White,Female,0,0,part-time,Developing,<=50K
16224,28,Self-emp-not-inc,54683,Some-college,Single,Craft-repair,Not-in-Family,White,Male,0,1590,fulltime,Developing,<=50K
18011,44,Local-gov,387770,Some-college,Single,Adm-clerical,Not-in-Family,White,Female,0,0,part-time,Developing,<=50K


In [36]:
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss']
categorical_features_list = ['Hours per week', 'Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [37]:
data_hpw_preprocessed = preprocess_data(data_hpw,
                                        TARGET, 
                                        numerical_features_list, 
                                        categorical_features_list, 
                                        ordinal_feature= 'Education', 
                                        order_of_categories=[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                            ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                            ' Bachelors',' Masters',' Prof-school',' Doctorate'])
get_LR_performance(data_hpw_preprocessed)

f1 score: mean = 0.74 | std = 0.0
              precision    recall  f1-score   support

           0       0.85      0.94      0.89     18098
           1       0.73      0.50      0.59      6008

    accuracy                           0.83     24106
   macro avg       0.79      0.72      0.74     24106
weighted avg       0.82      0.83      0.82     24106



In [38]:
summary = logit_summary(data_hpw_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3906131745955655
            Iterations: 226
            Function evaluations: 228
            Gradient evaluations: 226


0,1,2,3
Dep. Variable:,Income,No. Observations:,24106.0
Model:,Logit,Df Residuals:,24076.0
Method:,MLE,Df Model:,29.0
Date:,"Sun, 16 Jul 2023",Pseudo R-squ.:,0.3043
Time:,18:38:56,Log-Likelihood:,-9416.1
converged:,True,LL-Null:,-13535.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.1791,0.009,20.584,0.000,0.162,0.196
stand scaler__Age,0.5361,0.020,26.418,0.000,0.496,0.576
stand scaler__Capital Gain,2.4669,0.089,27.687,0.000,2.292,2.642
stand scaler__Capital Loss,0.2652,0.016,16.948,0.000,0.234,0.296
onehot__Hours per week_overtime,0.4930,0.041,12.094,0.000,0.413,0.573
onehot__Hours per week_part-time,-0.8115,0.062,-13.022,0.000,-0.934,-0.689
onehot__Workclass_ Local-gov,-1.1864,0.106,-11.170,0.000,-1.395,-0.978
onehot__Workclass_ Private,-1.1516,0.086,-13.448,0.000,-1.319,-0.984
onehot__Workclass_ Self-emp-inc,-0.7559,0.121,-6.256,0.000,-0.993,-0.519


Hours per week work great like this, both features are signifficant, but it did not affect model's accuracy and Sex feature became even more insignifficant compared to the previous model.

# 7th model
### Lets now try to cluster all minority categories of imbalanced features together

In [39]:
data_cluster_imbalanced = data_train.copy()

def balance_predictors(data):
    data['Workclass'] = np.where(data['Workclass'] != ' Private', 'Other', data['Workclass'])
    
    data.loc[
        lambda x: x["Marital Status"].isin([' Widowed', ' Separated', ' Married-spouse-absent', ' Never-married', ' Divorced']), "Marital Status"
    ] = "Single" 

    data.loc[
        lambda x: x["Occupation"].isin([' Craft-repair', ' Other-service', ' Priv-house-serv', ' Protective-serv']), "Occupation"
    ] = "Services"

    data.loc[
        lambda x: x["Relationship"].isin([' Husband', ' Wife', ' Own-child']), "Relationship"
    ] = "Family"
    data.loc[
        lambda x: x["Relationship"].isin([' Not-in-family', ' Unmarried', ' Other-relative']), "Relationship"
    ] = "Not-in-Family"
    
    data['Ethnic group'] = np.where(data['Ethnic group'] != ' White', 'Other', data['Ethnic group'])
    data['Hours per week'] = np.where(data['Hours per week'] == 40, 'fulltime', (np.where(data['Hours per week'] < 40, 'part-time', 'overtime')))
    data['Country'] = np.where(data['Country'] != ' United-States', 'Other', data['Country'])
    
balance_predictors(data_cluster_imbalanced)
data_cluster_imbalanced.sample(3)

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
28680,45,Private,306889,Bachelors,Married-civ-spouse,Exec-managerial,Family,White,Female,0,0,fulltime,United-States,>50K
2457,39,Private,105813,HS-grad,Married-civ-spouse,Services,Family,White,Male,0,0,fulltime,United-States,<=50K
15283,54,Other,109413,Bachelors,Married-civ-spouse,Farming-fishing,Family,White,Male,0,0,overtime,United-States,<=50K


In [40]:
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex',  'Hours per week', 'Country']

In [41]:
data_cluster_imbalanced_preprocessed = preprocess_data(data_cluster_imbalanced,
                                                        TARGET, 
                                                        numerical_features_list, 
                                                        categorical_features_list, 
                                                        ordinal_feature= 'Education', 
                                                        order_of_categories=[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                                            ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                                            ' Bachelors',' Masters',' Prof-school',' Doctorate'])
get_LR_performance(data_cluster_imbalanced_preprocessed)

f1 score: mean = 0.78 | std = 0.0
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     19777
           1       0.73      0.59      0.65      6271

    accuracy                           0.85     26048
   macro avg       0.81      0.76      0.78     26048
weighted avg       0.84      0.85      0.84     26048



In [42]:
summary = logit_summary(data_cluster_imbalanced_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3246501963933025
            Iterations: 177
            Function evaluations: 179
            Gradient evaluations: 177


0,1,2,3
Dep. Variable:,Income,No. Observations:,26048.0
Model:,Logit,Df Residuals:,26024.0
Method:,MLE,Df Model:,23.0
Date:,"Sun, 16 Jul 2023",Pseudo R-squ.:,0.4118
Time:,18:39:03,Log-Likelihood:,-8456.5
converged:,True,LL-Null:,-14377.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2832,0.010,28.366,0.000,0.264,0.303
stand scaler__Age,0.3862,0.023,16.669,0.000,0.341,0.432
stand scaler__Capital Gain,2.3560,0.085,27.860,0.000,2.190,2.522
stand scaler__Capital Loss,0.2557,0.016,15.523,0.000,0.223,0.288
onehot__Workclass_Other,-0.0561,0.044,-1.266,0.206,-0.143,0.031
onehot__Marital Status_ Married-civ-spouse,-3.0054,0.120,-25.043,0.000,-3.241,-2.770
onehot__Marital Status_Single,-6.0614,0.168,-36.062,0.000,-6.391,-5.732
onehot__Occupation_ Armed-Forces,-1.0523,1.407,-0.748,0.455,-3.811,1.706
onehot__Occupation_ Exec-managerial,0.5901,0.082,7.183,0.000,0.429,0.751


**So far, we have checked 7 models, none of them gave any performance improvement, although we reached faster convergence. Let's now try to build the last model, that would contain all best ideas of 7 models above:**

# Final Model

In [43]:
data_final = data_train.copy()
data_final = data_final.dropna(how='any')

In [44]:
data_final['Occupation'].value_counts()

Occupation
 Prof-specialty       3225
 Craft-repair         3216
 Exec-managerial      3207
 Adm-clerical         2955
 Sales                2856
 Other-service        2567
 Machine-op-inspct    1586
 Transport-moving     1258
 Handlers-cleaners    1080
 Farming-fishing       778
 Tech-support          736
 Protective-serv       515
 Priv-house-serv       120
 Armed-Forces            7
Name: count, dtype: int64

In [45]:
data_final['Hours per week'] = np.where(data_final['Hours per week'] == 40, 'fulltime', 
                                   (np.where(data_final['Hours per week'] < 40, 'part-time', 'overtime')))

In [46]:
data_final = cluster_categorical(data_final)
data_final = data_final[~data_final['Workclass'].isin([' Never-worked', ' Without-pay'])]
data_final = data_final[~data_final['Occupation'].isin([' Armed-Forces'])]
data_final.loc[
        lambda x: x["Occupation"].isin([' Craft-repair', ' Other-service', ' Priv-house-serv', ' Protective-serv']), "Occupation"
    ] = "Services"

data_final.head()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
25687,70,Private,195739,10th,Single,Services,Not-in-Family,White,Male,0,0,overtime,Developing,<=50K
6391,41,State-gov,47170,Bachelors,Single,Prof-specialty,Not-in-Family,Amer-Indian-Eskimo,Female,0,0,overtime,Developing,>50K
8386,29,Private,204862,Assoc-acdm,Single,Prof-specialty,Not-in-Family,White,Female,0,0,part-time,Developing,<=50K
16224,28,Self-emp-not-inc,54683,Some-college,Single,Services,Not-in-Family,White,Male,0,1590,fulltime,Developing,<=50K
18011,44,Local-gov,387770,Some-college,Single,Adm-clerical,Not-in-Family,White,Female,0,0,part-time,Developing,<=50K


In [47]:
numerical_features_list = ['Age', 'final weight', 'Capital Gain', 'Capital Loss']
categorical_features_list = ['Workclass', 'Marital Status',  'Occupation',
                             'Relationship', 'Ethnic group', 'Hours per week', 'Country']

In [48]:
data_final_preprocessed = preprocess_data(data_final, 
                                            TARGET, 
                                            numerical_features_list, 
                                            categorical_features_list, 
                                            ordinal_feature= 'Education', 
                                            order_of_categories=[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                                ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                                ' Bachelors',' Masters',' Prof-school',' Doctorate'])
get_LR_performance(data_final_preprocessed)

f1 score: mean = 0.72 | std = 0.01
              precision    recall  f1-score   support

           0       0.84      0.94      0.89     18079
           1       0.72      0.46      0.56      6007

    accuracy                           0.82     24086
   macro avg       0.78      0.70      0.72     24086
weighted avg       0.81      0.82      0.81     24086



In [49]:
summary = logit_summary(data_final_preprocessed)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.4047791455965524
            Iterations: 140
            Function evaluations: 142
            Gradient evaluations: 140


0,1,2,3
Dep. Variable:,Income,No. Observations:,24086.0
Model:,Logit,Df Residuals:,24061.0
Method:,MLE,Df Model:,24.0
Date:,"Sun, 16 Jul 2023",Pseudo R-squ.:,0.2793
Time:,18:39:12,Log-Likelihood:,-9749.5
converged:,True,LL-Null:,-13528.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.1980,0.009,23.140,0.000,0.181,0.215
stand scaler__Age,0.5571,0.020,28.031,0.000,0.518,0.596
stand scaler__final weight,0.0948,0.018,5.184,0.000,0.059,0.131
stand scaler__Capital Gain,2.5070,0.089,28.166,0.000,2.333,2.682
stand scaler__Capital Loss,0.2729,0.015,17.741,0.000,0.243,0.303
onehot__Workclass_ Local-gov,-1.2027,0.103,-11.643,0.000,-1.405,-1.000
onehot__Workclass_ Private,-1.1874,0.084,-14.173,0.000,-1.352,-1.023
onehot__Workclass_ Self-emp-inc,-0.6410,0.119,-5.397,0.000,-0.874,-0.408
onehot__Workclass_ Self-emp-not-inc,-1.2972,0.105,-12.412,0.000,-1.502,-1.092


#### Even though the model performance has not improved compared to initial model, we were able to the reduce the number of iterations till convergence twice. Also we've build a model, where all features are statistically signifficant

### List of changes to initial data:
##### 1. Remove all missing data
##### 2. Remove classes, that are poorly represented, including Workclass: 'Never worked' and 'Without pay'; Occupeation: 'Armed Forces'
##### 3. Clustered most of categorical features to reduce dimentionality (from **83** in the initial model to **28** in the final model)