In [154]:
import pandas as pd
import numpy as np
import scipy.sparse as sps
from statsmodels.api import Logit
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.linear_model import LogisticRegression
import sklearn

Here the results from the 'Adult EDA' file are going to be used.

In [155]:
# Column names for the header-less "adult.data" file, in file order.
adult_columns = [
    "Age", "Workclass", "final weight", "Education", "Education-Num",
    "Marital Status", "Occupation", "Relationship", "Ethnic group", "Sex",
    "Capital Gain", "Capital Loss", "Hours per week", "Country", "Income",
]

# Read the raw file and turn the ' ?' placeholder into proper missing values.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    names=adult_columns,
).replace(to_replace=' ?', value=np.nan)

In [156]:
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and the drop becomes a no-op instead of raising KeyError.
df = df.drop(columns=['Education-Num'], errors='ignore')

In [157]:
# Categorical columns whose value frequencies are inspected below.
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [158]:
# Print the frequency table of every categorical feature.
for feature in categorical_features_list:
    counts = df[feature].value_counts()
    print(f'{feature}\n{counts}\n')

Workclass
Workclass
 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: count, dtype: int64

Marital Status
Marital Status
 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: count, dtype: int64

Occupation
Occupation
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: count, dtype: int64

Relationship
Relationship
 Hu

Because the whole dataset (X) contains only one record with the 'Holand-Netherlands' value in the 'Country' column, it has to be processed separately: if that record appeared in the test set, the model would not be able to predict the target for it. For the initial model, where the data are otherwise unchanged, this observation is removed.

In [159]:
print(df.shape)
# Keep every row except the single ' Holand-Netherlands' record
# (rows with a missing Country are kept as well).
is_netherlands = df['Country'] == ' Holand-Netherlands'
df_no_nl = df.loc[~is_netherlands].copy()
print(df_no_nl.shape)

(32561, 14)
(32560, 14)


In [160]:
# Features are everything except the target column; the target is 'Income'.
X, y = df_no_nl.drop(columns='Income'), df_no_nl['Income']

In [161]:
# Split from df_no_nl rather than from X/y themselves, so that re-running this
# cell does not split the already-reduced training set a second time.
# The fixed random_state makes the split (and every result below) reproducible.
X, X_test, y, y_test = train_test_split(
    df_no_nl.drop(columns=['Income']),
    df_no_nl['Income'],
    test_size=0.2,
    random_state=42,
)

## 1. Features preprocessing

First, all variables have to be transformed to numerical format to feed them to LogisticRegression function:

In [162]:
X_train, y_train = X.copy(), y.copy()

# Put the target next to its features (target column first), matched on the row index.
data_train = y_train.to_frame().merge(X_train, left_index=True, right_index=True)
data_train.shape

(26048, 14)

In [163]:
data_train.head()

Unnamed: 0,Income,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
6293,<=50K,37,Private,333651,HS-grad,Never-married,Adm-clerical,Not-in-family,White,Male,0,0,42,United-States
17719,<=50K,22,Private,237720,HS-grad,Never-married,Craft-repair,Own-child,White,Male,0,0,40,United-States
18084,<=50K,34,Private,113198,Assoc-acdm,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,28,United-States
22008,>50K,42,Private,303155,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States
23423,>50K,30,Private,19302,Some-college,Married-civ-spouse,Prof-specialty,Husband,White,Male,7688,0,40,United-States


In this dataset there is only one feature where the order matters - Education - so it will be transformed using OrdinalEncoder. For the rest of the categorical features the order does not matter, hence we can apply OneHotEncoder() to them.

# 1st model 
### Initial model without changes in data

In [164]:
# A function, that performs all needed data preparation and feeds it to LogisticRegression

def get_LR_performance(data, numerical_features_list: list, categorical_features_list:list):
    """Fit a preprocessing + LogisticRegression pipeline and print its performance.

    'Education' is ordinal-encoded in increasing order of attainment, the numerical
    features are standardised and the categorical features are one-hot encoded with
    the first category of each feature dropped. Columns that are not listed are dropped.

    Two results are printed:
      * mean and standard deviation of the 5-fold cross-validated macro F1 score;
      * a classification report computed on the same data the model was fitted on
        (an in-sample report).

    Args:
        data: full dataset with the features and the 'Income' target column
        numerical_features_list (list): list of features, that have to be processed by Standard scaler
        categorical_features_list (list): list of features, that have to be processed by OneHotEncoder

    Returns:
        None. The results are printed.
    """
    # Education levels from lowest to highest; this fixes the ordinal codes.
    education_order = [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th', ' 11th',
                       ' 12th', ' HS-grad', ' Some-college', ' Assoc-voc', ' Assoc-acdm',
                       ' Bachelors', ' Masters', ' Prof-school', ' Doctorate']

    X = data.drop(columns=['Income'])
    y = data["Income"]

    columntransformer = ColumnTransformer(transformers = [
        # The column is named explicitly: a regex selector on 'Education' would also
        # pick up any other column whose name contains that word.
        ('ordinal', OrdinalEncoder(categories=[education_order]), ['Education']),
        ('stand scaler', StandardScaler(), numerical_features_list),
        ('onehot', OneHotEncoder(dtype='int', drop='first'), categorical_features_list)],
        remainder='drop')

    pipe = make_pipeline(columntransformer, LogisticRegression(max_iter=10000)).fit(X, y)

    y_pred = pipe.predict(X)

    # cross_val_score refits clones of the pipeline on every fold.
    scores = cross_val_score(pipe, X, y, cv=5, scoring='f1_macro')
    f1_mean_score = round(np.mean(scores),2)
    f1_std = round(np.std(scores),2)

    # Take the row labels from the fitted classifier so that each name is tied to
    # its own class. data['Income'].unique() is ordered by first appearance and
    # would swap the row names whenever the first record is not the smallest label.
    class_labels = pipe.classes_
    report = classification_report(
        y, y_pred,
        labels=class_labels,
        target_names=[str(label) for label in class_labels],
    )

    print(f'f1 score: mean = {f1_mean_score} | std = {f1_std}')
    print(report)


In [165]:
# Feature sets for the baseline model: every available predictor is used.
numerical_features_list = [
    'Age',
    'final weight',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [166]:
get_LR_performance(
    data=data_train,
    numerical_features_list=numerical_features_list,
    categorical_features_list=categorical_features_list,
)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90     19773
        >50K       0.73      0.60      0.66      6275

    accuracy                           0.85     26048
   macro avg       0.81      0.77      0.78     26048
weighted avg       0.85      0.85      0.85     26048



### Let's now understand the significance of the features with the help of the Logit() function from statsmodels

In [167]:
def logit_summary(data, numerical_features: list, categorical_features: list,
                  add_intercept: bool = False):
    """Preprocess the data, fit a statsmodels Logit() model and return its summary.

    The features are encoded exactly as in the scikit-learn pipeline: 'Education' is
    ordinal-encoded, numerical features are standardised and categorical features are
    one-hot encoded with the first category dropped. The 'Income' target is one-hot
    encoded with its first category dropped, leaving a single 0/1 column.

    Args:
        data: full dataset with the features and the 'Income' target column
        numerical_features (list): list of features, that have to be processed by Standard scaler
        categorical_features (list): list of features, that have to be processed by OneHotEncoder
        add_intercept (bool): if True, a constant column named 'const' is added to the
            design matrix. Logit() does not add an intercept on its own; the default
            (False) keeps the original intercept-free model.

    Returns:
        Summary: summary of statsmodel Logit() model with the help of which the decision about
                keeping or modifying/removing a feature can be made
    """
    # Education levels from lowest to highest; this fixes the ordinal codes.
    education_order = [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th', ' 11th',
                       ' 12th', ' HS-grad', ' Some-college', ' Assoc-voc', ' Assoc-acdm',
                       ' Bachelors', ' Masters', ' Prof-school', ' Doctorate']

    X = data.drop(columns=['Income'])
    y = data["Income"]

    column_transformer = ColumnTransformer(transformers = [
        # The column is named explicitly: a regex selector on 'Education' would also
        # pick up any other column whose name contains that word.
        ('ordinal', OrdinalEncoder(categories=[education_order]), ['Education']),
        ('stand_scaler', StandardScaler(), numerical_features),
        ('onehot', OneHotEncoder(dtype='int', drop='first'), categorical_features)],
        remainder='drop')

    X_trans = column_transformer.fit_transform(X)

    # The transformer may return a sparse matrix; Logit needs a dense design matrix.
    if sps.issparse(X_trans):
        X_trans = X_trans.toarray()

    x_columns_names = column_transformer.get_feature_names_out()
    X_trans = pd.DataFrame(X_trans, columns = x_columns_names)

    if add_intercept:
        X_trans.insert(0, 'const', 1.0)

    # Encode the target as a dense 0/1 column whichever output format the encoder uses.
    target_encoder = OneHotEncoder(dtype='int', drop='first')
    y_trans = target_encoder.fit_transform(pd.DataFrame(y))
    if sps.issparse(y_trans):
        y_trans = y_trans.toarray()
    y_trans = pd.DataFrame(y_trans, columns=target_encoder.get_feature_names_out())

    model = Logit(y_trans, X_trans).fit_regularized()
    summary = model.summary()

    return summary

In [168]:
summary = logit_summary(
    data=data_train,
    numerical_features=numerical_features_list,
    categorical_features=categorical_features_list,
)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.31807119530182004
            Iterations: 600
            Function evaluations: 602
            Gradient evaluations: 600


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,26048.0
Model:,Logit,Df Residuals:,25964.0
Method:,MLE,Df Model:,83.0
Date:,"Fri, 28 Apr 2023",Pseudo R-squ.:,0.4239
Time:,17:29:01,Log-Likelihood:,-8285.1
converged:,True,LL-Null:,-14382.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2677,0.010,26.248,0.000,0.248,0.288
stand_scaler__Age,0.3516,0.025,14.179,0.000,0.303,0.400
stand_scaler__final weight,0.0660,0.020,3.266,0.001,0.026,0.106
stand_scaler__Capital Gain,2.2751,0.082,27.634,0.000,2.114,2.437
stand_scaler__Capital Loss,0.2546,0.017,15.267,0.000,0.222,0.287
stand_scaler__Hours per week,0.3868,0.022,17.360,0.000,0.343,0.430
onehot__Workclass_ Local-gov,-0.8203,0.125,-6.582,0.000,-1.065,-0.576
onehot__Workclass_ Never-worked,-1.6430,4.75e+06,-3.46e-07,1.000,-9.32e+06,9.32e+06
onehot__Workclass_ Private,-0.6032,0.104,-5.827,0.000,-0.806,-0.400


# 2nd model
### Same model, but without 'final weight'

As we remember from the EDA, the **'final weight'** feature did not pass the significance threshold. Let's try to remove it and check the performance

In [169]:
# Feature sets for the 2nd model: same as the baseline, minus 'final weight'.
numerical_features_list = [
    'Age',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [170]:
get_LR_performance(
    data=data_train,
    numerical_features_list=numerical_features_list,
    categorical_features_list=categorical_features_list,
)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90     19773
        >50K       0.73      0.60      0.66      6275

    accuracy                           0.85     26048
   macro avg       0.81      0.77      0.78     26048
weighted avg       0.84      0.85      0.85     26048



#### Performance in general has not changed, let's check if the features' significances have changed

In [171]:
summary = logit_summary(
    data=data_train,
    numerical_features=numerical_features_list,
    categorical_features=categorical_features_list,
)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3182747664278952
            Iterations: 612
            Function evaluations: 615
            Gradient evaluations: 612


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,26048.0
Model:,Logit,Df Residuals:,25965.0
Method:,MLE,Df Model:,82.0
Date:,"Fri, 28 Apr 2023",Pseudo R-squ.:,0.4235
Time:,17:29:08,Log-Likelihood:,-8290.4
converged:,True,LL-Null:,-14382.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2676,0.010,26.238,0.000,0.248,0.288
stand_scaler__Age,0.3473,0.025,14.033,0.000,0.299,0.396
stand_scaler__Capital Gain,2.2785,0.082,27.682,0.000,2.117,2.440
stand_scaler__Capital Loss,0.2540,0.017,15.245,0.000,0.221,0.287
stand_scaler__Hours per week,0.3855,0.022,17.308,0.000,0.342,0.429
onehot__Workclass_ Local-gov,-0.8170,0.125,-6.562,0.000,-1.061,-0.573
onehot__Workclass_ Never-worked,-1.6869,,,,,
onehot__Workclass_ Private,-0.5981,0.103,-5.784,0.000,-0.801,-0.395
onehot__Workclass_ Self-emp-inc,-0.4797,0.136,-3.540,0.000,-0.745,-0.214


According to the Logit() results, all of the numerical features are statistically significant. Some categories in a couple of the categorical features have to be clustered, as they are insignificant.

Assumption 1. Workclasses representatives, that do not work or work without pay will have less than 50k, so can become one cluster.

Assumption 2. Single people tend to earn more, as they have more free time for career development; so the values of the Marital Status feature can be clustered into Single and Married

Assumption 3. Occupation has no impact on Income, as all of its categories are insignificant, so it could be removed from the model. For now, however, it is left as it is, because in the EDA we saw that this feature is significant

Assumption 4. All categories of Relationship and Sex features are significant.

Assumption 5. Most of the Ethnic Groups have no impact on the target; it's possible to cluster them according to their imbalance: white and others

# 3rd model
### Clustering categories of features


In [273]:
data_clustered = data_train.copy()

def cluster_categorical(data):
    """Merge fine-grained categories of several categorical features into coarse groups.

    The input frame is left untouched; a clustered copy is returned.

    Groups applied:
      * Workclass: ' Never-worked' is merged into ' Without-pay'
      * Marital Status: 'Single' / 'Married'
      * Relationship: 'Family' / 'Not-in-Family'
      * Country: 'Developed' / 'Developing'

    Values that belong to no group (including missing values) are kept as they are.

    Args:
        data: DataFrame containing the 'Workclass', 'Marital Status', 'Relationship'
            and 'Country' columns

    Returns:
        A new DataFrame with the clustered categories.
    """
    # column -> {new label: [original values that are replaced by it]}
    groups_by_column = {
        'Workclass': {
            ' Without-pay': [' Never-worked'],
        },
        'Marital Status': {
            'Single': [' Widowed', ' Separated', ' Married-spouse-absent', ' Never-married', ' Divorced'],
            'Married': [' Married-AF-spouse', ' Married-civ-spouse'],
        },
        'Relationship': {
            'Family': [' Husband', ' Wife', ' Own-child'],
            'Not-in-Family': [' Not-in-family', ' Unmarried', ' Other-relative'],
        },
        'Country': {
            'Developed': [' Holand-Netherlands', ' Scotland', ' Italy', ' England', ' Ireland', ' Germany',
                          ' Hong', ' France', ' Taiwan', ' Japan', ' Puerto-Rico', ' Canada', ' United-States'],
            'Developing': [' Hungary', ' Greece', ' Portugal', ' Poland', ' Yugoslavia', ' Cambodia', ' Iran',
                           ' Philippines', ' Laos', ' Thailand', ' Vietnam', ' South', ' China', ' India',
                           ' Honduras', ' Outlying-US(Guam-USVI-etc)', ' Trinadad&Tobago', ' Ecuador',
                           ' Nicaragua', ' Peru', ' Haiti', ' Columbia', ' Guatemala', ' Dominican-Republic',
                           ' Jamaica', ' Cuba', ' El-Salvador', ' Mexico'],
        },
    }

    # Work on a copy so the caller's frame is never modified behind its back.
    clustered = data.copy()

    for column, groups in groups_by_column.items():
        for label, members in groups.items():
            clustered.loc[clustered[column].isin(members), column] = label

    return clustered

data_clustered = cluster_categorical(data_clustered)

In [274]:
data_clustered.head()

Unnamed: 0,Income,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
6293,<=50K,37,Private,333651,HS-grad,Single,Adm-clerical,Not-in-Family,White,Male,0,0,42,Developed
17719,<=50K,22,Private,237720,HS-grad,Single,Craft-repair,Family,White,Male,0,0,40,Developed
18084,<=50K,34,Private,113198,Assoc-acdm,Married,Adm-clerical,Family,White,Male,0,0,28,Developed
22008,>50K,42,Private,303155,Bachelors,Married,Exec-managerial,Family,White,Male,0,0,50,Developed
23423,>50K,30,Private,19302,Some-college,Married,Prof-specialty,Family,White,Male,7688,0,40,Developed


Let's now apply the pipeline to updated dataset

In [275]:
# Feature sets for the 3rd model (clustered categories, no 'final weight').
numerical_features_list = [
    'Age',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [271]:
get_LR_performance(
    data=data_clustered,
    numerical_features_list=numerical_features_list,
    categorical_features_list=categorical_features_list,
)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90     19773
        >50K       0.73      0.60      0.66      6275

    accuracy                           0.85     26048
   macro avg       0.81      0.76      0.78     26048
weighted avg       0.84      0.85      0.84     26048



In [272]:
summary = logit_summary(
    data=data_clustered,
    numerical_features=numerical_features_list,
    categorical_features=categorical_features_list,
)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.32603034970043704
            Iterations: 275
            Function evaluations: 277
            Gradient evaluations: 275


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,26048.0
Model:,Logit,Df Residuals:,26013.0
Method:,MLE,Df Model:,34.0
Date:,"Fri, 28 Apr 2023",Pseudo R-squ.:,0.4095
Time:,18:11:11,Log-Likelihood:,-8492.4
converged:,True,LL-Null:,-14382.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2335,0.009,25.247,0.000,0.215,0.252
stand_scaler__Age,0.3645,0.023,15.772,0.000,0.319,0.410
stand_scaler__Capital Gain,2.2941,0.082,27.984,0.000,2.133,2.455
stand_scaler__Capital Loss,0.2614,0.017,15.804,0.000,0.229,0.294
stand_scaler__Hours per week,0.3857,0.022,17.590,0.000,0.343,0.429
onehot__Workclass_ Local-gov,-1.1549,0.116,-9.932,0.000,-1.383,-0.927
onehot__Workclass_ Private,-0.9503,0.094,-10.091,0.000,-1.135,-0.766
onehot__Workclass_ Self-emp-inc,-0.7983,0.129,-6.172,0.000,-1.052,-0.545
onehot__Workclass_ Self-emp-not-inc,-1.4344,0.114,-12.605,0.000,-1.657,-1.211


1. Workclass 'Without pay' is still insignificant; we will try to remove these instances (there is a small number of them)
2. Some Occupations are insignificant
3. Relationships became significant
4. Sex became insignificant
5. All NaNs are insignificant

Let's check the unique values of Occupation:

In [261]:
data_clustered['Occupation'].value_counts()

Occupation
 Prof-specialty       3307
 Craft-repair         3291
 Exec-managerial      3239
 Adm-clerical         3027
 Sales                2910
 Other-service        2658
 Machine-op-inspct    1599
 Transport-moving     1286
 Handlers-cleaners    1085
 Farming-fishing       804
 Tech-support          727
 Protective-serv       517
 Priv-house-serv       118
 Armed-Forces            4
Name: count, dtype: int64

Armed Forces is the smallest group, but it is the most insignificant one in all configurations of the model. Let's try to remove it later

# 4th model

Let's try to remove missing data

In [182]:
# Training data without any row that has a missing value (dropna returns a new frame).
data_no_nan = data_train.dropna(how='any')

In [183]:
data_no_nan.shape

(24116, 14)

In [184]:
data_no_nan_clustered = cluster_categorical(data=data_no_nan)
data_no_nan_clustered


Unnamed: 0,Income,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
6293,<=50K,37,Private,333651,HS-grad,Single,Adm-clerical,Not-in-Family,White,Male,0,0,42,Developed
17719,<=50K,22,Private,237720,HS-grad,Single,Craft-repair,Family,White,Male,0,0,40,Developed
18084,<=50K,34,Private,113198,Assoc-acdm,Married,Adm-clerical,Family,White,Male,0,0,28,Developed
22008,>50K,42,Private,303155,Bachelors,Married,Exec-managerial,Family,White,Male,0,0,50,Developed
23423,>50K,30,Private,19302,Some-college,Married,Prof-specialty,Family,White,Male,7688,0,40,Developed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3458,<=50K,35,Private,465326,Bachelors,Married,Sales,Family,White,Male,0,0,40,Developed
25290,>50K,51,Self-emp-not-inc,124963,Prof-school,Single,Prof-specialty,Not-in-Family,White,Male,0,0,80,Developed
4066,<=50K,36,Self-emp-not-inc,288585,HS-grad,Married,Other-service,Family,Asian-Pac-Islander,Female,0,0,20,Developing
2280,<=50K,21,State-gov,258490,Some-college,Single,Prof-specialty,Not-in-Family,White,Female,0,0,20,Developed


In [185]:
# Feature sets for the 4th model (rows with missing values removed).
numerical_features_list = [
    'Age',
    'final weight',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [186]:
get_LR_performance(
    data=data_no_nan_clustered,
    numerical_features_list=numerical_features_list,
    categorical_features_list=categorical_features_list,
)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90     18108
        >50K       0.74      0.60      0.66      6008

    accuracy                           0.85     24116
   macro avg       0.81      0.77      0.78     24116
weighted avg       0.84      0.85      0.84     24116



In [187]:
summary = logit_summary(
    data=data_no_nan_clustered,
    numerical_features=numerical_features_list,
    categorical_features=categorical_features_list,
)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3319067114184243
            Iterations: 272
            Function evaluations: 275
            Gradient evaluations: 272


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,24116.0
Model:,Logit,Df Residuals:,24082.0
Method:,MLE,Df Model:,33.0
Date:,"Fri, 28 Apr 2023",Pseudo R-squ.:,0.4088
Time:,17:29:16,Log-Likelihood:,-8004.3
converged:,True,LL-Null:,-13538.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2464,0.010,24.375,0.000,0.227,0.266
stand_scaler__Age,0.3540,0.023,15.163,0.000,0.308,0.400
stand_scaler__final weight,0.0753,0.020,3.689,0.000,0.035,0.115
stand_scaler__Capital Gain,2.2921,0.085,27.076,0.000,2.126,2.458
stand_scaler__Capital Loss,0.2608,0.017,15.138,0.000,0.227,0.295
stand_scaler__Hours per week,0.3676,0.022,16.381,0.000,0.324,0.412
onehot__Workclass_ Local-gov,-0.9913,0.121,-8.214,0.000,-1.228,-0.755
onehot__Workclass_ Private,-0.7765,0.099,-7.829,0.000,-0.971,-0.582
onehot__Workclass_ Self-emp-inc,-0.6258,0.134,-4.676,0.000,-0.888,-0.363


This approach to data preprocessing gave us the best result so far: we reduced the dimensionality, and with it the computational complexity, while the performance stayed the same. It's still not a good model, though.
# 5th model
### Let's try to apply the ln() function to the 'Age', 'Capital Gain' and 'Capital Loss' features (as they are heavy-tailed) before the Standard Scaler to normalize them

In [188]:
# Start from the training data without rows that have a missing value (dropna returns a new frame).
data_logged = data_train.dropna(how='any')

In [189]:
# Capital columns contain zeros, so they are shifted by 1 before taking the logarithm.
for capital_column in ('Capital Gain', 'Capital Loss'):
    data_logged[capital_column] = np.log(1+ data_logged[capital_column])

# Age is logged without a shift.
data_logged['Age'] = np.log(data_logged['Age'])

data_logged = cluster_categorical(data_logged)

In [190]:
# Feature sets for the 5th model (log-transformed numerical features).
numerical_features_list = [
    'Age',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [191]:
get_LR_performance(
    data=data_logged,
    numerical_features_list=numerical_features_list,
    categorical_features_list=categorical_features_list,
)

f1 score: mean = 0.77 | std = 0.01
              precision    recall  f1-score   support

       <=50K       0.87      0.92      0.90     18108
        >50K       0.72      0.59      0.65      6008

    accuracy                           0.84     24116
   macro avg       0.79      0.76      0.77     24116
weighted avg       0.83      0.84      0.83     24116



In [192]:
summary = logit_summary(
    data=data_logged,
    numerical_features=numerical_features_list,
    categorical_features=categorical_features_list,
)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.34236877028818563
            Iterations: 264
            Function evaluations: 266
            Gradient evaluations: 264


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,24116.0
Model:,Logit,Df Residuals:,24083.0
Method:,MLE,Df Model:,32.0
Date:,"Fri, 28 Apr 2023",Pseudo R-squ.:,0.3901
Time:,17:29:19,Log-Likelihood:,-8256.6
converged:,True,LL-Null:,-13538.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2482,0.010,25.059,0.000,0.229,0.268
stand_scaler__Age,0.4680,0.026,18.182,0.000,0.418,0.518
stand_scaler__Capital Gain,0.5098,0.018,27.982,0.000,0.474,0.545
stand_scaler__Capital Loss,0.2404,0.017,14.013,0.000,0.207,0.274
stand_scaler__Hours per week,0.3617,0.022,16.433,0.000,0.319,0.405
onehot__Workclass_ Local-gov,-1.0203,0.119,-8.601,0.000,-1.253,-0.788
onehot__Workclass_ Private,-0.7446,0.097,-7.648,0.000,-0.935,-0.554
onehot__Workclass_ Self-emp-inc,-0.5888,0.131,-4.493,0.000,-0.846,-0.332
onehot__Workclass_ Self-emp-not-inc,-1.2354,0.116,-10.674,0.000,-1.462,-1.009


Not better either.
# 6th model
### Another try is to cluster 'Hours per week' feature to part-time, fulltime and overtime workers with fulltime value for 40 hours

In [193]:
# Training data without rows that have a missing value (dropna returns a new frame).
data_new = data_train.dropna(how='any')

# Bucket weekly hours: exactly 40 -> 'fulltime', below 40 -> 'part-time', anything else -> 'overtime'.
weekly_hours = data_new['Hours per week']
part_or_over = np.where(weekly_hours < 40, 'part-time', 'overtime')
data_new['Hours per week'] = np.where(weekly_hours == 40, 'fulltime', part_or_over)

data_new = cluster_categorical(data_new)

In [194]:
data_new.head()

Unnamed: 0,Income,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
6293,<=50K,37,Private,333651,HS-grad,Single,Adm-clerical,Not-in-Family,White,Male,0,0,overtime,Developed
17719,<=50K,22,Private,237720,HS-grad,Single,Craft-repair,Family,White,Male,0,0,fulltime,Developed
18084,<=50K,34,Private,113198,Assoc-acdm,Married,Adm-clerical,Family,White,Male,0,0,part-time,Developed
22008,>50K,42,Private,303155,Bachelors,Married,Exec-managerial,Family,White,Male,0,0,overtime,Developed
23423,>50K,30,Private,19302,Some-college,Married,Prof-specialty,Family,White,Male,7688,0,fulltime,Developed


In [195]:
# Feature sets for the 6th model: 'Hours per week' is now a categorical feature.
numerical_features_list = [
    'Age',
    'Capital Gain',
    'Capital Loss',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
    'Hours per week',
]

In [196]:
get_LR_performance(
    data=data_new,
    numerical_features_list=numerical_features_list,
    categorical_features_list=categorical_features_list,
)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90     18108
        >50K       0.74      0.60      0.66      6008

    accuracy                           0.85     24116
   macro avg       0.81      0.76      0.78     24116
weighted avg       0.84      0.85      0.84     24116



In [197]:
summary = logit_summary(
    data=data_new,
    numerical_features=numerical_features_list,
    categorical_features=categorical_features_list,
)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.330856812289121
            Iterations: 279
            Function evaluations: 281
            Gradient evaluations: 279


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,24116.0
Model:,Logit,Df Residuals:,24082.0
Method:,MLE,Df Model:,33.0
Date:,"Fri, 28 Apr 2023",Pseudo R-squ.:,0.4106
Time:,17:29:21,Log-Likelihood:,-7978.9
converged:,True,LL-Null:,-13538.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2442,0.010,24.082,0.000,0.224,0.264
stand_scaler__Age,0.3562,0.023,15.221,0.000,0.310,0.402
stand_scaler__Capital Gain,2.2930,0.085,27.011,0.000,2.127,2.459
stand_scaler__Capital Loss,0.2603,0.017,15.079,0.000,0.226,0.294
onehot__Workclass_ Local-gov,-0.9649,0.121,-7.987,0.000,-1.202,-0.728
onehot__Workclass_ Private,-0.7626,0.099,-7.674,0.000,-0.957,-0.568
onehot__Workclass_ Self-emp-inc,-0.5525,0.134,-4.116,0.000,-0.816,-0.289
onehot__Workclass_ Self-emp-not-inc,-1.1811,0.119,-9.928,0.000,-1.414,-0.948
onehot__Workclass_ State-gov,-0.9476,0.134,-7.051,0.000,-1.211,-0.684


# 7th model
### Let's now try to cluster all minority categories of imbalanced features together

In [198]:
data_cluster_imbalanced = data_train.copy()

def balance_predictors(data):
    """Collapse the minority categories of imbalanced predictors, modifying `data` in place.

    * 'Ethnic group', 'Country' and 'Workclass': every value other than the majority
      category (' White', ' United-States', ' Private') becomes 'Other'.
    * 'Marital Status': all non-married statuses become 'Single'.
    * 'Hours per week': 40 -> 'fulltime', below 40 -> 'part-time', otherwise 'overtime'.

    Nothing is returned; the frame passed in is changed.
    """
    majority_by_column = {
        'Ethnic group': ' White',
        'Country': ' United-States',
        'Workclass': ' Private',
    }
    for column, majority in majority_by_column.items():
        data.loc[data[column] != majority, column] = 'Other'

    single_statuses = [' Widowed', ' Separated', ' Married-spouse-absent', ' Never-married', ' Divorced']
    is_single = data["Marital Status"].isin(single_statuses)
    data.loc[is_single, "Marital Status"] = "Single"

    weekly_hours = data['Hours per week']
    part_or_over = np.where(weekly_hours < 40, 'part-time', 'overtime')
    data['Hours per week'] = np.where(weekly_hours == 40, 'fulltime', part_or_over)
    
balance_predictors(data_cluster_imbalanced)
data_cluster_imbalanced.sample(3)

Unnamed: 0,Income,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
24604,<=50K,19,Private,118306,HS-grad,Single,Handlers-cleaners,Own-child,White,Male,0,0,part-time,United-States
10182,<=50K,39,Private,112158,Some-college,Married-civ-spouse,Sales,Husband,White,Male,0,0,part-time,Other
7561,>50K,35,Private,163237,Bachelors,Married-civ-spouse,Sales,Husband,White,Male,0,0,overtime,United-States


In [199]:
# Feature sets for the 7th model: 'Hours per week' is treated as categorical.
numerical_features_list = [
    'Age',
    'Capital Gain',
    'Capital Loss',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
    'Hours per week',
]

In [200]:
get_LR_performance(
    data=data_cluster_imbalanced,
    numerical_features_list=numerical_features_list,
    categorical_features_list=categorical_features_list,
)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90     19773
        >50K       0.73      0.59      0.65      6275

    accuracy                           0.85     26048
   macro avg       0.81      0.76      0.78     26048
weighted avg       0.84      0.85      0.84     26048



In [201]:
summary = logit_summary(
    data=data_cluster_imbalanced,
    numerical_features=numerical_features_list,
    categorical_features=categorical_features_list,
)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.32063149905268484
            Iterations: 264
            Function evaluations: 267
            Gradient evaluations: 264


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,26048.0
Model:,Logit,Df Residuals:,26017.0
Method:,MLE,Df Model:,30.0
Date:,"Fri, 28 Apr 2023",Pseudo R-squ.:,0.4193
Time:,17:29:25,Log-Likelihood:,-8351.8
converged:,True,LL-Null:,-14382.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2652,0.010,26.593,0.000,0.246,0.285
stand_scaler__Age,0.4018,0.024,16.814,0.000,0.355,0.449
stand_scaler__Capital Gain,2.2644,0.081,27.806,0.000,2.105,2.424
stand_scaler__Capital Loss,0.2550,0.017,15.342,0.000,0.222,0.288
onehot__Workclass_Other,-0.1078,0.045,-2.395,0.017,-0.196,-0.020
onehot__Marital Status_ Married-civ-spouse,-3.5974,0.141,-25.483,0.000,-3.874,-3.321
onehot__Marital Status_Single,-5.9784,0.330,-18.124,0.000,-6.625,-5.332
onehot__Occupation_ Armed-Forces,-11.9757,614.022,-0.020,0.984,-1215.437,1191.486
onehot__Occupation_ Craft-repair,-0.0925,0.086,-1.076,0.282,-0.261,0.076


**So far, we have checked 7 models; none of them gave any performance improvement, although we reached faster convergence. Let's now try to build the last model, which combines the best ideas of the 7 models above:**

# Final Model

In [299]:
# The final model starts from the training data without rows that have a missing value
# (dropna returns a new frame).
data_final = data_train.dropna(how='any')

In [300]:
# Bucket weekly hours: exactly 40 -> 'fulltime', below 40 -> 'part-time', anything else -> 'overtime'.
weekly_hours = data_final['Hours per week']
part_or_over = np.where(weekly_hours < 40, 'part-time', 'overtime')
data_final['Hours per week'] = np.where(weekly_hours == 40, 'fulltime', part_or_over)

In [301]:
data_final = cluster_categorical(data_final)

# Remove the poorly represented classes in a single pass.
is_rare_workclass = data_final['Workclass'].isin([' Never-worked', ' Without-pay'])
is_armed_forces = data_final['Occupation'].isin([' Armed-Forces'])

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the previous frame (avoids SettingWithCopyWarning).
data_final = data_final.loc[~is_rare_workclass & ~is_armed_forces].copy()

# Merge ' Sales' into ' Other-service'.
data_final['Occupation'] = data_final['Occupation'].replace({' Sales': ' Other-service'})

data_final.head()

Unnamed: 0,Income,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
6293,<=50K,37,Private,333651,HS-grad,Single,Adm-clerical,Not-in-Family,White,Male,0,0,overtime,Developed
17719,<=50K,22,Private,237720,HS-grad,Single,Craft-repair,Family,White,Male,0,0,fulltime,Developed
18084,<=50K,34,Private,113198,Assoc-acdm,Married,Adm-clerical,Family,White,Male,0,0,part-time,Developed
22008,>50K,42,Private,303155,Bachelors,Married,Exec-managerial,Family,White,Male,0,0,overtime,Developed
23423,>50K,30,Private,19302,Some-college,Married,Prof-specialty,Family,White,Male,7688,0,fulltime,Developed


In [302]:
# Feature sets for the final model: 'Hours per week' is categorical and 'Sex' is left out.
numerical_features_list = [
    'Age',
    'final weight',
    'Capital Gain',
    'Capital Loss',
]
categorical_features_list = [
    'Workclass',
    'Hours per week',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Country',
]

In [303]:
get_LR_performance(
    data=data_final,
    numerical_features_list=numerical_features_list,
    categorical_features_list=categorical_features_list,
)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

       <=50K       0.87      0.93      0.90     18094
        >50K       0.74      0.59      0.66      6008

    accuracy                           0.85     24102
   macro avg       0.81      0.76      0.78     24102
weighted avg       0.84      0.85      0.84     24102



In [304]:
summary = logit_summary(
    data=data_final,
    numerical_features=numerical_features_list,
    categorical_features=categorical_features_list,
)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.33412098089799547
            Iterations: 218
            Function evaluations: 221
            Gradient evaluations: 218


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,24102.0
Model:,Logit,Df Residuals:,24072.0
Method:,MLE,Df Model:,29.0
Date:,"Fri, 28 Apr 2023",Pseudo R-squ.:,0.405
Time:,18:25:05,Log-Likelihood:,-8053.0
converged:,True,LL-Null:,-13534.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2402,0.010,25.121,0.000,0.221,0.259
stand_scaler__Age,0.3706,0.023,15.989,0.000,0.325,0.416
stand_scaler__final weight,0.0838,0.020,4.127,0.000,0.044,0.124
stand_scaler__Capital Gain,2.3217,0.085,27.273,0.000,2.155,2.489
stand_scaler__Capital Loss,0.2667,0.017,15.461,0.000,0.233,0.301
onehot__Workclass_ Local-gov,-1.1803,0.118,-10.024,0.000,-1.411,-0.950
onehot__Workclass_ Private,-0.9368,0.096,-9.759,0.000,-1.125,-0.749
onehot__Workclass_ Self-emp-inc,-0.6552,0.132,-4.963,0.000,-0.914,-0.396
onehot__Workclass_ Self-emp-not-inc,-1.3292,0.116,-11.410,0.000,-1.558,-1.101


#### Even though the model performance has not improved compared to the initial model, we were able to reduce the number of iterations until convergence by more than a factor of two. We have also built a model in which all features are statistically significant

### List of changes to initial data:
##### 1. Remove all missing data
##### 2. Remove classes that are poorly represented, including Workclass: 'Never worked' and 'Without pay'; Occupation: 'Armed Forces'
##### 3. Clustered most of the categorical features to reduce dimensionality (from **83** in the initial model to **28** in the final model)