In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import scipy.sparse as sps
from statsmodels.api import Logit
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.linear_model import LogisticRegression
from sklearn import set_config

Here the results from the 'Adult EDA' notebook are going to be used.

In [28]:
# %run "Adult EDA.ipynb"

In [91]:
# Column names for the data file, listed in file order (the file is read
# with header=None, so the names are supplied here).
adult_columns = [
    "Age", "Workclass", "final weight", "Education", "Education-Num",
    "Marital Status", "Occupation", "Relationship", "Ethnic group", "Sex",
    "Capital Gain", "Capital Loss", "Hours per week", "Country", "Income",
]

ADULT_DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# NOTE: string values keep a leading space (e.g. ' Private', ' ?'); the
# later cells match on those space-prefixed literals.
df = pd.read_csv(ADULT_DATA_URL, header=None, names=adult_columns)

In [92]:
# 'Education-Num' is removed; the 'Education' column is ordinal-encoded later.
# errors='ignore' makes the cell idempotent: re-running it once the column is
# already gone no longer raises KeyError.
df = df.drop(columns=['Education-Num'], errors='ignore')

In [93]:
# Nominal (unordered) text columns whose value counts are inspected below.
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [32]:
# Print the frequency table of every categorical column, each followed by a
# blank line.
for col in categorical_features_list:
    print(col)
    print(df[col].value_counts())
    print()

Workclass
 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: Workclass, dtype: int64

Marital Status
 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: Marital Status, dtype: int64

Occupation
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: Occupation, dtype: 

Because the whole dataset (X) contains only one record with the 'Holand-Netherlands' value in the 'Country' column, it has to be handled separately: if it ends up only in a test set, the model will not be able to predict the target for that record. For the initial model, where the data is otherwise unchanged, this observation is removed.

In [94]:
# Keep every row whose Country is not ' Holand-Netherlands'; the shape is
# printed before and after so the number of removed rows is visible.
print(df.shape)
is_netherlands = df['Country'] == ' Holand-Netherlands'
df_no_nl = df.loc[~is_netherlands].copy()
print(df_no_nl.shape)

(32561, 14)
(32560, 14)


In [167]:
# Separate the 'Income' target from the predictor columns.
y = df_no_nl['Income']
X = df_no_nl.drop(columns=['Income'])

## 1. Features preprocessing

First, all variables have to be transformed to a numerical format so that they can be fed to the LogisticRegression model:

In [168]:
# No hold-out split is made here: the training data is a copy of the full
# X / y, so later edits to it do not touch the originals.
X_train, y_train = X.copy(), y.copy()

In [170]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In this dataset there is only one feature where the order matters - Education - so it will be transformed using OrdinalEncoder. For all the other categorical features the order does not matter, hence we can apply OneHotEncoder() to them.

# 1st model 
### Initial model without changes in data

In [261]:
# Model 1: every numeric column is scaled, every nominal column is one-hot
# encoded.
numerical_features_list = [
    'Age',
    'final weight',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [262]:
def get_LR_performance(X, y, numerical_features_list: list, categorical_features_list: list):
    """Fit a preprocessing + LogisticRegression pipeline and print its performance.

    'Education' is ordinal-encoded with an explicit level order, the numerical
    features are standardised and the categorical features are one-hot encoded
    with the first level dropped. Every other column is discarded.

    Two results are printed:
      * mean and standard deviation of the 5-fold cross-validated macro F1;
      * a classification report computed on the same data the pipeline was
        fitted on (in-sample, so not directly comparable with the CV score).

    Args:
        X (pd.DataFrame): feature table; must contain 'Education' and every
            column named in the two feature lists.
        y (array-like): target labels aligned with X.
        numerical_features_list (list): columns processed by StandardScaler.
        categorical_features_list (list): columns processed by OneHotEncoder.

    Returns:
        None. The scores and the report are printed.
    """
    education_levels = [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th', ' 11th',
                        ' 12th', ' HS-grad', ' Some-college', ' Assoc-voc', ' Assoc-acdm',
                        ' Bachelors', ' Masters', ' Prof-school', ' Doctorate']

    # The selector is a regex search on column names, so it picks up every
    # column containing 'Education'; the single category list above assumes
    # exactly one such column is present.
    columntransformer = ColumnTransformer(transformers=[
        ('ordinal', OrdinalEncoder(categories=[education_levels]),
         make_column_selector(pattern='Education')),
        ('stand scaler', StandardScaler(), numerical_features_list),
        ('onehot', OneHotEncoder(dtype='int', drop='first'), categorical_features_list)],
        remainder='drop')

    pipe = make_pipeline(columntransformer, LogisticRegression(max_iter=500)).fit(X, y)

    y_pred = pipe.predict(X)

    scores = cross_val_score(pipe, X, y, cv=5, scoring='f1_macro')

    f1_mean_score = round(np.mean(scores), 2)
    f1_std = round(np.std(scores), 2)

    # Name the report rows from the fitted model's own classes (sorted, the
    # same order classification_report uses) instead of the global y_train in
    # appearance order, which could mislabel rows or describe another target.
    class_names = [str(label) for label in pipe.classes_]
    report = classification_report(y, y_pred, target_names=class_names)
    print(f'f1 score: mean = {f1_mean_score} | std = {f1_std}\n{report}')


In [263]:
# Baseline: unmodified training data with the model 1 feature lists.
get_LR_performance(
    X=X_train,
    y=y_train,
    numerical_features_list=numerical_features_list,
    categorical_features_list=categorical_features_list,
)

f1 score: mean = 0.78 | std = 0.0
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.91     24719
        >50K       0.74      0.61      0.66      7841

    accuracy                           0.85     32560
   macro avg       0.81      0.77      0.79     32560
weighted avg       0.85      0.85      0.85     32560



### Let's now understand the significance of features with the help of the Logit() function from statsmodels

In [174]:
def logit_summary(X, y, numerical_features: list, categorical_features: list,
                  add_intercept: bool = False):
    """Preprocess the data, fit a statsmodels Logit and return its summary.

    The features go through the same transformations as in the
    LogisticRegression pipeline: 'Education' is ordinal-encoded, the numerical
    features are standardised and the categorical features are one-hot encoded
    with the first level dropped. The target is one-hot encoded with the first
    level dropped as well, which leaves a single 0/1 column for a binary target.

    Args:
        X (pd.DataFrame): feature table; must contain 'Education' and every
            column named in the two feature lists.
        y (array-like): target labels aligned with X.
        numerical_features (list): columns processed by StandardScaler.
        categorical_features (list): columns processed by OneHotEncoder.
        add_intercept (bool): when True, a constant column named 'const' is
            prepended to the design matrix. statsmodels does not add an
            intercept on its own; the default (False) keeps the original
            behaviour of fitting without one.

    Returns:
        Summary: summary of the fitted Logit model, used to decide whether a
            feature should be kept, modified or removed.
    """
    education_levels = [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th', ' 11th',
                        ' 12th', ' HS-grad', ' Some-college', ' Assoc-voc', ' Assoc-acdm',
                        ' Bachelors', ' Masters', ' Prof-school', ' Doctorate']

    column_transformer = ColumnTransformer(transformers=[
        ('ordinal', OrdinalEncoder(categories=[education_levels]),
         make_column_selector(pattern='Education')),
        ('stand_scaler', StandardScaler(), numerical_features),
        ('onehot', OneHotEncoder(dtype='int', drop='first'), categorical_features)],
        remainder='drop')

    X_trans = column_transformer.fit_transform(X)

    # ColumnTransformer may return a sparse matrix; Logit is given a dense frame.
    if sps.issparse(X_trans):
        X_trans = X_trans.toarray()

    X_trans = pd.DataFrame(X_trans, columns=column_transformer.get_feature_names_out())
    if add_intercept:
        X_trans.insert(0, 'const', 1.0)

    # Encode the target; the generated column name becomes the
    # "Dep. Variable" shown in the summary.
    target_encoder = OneHotEncoder(dtype='int', drop='first')
    y_trans = target_encoder.fit_transform(pd.DataFrame(y))
    if sps.issparse(y_trans):
        y_trans = y_trans.toarray()
    y_trans = pd.DataFrame(y_trans, columns=target_encoder.get_feature_names_out())

    model = Logit(y_trans, X_trans).fit_regularized()

    return model.summary()

In [175]:
# Feature significances for the baseline feature set.
summary = logit_summary(
    X=X_train,
    y=y_train,
    numerical_features=numerical_features_list,
    categorical_features=categorical_features_list,
)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3272940432823314
            Iterations: 561
            Function evaluations: 564
            Gradient evaluations: 561


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,32560.0
Model:,Logit,Df Residuals:,32477.0
Method:,MLE,Df Model:,82.0
Date:,"Thu, 06 Apr 2023",Pseudo R-squ.:,0.4071
Time:,13:21:22,Log-Likelihood:,-10657.0
converged:,True,LL-Null:,-17974.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2556,0.009,28.639,0.000,0.238,0.273
stand_scaler__Age,0.2573,0.021,12.065,0.000,0.216,0.299
stand_scaler__final weight,0.0709,0.018,3.991,0.000,0.036,0.106
stand_scaler__Capital Gain,2.3646,0.076,31.058,0.000,2.215,2.514
stand_scaler__Capital Loss,0.2641,0.015,17.927,0.000,0.235,0.293
onehot__Workclass_ Federal-gov,0.9275,1.08e+06,8.56e-07,1.000,-2.12e+06,2.12e+06
onehot__Workclass_ Local-gov,0.2477,1.08e+06,2.29e-07,1.000,-2.12e+06,2.12e+06
onehot__Workclass_ Never-worked,-2.1205,6.702,-0.316,0.752,-15.257,11.016
onehot__Workclass_ Private,0.4327,1.08e+06,4e-07,1.000,-2.12e+06,2.12e+06


# 2nd model
### Same model, but without 'final weight'

As we remember from the EDA, the **'final weight'** feature did not pass the significance threshold. Let's try to remove it and check the performance.

In [264]:
# Model 2: same as model 1 but without 'final weight'.
numerical_features_list = [
    'Age',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [265]:
# Model 2: unmodified training data, feature lists without 'final weight'.
get_LR_performance(
    X=X_train,
    y=y_train,
    numerical_features_list=numerical_features_list,
    categorical_features_list=categorical_features_list,
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


f1 score: mean = 0.78 | std = 0.0
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.91     24719
        >50K       0.73      0.61      0.66      7841

    accuracy                           0.85     32560
   macro avg       0.81      0.77      0.78     32560
weighted avg       0.85      0.85      0.85     32560



#### Let's check if features' significances have changed

In [266]:
# Feature significances for model 2 (no 'final weight').
summary = logit_summary(
    X=X_train,
    y=y_train,
    numerical_features=numerical_features_list,
    categorical_features=categorical_features_list,
)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.32190023737059703
            Iterations: 522
            Function evaluations: 524
            Gradient evaluations: 522


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,32560.0
Model:,Logit,Df Residuals:,32477.0
Method:,MLE,Df Model:,82.0
Date:,"Thu, 06 Apr 2023",Pseudo R-squ.:,0.4169
Time:,14:48:18,Log-Likelihood:,-10481.0
converged:,True,LL-Null:,-17974.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2517,0.009,27.955,0.000,0.234,0.269
stand_scaler__Age,0.3209,0.022,14.650,0.000,0.278,0.364
stand_scaler__Capital Gain,2.3516,0.076,30.897,0.000,2.202,2.501
stand_scaler__Capital Loss,0.2593,0.015,17.510,0.000,0.230,0.288
stand_scaler__Hours per week,0.3734,0.020,18.900,0.000,0.335,0.412
onehot__Workclass_ Federal-gov,0.7640,,,,,
onehot__Workclass_ Local-gov,0.0951,,,,,
onehot__Workclass_ Never-worked,-1.6768,4.808,-0.349,0.727,-11.100,7.746
onehot__Workclass_ Private,0.2765,,,,,


According to the Logit() results, all numerical features are statistically significant. Some categories in a couple of categorical features have to be clustered, as they are insignificant.
1. Workclass representatives who do not work or work without pay will definitely earn less than 50k, so they can become one cluster.
2. In the Marital Status feature we can cluster together those who have no partner and those who have one.
3. All categories of Occupation are insignificant, so they could be removed from the model. For now they are kept as they are, because the EDA showed that this feature is significant.
4. All categories of the Relationship, Ethnic group and Sex features are significant.
5. Most countries have no impact on the target, so it is possible to cluster them by part of the world.

In [46]:
# feature_names = second.named_steps["columntransformer"].get_feature_names_out()
# coefs = second.named_steps["logisticregression"].coef_.flatten()
# zipped = zip(feature_names, coefs)
# df = pd.DataFrame(zipped, columns=["feature", "value"])
# df["abs_value"] = df["value"].apply(lambda x: abs(x))
# df["colors"] = df["value"].apply(lambda x: "green" if x > 0 else "red")
# df = df.sort_values("abs_value", ascending=False)

# fig, ax = plt.subplots(1, 1, figsize=(7, 12))
# sns.barplot(x="value",
#             y="feature",
#             data=df,
#            palette=df["colors"])
# ax.set_yticklabels(ax.get_yticklabels(), fontsize=7)
# ax.set_title("Top Features", fontsize=20)
# ax.set_ylabel("Feature Name", fontsize=15)
# ax.set_xlabel("Coef", fontsize=15)

# 3rd model
### Clustering categories of features


In [267]:
# Model 3 categorical columns. Only this list is reset here;
# numerical_features_list keeps whatever value the previous cells left in it.
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [268]:
def _flatten_groups(groups):
    """Turn {new label: [old values]} into the {old value: new label} dict used by Series.replace."""
    return {old: new for new, olds in groups.items() for old in olds}


# Each table lists, per merged category, the original values folded into it.
# Values that are not listed (e.g. ' United-States', ' ?') are left unchanged.
workclass_groups = {
    'Non profitable': [' Without-pay', ' Never-worked', ' Self-emp-not-inc'],
    'Government': [' Federal-gov', ' State-gov', ' Local-gov'],
}
relationship_groups = {
    ' Not-in-family': [' Unmarried', ' Husband', ' Wife', ' Own-child'],
}
marital_status_groups = {
    'Single': [' Widowed', ' Separated', ' Married-spouse-absent', ' Never-married', ' Divorced'],
    'Married': [' Married-civ-spouse', ' Married-AF-spouse'],
}
country_groups = {
    'Americas': [' Mexico', ' Canada', ' Puerto-Rico', ' El-Salvador', ' Cuba', ' Jamaica',
                 ' Dominican-Republic', ' Guatemala', ' Columbia', ' Haiti', ' Peru',
                 ' Nicaragua', ' Ecuador', ' Trinadad&Tobago', ' Outlying-US(Guam-USVI-etc)',
                 ' Honduras'],
    'Asia': [' India', ' China', ' South', ' Vietnam', ' Japan', ' Taiwan', ' Thailand',
             ' Laos', ' Hong', ' Philippines', ' Iran', ' Cambodia'],
    'Europe': [' Germany', ' England', ' Italy', ' Poland', ' Portugal', ' France', ' Greece',
               ' Ireland', ' Yugoslavia', ' Hungary', ' Scotland'],
}

# Start from a fresh copy so the original training features stay untouched.
X_clustered = X_train.copy()
X_clustered['Workclass'] = X_clustered['Workclass'].replace(_flatten_groups(workclass_groups))
X_clustered['Relationship'] = X_clustered['Relationship'].replace(_flatten_groups(relationship_groups))
X_clustered['Marital Status'] = X_clustered['Marital Status'].replace(_flatten_groups(marital_status_groups))
X_clustered['Country'] = X_clustered['Country'].replace(_flatten_groups(country_groups))

In [269]:
# Preview the regrouped features.
X_clustered.head()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
0,39,Government,77516,Bachelors,Single,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Non profitable,83311,Bachelors,Married,Exec-managerial,Not-in-family,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,Single,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,Married,Handlers-cleaners,Not-in-family,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,Married,Prof-specialty,Not-in-family,Black,Female,0,0,40,Americas


Let's now apply the pipeline to updated dataset

In [270]:
# Model 3: regrouped categories.
get_LR_performance(
    X=X_clustered,
    y=y_train,
    numerical_features_list=numerical_features_list,
    categorical_features_list=categorical_features_list,
)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90     24719
        >50K       0.74      0.60      0.66      7841

    accuracy                           0.85     32560
   macro avg       0.81      0.76      0.78     32560
weighted avg       0.84      0.85      0.85     32560



In [240]:
# Feature significances for model 3.
# NOTE(review): this cell's execution count (240) predates the neighbouring
# cells (267-270) and its saved output lists 'final weight', which the
# numerical_features_list defined for model 2 omits - re-run to refresh.
summary = logit_summary(
    X=X_clustered,
    y=y_train,
    numerical_features=numerical_features_list,
    categorical_features=categorical_features_list,
)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3346379605843438
            Iterations: 241
            Function evaluations: 243
            Gradient evaluations: 241


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,32560.0
Model:,Logit,Df Residuals:,32526.0
Method:,MLE,Df Model:,33.0
Date:,"Thu, 06 Apr 2023",Pseudo R-squ.:,0.3938
Time:,14:35:39,Log-Likelihood:,-10896.0
converged:,True,LL-Null:,-17974.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2301,0.008,27.872,0.000,0.214,0.246
stand_scaler__Age,0.3063,0.020,15.608,0.000,0.268,0.345
stand_scaler__final weight,0.0797,0.018,4.544,0.000,0.045,0.114
stand_scaler__Capital Gain,2.3863,0.075,31.901,0.000,2.240,2.533
stand_scaler__Capital Loss,0.2726,0.015,18.601,0.000,0.244,0.301
onehot__Workclass_ Private,-0.2742,3.194,-0.086,0.932,-6.535,5.986
onehot__Workclass_ Self-emp-inc,0.0922,3.195,0.029,0.977,-6.170,6.354
onehot__Workclass_Government,-0.3131,3.195,-0.098,0.922,-6.574,5.948
onehot__Workclass_Non profitable,-0.6734,3.194,-0.211,0.833,-6.933,5.586


The performance for the minority class is now worse than before, so we can conclude that clustering the categories in this way does not perform well.

# 4th model
### Let's try to cluster 'Country' in another way



In [271]:
# Model 4: 'final weight' is back among the numerical features.
numerical_features_list = [
    'Age',
    'final weight',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
]
categorical_features_list = [
    'Marital Status',
    'Occupation',
    'Workclass',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [272]:
# Start again from the untouched training features and merge Workclass levels
# into 'Non profitable' / 'Government'; unlisted levels stay as they are.
workclass_groups = {
    'Non profitable': [' Without-pay', ' Never-worked', ' Self-emp-not-inc'],
    'Government': [' Federal-gov', ' State-gov', ' Local-gov'],
}
X_clustered = X_train.copy()
X_clustered['Workclass'] = X_clustered['Workclass'].replace(
    {level: group for group, levels in workclass_groups.items() for level in levels}
)

def cluster_categorical(X):
    """Return a copy of X with coarser 'Relationship', 'Marital Status' and 'Country' levels.

    * Relationship: ' Unmarried', ' Husband', ' Wife' and ' Own-child' are
      merged into ' Not-in-family'.
    * Marital Status: every level becomes either 'Single' or 'Married'.
    * Country: listed countries become 'Developed' or 'Developing'.

    Values that are not listed in a mapping (e.g. ' ?') are left unchanged.
    The input frame is not modified; callers must use the returned frame.

    Args:
        X (pd.DataFrame): frame containing the three columns above, with the
            space-prefixed raw values.

    Returns:
        pd.DataFrame: a new frame with the three columns regrouped.
    """
    relationship_map = dict.fromkeys(
        [' Unmarried', ' Husband', ' Wife', ' Own-child'], ' Not-in-family')

    marital_status_map = {
        **dict.fromkeys([' Widowed', ' Separated', ' Married-spouse-absent',
                         ' Never-married', ' Divorced'], 'Single'),
        **dict.fromkeys([' Married-civ-spouse', ' Married-AF-spouse'], 'Married'),
    }

    # NOTE(review): the Developed / Developing assignment is reproduced from
    # the original mapping as-is (e.g. ' Ireland' is 'Developing',
    # ' Puerto-Rico' is 'Developed') - confirm it is intended.
    country_map = {
        **dict.fromkeys([' United-States', ' Canada', ' Puerto-Rico', ' Japan', ' Taiwan',
                         ' Hong', ' Germany', ' England', ' Italy', ' France',
                         ' Scotland'], 'Developed'),
        **dict.fromkeys([' Mexico', ' El-Salvador', ' Cuba', ' Jamaica',
                         ' Dominican-Republic', ' Guatemala', ' Columbia', ' Haiti',
                         ' Peru', ' Nicaragua', ' Ecuador', ' Trinadad&Tobago',
                         ' Outlying-US(Guam-USVI-etc)', ' Honduras', ' India', ' China',
                         ' South', ' Vietnam', ' Thailand', ' Laos', ' Philippines',
                         ' Iran', ' Cambodia', ' Poland', ' Portugal', ' Greece',
                         ' Ireland', ' Yugoslavia', ' Hungary'], 'Developing'),
    }

    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    X = X.copy()
    X['Relationship'] = X['Relationship'].replace(relationship_map)
    X['Marital Status'] = X['Marital Status'].replace(marital_status_map)
    X['Country'] = X['Country'].replace(country_map)
    return X

# Apply the Relationship / Marital Status / Country regrouping defined above.
X_clustered = cluster_categorical(X_clustered)

In [273]:
# Model 4: countries grouped as Developed / Developing.
get_LR_performance(
    X=X_clustered,
    y=y_train,
    numerical_features_list=numerical_features_list,
    categorical_features_list=categorical_features_list,
)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.91     24719
        >50K       0.74      0.60      0.66      7841

    accuracy                           0.85     32560
   macro avg       0.81      0.77      0.78     32560
weighted avg       0.85      0.85      0.85     32560



In [274]:
# Feature significances for model 4.
summary = logit_summary(
    X=X_clustered,
    y=y_train,
    numerical_features=numerical_features_list,
    categorical_features=categorical_features_list,
)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3280483864800765
            Iterations: 252
            Function evaluations: 255
            Gradient evaluations: 252


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,32560.0
Model:,Logit,Df Residuals:,32527.0
Method:,MLE,Df Model:,32.0
Date:,"Thu, 06 Apr 2023",Pseudo R-squ.:,0.4057
Time:,14:49:03,Log-Likelihood:,-10681.0
converged:,True,LL-Null:,-17974.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2291,0.008,27.371,0.000,0.213,0.245
stand_scaler__Age,0.3676,0.020,18.109,0.000,0.328,0.407
stand_scaler__final weight,0.0863,0.018,4.871,0.000,0.052,0.121
stand_scaler__Capital Gain,2.3661,0.075,31.595,0.000,2.219,2.513
stand_scaler__Capital Loss,0.2680,0.015,18.170,0.000,0.239,0.297
stand_scaler__Hours per week,0.3962,0.019,20.482,0.000,0.358,0.434
onehot__Marital Status_Single,-2.4996,0.047,-53.332,0.000,-2.591,-2.408
onehot__Occupation_ Adm-clerical,0.5880,3.453,0.170,0.865,-6.179,7.355
onehot__Occupation_ Armed-Forces,-0.4268,3.719,-0.115,0.909,-7.716,6.862


Country is still significant, but all categories of the Workclass and Occupation variables are insignificant. Let's try to apply a label encoder to them instead of a one-hot encoder.
# 5th model

In [280]:
# Model 5: 'Occupation' and 'Workclass' move to the numerical list because
# they are label-encoded to integers in the next cell; 'final weight' is
# left out.
categorical_features_list = [
    'Marital Status',
    'Relationship',
    'Ethnic group',
    'Country',
    'Sex',
]
numerical_features_list = [
    'Age',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
    'Occupation',
    'Workclass',
]

In [281]:
X_clustered = X_train.copy()

# One encoder instance is refitted per column, so after the loop `le` holds
# the 'Occupation' fit. The integer codes are later treated as numeric
# features and standardised.
le = LabelEncoder()
for column in ('Workclass', 'Occupation'):
    X_clustered[column] = le.fit_transform(X_clustered[column])

# Then regroup Relationship / Marital Status / Country as for model 4.
X_clustered = cluster_categorical(X_clustered)

In [282]:
# Show one example row; random_state pins the choice so the displayed row is
# the same on every run.
X_clustered.sample(random_state=42)

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
2416,55,6,59469,9th,Married,5,Not-in-family,White,Male,0,0,25,Developed


In [283]:
# Model 5: label-encoded Workclass / Occupation treated as numeric features.
get_LR_performance(
    X=X_clustered,
    y=y_train,
    numerical_features_list=numerical_features_list,
    categorical_features_list=categorical_features_list,
)

f1 score: mean = 0.77 | std = 0.01
              precision    recall  f1-score   support

       <=50K       0.87      0.93      0.90     24719
        >50K       0.72      0.57      0.63      7841

    accuracy                           0.84     32560
   macro avg       0.80      0.75      0.77     32560
weighted avg       0.84      0.84      0.84     32560



In [279]:
# Feature significances for model 5.
# NOTE(review): this cell's execution count (279) is lower than that of the
# cells preparing model 5 (280-283), so the saved output may predate them -
# re-run to refresh.
summary = logit_summary(
    X=X_clustered,
    y=y_train,
    numerical_features=numerical_features_list,
    categorical_features=categorical_features_list,
)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3418058795568492
            Iterations: 109
            Function evaluations: 111
            Gradient evaluations: 109


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,32560.0
Model:,Logit,Df Residuals:,32544.0
Method:,MLE,Df Model:,15.0
Date:,"Thu, 06 Apr 2023",Pseudo R-squ.:,0.3808
Time:,14:50:05,Log-Likelihood:,-11129.0
converged:,True,LL-Null:,-17974.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.3088,0.007,43.110,0.000,0.295,0.323
stand_scaler__Age,0.3559,0.019,18.620,0.000,0.318,0.393
stand_scaler__Capital Gain,2.3873,0.073,32.568,0.000,2.244,2.531
stand_scaler__Capital Loss,0.2743,0.014,19.024,0.000,0.246,0.303
stand_scaler__Hours per week,0.3939,0.018,21.584,0.000,0.358,0.430
stand_scaler__Occupation,0.0714,0.017,4.134,0.000,0.038,0.105
stand_scaler__Workclass,-0.0463,0.017,-2.679,0.007,-0.080,-0.012
onehot__Marital Status_Single,-2.4988,0.046,-54.651,0.000,-2.588,-2.409
onehot__Relationship_ Other-relative,-0.9631,0.205,-4.692,0.000,-1.365,-0.561


All features are now significant, but the performance of model in total and especially for the minority class is now worse.
# 6th model

Let's try to remove missing data

In [284]:
# Missing values are encoded as ' ?' in these three columns; keep only the
# rows where none of them holds that placeholder.
columns_with_placeholder = ['Workclass', 'Occupation', 'Country']
is_complete = (X_train[columns_with_placeholder] != ' ?').all(axis=1)

df_no_missing_values = pd.concat([X_train, y_train], axis=1).loc[is_complete]
X_no_missing_values = df_no_missing_values.drop(columns=['Income'])
y_no_missing_values = df_no_missing_values['Income']

In [285]:
# Inspect the filtered frame (features plus the 'Income' target) after the ' ?' rows were dropped.
df_no_missing_values

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [286]:
# Shapes after vs. before dropping the ' ?' rows (features, then target).
print(f"{X_no_missing_values.shape} {y_no_missing_values.shape}")
print(f"{X_train.shape} {y_train.shape}")

(30161, 13) (30161,)
(32560, 13) (32560,)


In [287]:
# Column groups passed to the model helpers for the 6th and 7th models.
numerical_features_list = [
    'Age',
    'final weight',
    'Capital Gain',
    'Capital Loss',
    'Hours per week',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
]

In [288]:
# Apply the notebook's categorical clustering helper to the filtered features.
# NOTE(review): the result overwrites its own input, so re-running this cell alone
# feeds already-clustered data back into cluster_categorical — confirm that is harmless.
X_no_missing_values = cluster_categorical(X_no_missing_values)

In [289]:
# Evaluate logistic regression on the rows without ' ?' values (features and target filtered together).
get_LR_performance(X_no_missing_values, y_no_missing_values, numerical_features_list, categorical_features_list)

f1 score: mean = 0.78 | std = 0.01
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90     22653
        >50K       0.74      0.60      0.67      7508

    accuracy                           0.85     30161
   macro avg       0.81      0.77      0.78     30161
weighted avg       0.84      0.85      0.84     30161



This approach to data preprocessing gave us the best result so far: performance is slightly better than it was for the initial model. It's still not a good model, though.
# 7th model
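### Let's try applying the ln() function to the 'Age', 'Capital Gain' and 'Capital Loss' features (as they are heavy-tailed) before the Standard Scaler, to bring them closer to a normal distribution. ln(1 + x) is used for the two capital features because they contain zeros.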

In [290]:
# Start the 7th-model features from a copy so that X_train itself stays untouched.
X_logged = X_train.copy()

In [291]:
# Rebuild the frame from X_train inside this cell: the transforms below overwrite
# their own inputs, so without a fresh copy a second run of the cell would take
# the logarithm of already-logged values and re-cluster clustered categories.
X_logged = X_train.copy()

# ln(1 + x) for the capital columns (they contain zeros), plain ln for Age.
X_logged['Capital Gain'] = np.log(1 + X_logged['Capital Gain'])
X_logged['Capital Loss'] = np.log(1 + X_logged['Capital Loss'])
X_logged['Age'] = np.log(X_logged['Age'])

# Same categorical clustering helper as used for the previous models.
X_logged = cluster_categorical(X_logged)

In [292]:
# Evaluate logistic regression on the log-transformed, clustered features.
get_LR_performance(X_logged, y_train, numerical_features_list, categorical_features_list)

f1 score: mean = 0.77 | std = 0.0
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90     24719
        >50K       0.72      0.59      0.65      7841

    accuracy                           0.85     32560
   macro avg       0.80      0.76      0.78     32560
weighted avg       0.84      0.85      0.84     32560



Not better either.
# 8th model
### Another attempt: bucket the 'Hours per week' feature into part-time (fewer than 40 hours), full-time (exactly 40 hours) and overtime (more than 40 hours) workers

In [300]:
# Derive the 8th-model features from X_train in one pass (assign works on a copy):
#   * 'Hours per week' becomes a three-level label: exactly 40 -> 'fulltime',
#     below 40 -> 'part-time', anything else -> 'overtime';
#   * the capital columns get ln(1 + x), Age gets plain ln.
X_new = X_train.assign(**{
    'Hours per week': lambda frame: np.select(
        [frame['Hours per week'] == 40, frame['Hours per week'] < 40],
        ['fulltime', 'part-time'],
        default='overtime',
    ),
    'Capital Gain': lambda frame: np.log(1 + frame['Capital Gain']),
    'Capital Loss': lambda frame: np.log(1 + frame['Capital Loss']),
    'Age': lambda frame: np.log(frame['Age']),
})

# Cluster the categorical columns with the notebook helper, then drop the
# 'final weight' column, which this model does not use.
X_new = cluster_categorical(X_new).drop(columns=['final weight'])

In [301]:
# Preview the first rows of the transformed 8th-model features.
X_new.head()

Unnamed: 0,Age,Workclass,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
0,3.663562,State-gov,Bachelors,Single,Adm-clerical,Not-in-family,White,Male,7.684784,0.0,fulltime,Developed
1,3.912023,Self-emp-not-inc,Bachelors,Married,Exec-managerial,Not-in-family,White,Male,0.0,0.0,part-time,Developed
2,3.637586,Private,HS-grad,Single,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,fulltime,Developed
3,3.970292,Private,11th,Married,Handlers-cleaners,Not-in-family,Black,Male,0.0,0.0,fulltime,Developed
4,3.332205,Private,Bachelors,Married,Prof-specialty,Not-in-family,Black,Female,0.0,0.0,fulltime,Developing


In [302]:
# 'Hours per week' now holds labels, so it joins the one-hot encoded columns.
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
    'Hours per week',
]

In [303]:
# Preprocessing for the 8th model:
#   * 'Education' is encoded ordinally, with the levels listed from lowest to highest;
#   * the categorical columns are one-hot encoded with the first level dropped;
#   * every remaining column is passed through unchanged.
columntransformer = ColumnTransformer(transformers = [
    ('ordinal', OrdinalEncoder(categories=[[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                      ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                      ' Bachelors',' Masters',' Prof-school',' Doctorate']]),
     make_column_selector(pattern = 'Education')),
    ('onehot', OneHotEncoder(dtype='int', drop='first'), categorical_features_list)],
    remainder='passthrough')

pipe = make_pipeline(columntransformer, LogisticRegression(max_iter=1000)).fit(X_new, y_train)

# Predictions on the same data the pipeline was fitted on, so the report
# printed below describes the training set.
y_pred = pipe.predict(X_new)

# 5-fold cross-validated macro-F1 for the same pipeline configuration.
scores = cross_val_score(pipe, X_new, y_train, cv=5, scoring='f1_macro')

f1_mean_score = round(np.mean(scores),2)
f1_std = round(np.std(scores),2)

# classification_report lists the classes in sorted label order, so the display
# names must be sorted too. y_train.unique() returns them in order of first
# appearance, which only matches when the first row carries the smallest label.
report = classification_report(y_train, y_pred, target_names=sorted(y_train.unique()))
print(f'f1 score: mean = {f1_mean_score} | std = {f1_std}\n{report}')

f1 score: mean = 0.78 | std = 0.0
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90     24719
        >50K       0.72      0.59      0.65      7841

    accuracy                           0.85     32560
   macro avg       0.80      0.76      0.78     32560
weighted avg       0.84      0.85      0.84     32560



# 9th model
### Let's now try to group all minority categories of the imbalanced features together

In [308]:
# Fresh copy of the training features for the 9th model; balance_predictors rewrites its columns in place.
X_cluster2 = X_train.copy()
def balance_predictors(X):
    """Collapse the minority categories of the imbalanced predictors, in place.

    The category strings keep the leading space they have in the raw data.
    The following columns of ``X`` are rewritten:

    * 'Ethnic group', 'Country', 'Workclass' - every value other than
      ' White', ' United-States' and ' Private' respectively becomes 'Other'.
    * 'Marital Status' - ' Widowed', ' Married-spouse-absent' and
      ' Separated' become 'Other'.
    * 'Occupation' - the six occupations listed in the body become 'Other'.
    * 'Hours per week' - replaced by 'fulltime' (exactly 40), 'part-time'
      (below 40) or 'overtime' (anything else).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the six columns above. It is modified in place.

    Returns
    -------
    None

    Notes
    -----
    Call it once per frame: after the first call 'Hours per week' holds labels
    rather than numbers, so the numeric comparisons no longer apply.
    """
    # Columns dominated by a single category: keep it, fold the rest together.
    dominant_category = {
        'Ethnic group': ' White',
        'Country': ' United-States',
        'Workclass': ' Private',
    }
    for column, majority in dominant_category.items():
        X[column] = np.where(X[column] != majority, 'Other', X[column])

    # The fallback values come from X itself, not from the global X_train, so the
    # function gives correct results for any frame (filtered, reindexed, test data).
    rare_marital_statuses = [' Widowed', ' Married-spouse-absent', ' Separated']
    X['Marital Status'] = np.where(X['Marital Status'].isin(rare_marital_statuses),
                                   'Other', X['Marital Status'])

    grouped_occupations = [' Adm-clerical', ' Armed-Forces', ' Craft-repair',
                           ' Machine-op-inspct', ' Priv-house-serv', ' Transport-moving']
    X['Occupation'] = np.where(X['Occupation'].isin(grouped_occupations),
                               'Other', X['Occupation'])

    X['Hours per week'] = np.where(X['Hours per week'] == 40, 'fulltime',
                                   np.where(X['Hours per week'] < 40, 'part-time', 'overtime'))
    
# Collapse the minority categories of X_cluster2 in place.
balance_predictors(X_cluster2)

# Show three random rows as a spot check; the fixed seed keeps the displayed
# rows the same on every run.
X_cluster2.sample(3, random_state=0)

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
14443,34,Other,209317,Some-college,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
26086,25,Private,120238,Bachelors,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States
22470,31,Private,194901,HS-grad,Married-civ-spouse,Sales,Husband,White,Male,0,0,50,United-States


In [313]:
# Column groups for the 9th model: 'Hours per week' is listed with the
# categorical columns and 'final weight' is left out of the numeric ones.
numerical_features_list = [
    'Age',
    'Capital Gain',
    'Capital Loss',
]
categorical_features_list = [
    'Workclass',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Ethnic group',
    'Sex',
    'Country',
    'Hours per week',
]

In [314]:
# Evaluate logistic regression on the balanced-category features with the reduced numeric list.
# NOTE(review): the recorded run printed "f1 score: mean = nan" after a traceback raised while
# transforming data for scoring — presumably a category unseen during fitting of a fold;
# confirm and re-run this cell after the preprocessing cell above.
get_LR_performance(X_cluster2, y_train, numerical_features_list, categorical_features_list)

Traceback (most recent call last):
  File "/Users/nadiiaduiunova/opt/anaconda3/envs/final_env/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/Users/nadiiaduiunova/opt/anaconda3/envs/final_env/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/Users/nadiiaduiunova/opt/anaconda3/envs/final_env/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/Users/nadiiaduiunova/opt/anaconda3/envs/final_env/lib/python3.9/site-packages/sklearn/pipeline.py", line 480, in predict
    Xt = transform.transform(Xt)
  File "/Users/nadiiaduiunova/opt/anaconda3/envs/final_env/lib/python3.9/site-packages/sklearn/utils/_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/Users/nadiiaduiuno

f1 score: mean = nan | std = nan
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.91     24719
        >50K       0.74      0.60      0.66      7841

    accuracy                           0.85     32560
   macro avg       0.81      0.77      0.78     32560
weighted avg       0.85      0.85      0.85     32560

