In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import scipy.sparse as sps
from statsmodels.api import Logit
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.linear_model import LogisticRegression
from sklearn import set_config

Here the results from 'Adult EDA" file are going to be used

In [2]:
# %run "Adult EDA.ipynb"

In [3]:
adult_columns = [
    "Age",
    "Workclass",
    "final weight",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Ethnic group",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "Income",
]

df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", 
                 header = None, names = adult_columns)

In [4]:
df = df.drop(['Education-Num'], axis = 'columns')

In [5]:
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [6]:
for col in categorical_features_list:
    print(f'{col}\n{df[col].value_counts()}\n')

Workclass
 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: Workclass, dtype: int64

Marital Status
 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: Marital Status, dtype: int64

Occupation
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: Occupation, dtype: 

As in the whole dataset (X) we have only one 'Holand-Netherlands' value in 'Country' column, we have to process it separately, because in case if it appears in the test set, model will not be able to predict target for such a record. For the initial model, where there are no changes in data, this observation will be removed

In [7]:
print(df.shape)
df_no_nl = df.copy()
df_no_nl.drop(df_no_nl.loc[df['Country']==' Holand-Netherlands'].index, inplace=True)
print(df_no_nl.shape)

(32561, 14)
(32560, 14)


In [8]:
X_df = df_no_nl.drop(['Income'], axis = 'columns')
y_df = df_no_nl['Income']

In [9]:
X, X_test, y, y_test = train_test_split(X_df, y_df, test_size= 0.2)

## 1. Features preprocessing

First all variables have to be transformed to numerical format to feed them to Logit function:

In [10]:
X_train = X.copy()
y_train = y.copy()

In [13]:
X_train.head()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
25506,18,Private,88642,Some-college,Never-married,Sales,Own-child,White,Male,0,0,15,United-States
24952,24,Private,230248,7th-8th,Separated,Machine-op-inspct,Own-child,White,Male,0,0,40,United-States
3668,45,Private,199590,5th-6th,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,38,Mexico
31624,29,Private,261725,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,35,United-States
19180,82,?,42435,10th,Widowed,?,Not-in-family,White,Male,0,0,20,United-States


In this dataset we have only one feature, where the order matters - Education, so it will be transformed with using OrdinalEncoder. For all the rest of categorical features the order does not matter, hense we can apply OneHotEncoder() to them.

## Initial model without changes in data

In [14]:
numerical_features_list = ['Age', 'final weight', 'Capital Gain', 'Capital Loss']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [15]:
def get_LR_performance (X, y, numerical_features_list, categorical_features_list):
    """The function performs data preprocessing, creates pipeline with LogisticRegression model, 
        and prints it's performance out

    Args:
        X (Series object): X_train DataFrame of features
        y (array): y_train - target
        numerical_features_list (list): list of features, that have to be processed by Standard scaler
        categorical_features_list (list): list of features, that have to be processed by OneHotEncoder
    """
    columntransformer = ColumnTransformer(transformers = [
    ('ordinal', OrdinalEncoder(categories=[[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                      ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                      ' Bachelors',' Masters',' Prof-school',' Doctorate']]),
     make_column_selector(pattern = 'Education')),
    ('stand scaler', StandardScaler(), numerical_features_list),
    ('onehot', OneHotEncoder(dtype='int', drop='first'), categorical_features_list)],
    remainder='drop')
    
    pipe = make_pipeline(columntransformer, LogisticRegression(max_iter=500)).fit(X, y)
    
    y_pred = pipe.predict(X)
    
    scores = cross_val_score(pipe, X, y, cv=5, scoring='f1_macro')
    
    f1_mean_score = round(np.mean(scores),2)
    f1_std = round(np.std(scores),2)
    
    report = classification_report(y_train, y_pred, target_names=y_train.unique())
    print(f'f1 score: mean = {f1_mean_score} | std = {f1_std}\n{report}')


In [16]:
get_LR_performance(X_train, y_train, numerical_features_list, categorical_features_list)

f1 score: mean = 0.78 | std = 0.0
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.91     19741
        >50K       0.74      0.61      0.67      6307

    accuracy                           0.85     26048
   macro avg       0.81      0.77      0.79     26048
weighted avg       0.85      0.85      0.85     26048



### Let's now understand significance of features with the help of Logit() function from statsmodel

In [15]:
def logit_summary(X, y, numerical_features: list, categorical_features: list):
    """Function performs data preprocessing and applies Logit() function. After that retuns summary which contains featues significances

    Args:
        X (Series object): X_train DataFrame of features
        y (array): y_train - target
        numerical_features_list (list): list of features, that have to be processed by Standard scaler
        categorical_features_list (list): list of features, that have to be processed by OneHotEncoder

    Returns:
        Summary: summary of statsmodel Logit() model with the help of which the decision about 
                keeping or modifying/removing a feature can be made
    """
    column_transformer = ColumnTransformer(transformers = [
        ('ordinal', OrdinalEncoder(categories=[[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                          ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                          ' Bachelors',' Masters',' Prof-school',' Doctorate']]),
         make_column_selector(pattern = 'Education')),
        ('stand_scaler', StandardScaler(), numerical_features),
        ('onehot', OneHotEncoder(dtype='int', drop='first'), categorical_features)],
        remainder='drop')
    
    X_trans = column_transformer.fit_transform(X)
    
    if sps.issparse(X_trans):
        X_trans = X_trans.toarray()
        
    x_columns_names = column_transformer.get_feature_names_out()
    X_trans = pd.DataFrame(X_trans, columns = x_columns_names)
    
    y_train_df = pd.DataFrame(y)
    onehot = OneHotEncoder(dtype='int', drop='first')
    y_trans = onehot.fit_transform(y_train_df)
    y_column_name = onehot.get_feature_names_out()
    y_trans = pd.DataFrame.sparse.from_spmatrix(y_trans, columns=y_column_name)
    
    model = Logit(y_trans, X_trans).fit_regularized()
    summary = model.summary()
    
    return summary

In [16]:
summary = logit_summary(X_train, y_train, numerical_features_list, categorical_features_list)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.32951023878563057
            Iterations: 554
            Function evaluations: 557
            Gradient evaluations: 554


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,26048.0
Model:,Logit,Df Residuals:,25965.0
Method:,MLE,Df Model:,82.0
Date:,"Tue, 04 Apr 2023",Pseudo R-squ.:,0.4041
Time:,19:50:16,Log-Likelihood:,-8583.1
converged:,True,LL-Null:,-14404.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2484,0.010,25.017,0.000,0.229,0.268
stand_scaler__Age,0.2655,0.024,11.159,0.000,0.219,0.312
stand_scaler__final weight,0.0617,0.020,3.101,0.002,0.023,0.101
stand_scaler__Capital Gain,2.3128,0.085,27.137,0.000,2.146,2.480
stand_scaler__Capital Loss,0.2501,0.016,15.298,0.000,0.218,0.282
onehot__Workclass_ Federal-gov,0.7230,,,,,
onehot__Workclass_ Local-gov,0.0039,,,,,
onehot__Workclass_ Never-worked,-2.5138,8.086,-0.311,0.756,-18.361,13.333
onehot__Workclass_ Private,0.1913,,,,,


## Same model, but without 'final weight'

As we remember from EDA, **'final weight'** feature did not pass the significance border. Let's try to remove it and check the performance

In [17]:
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [18]:
get_LR_performance(X_train, y_train, numerical_features_list, categorical_features_list)

f1 score: mean = 0.78 | std = 0.0
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90     19753
        >50K       0.73      0.59      0.65      6295

    accuracy                           0.85     26048
   macro avg       0.80      0.76      0.78     26048
weighted avg       0.84      0.85      0.84     26048



#### Let's check if features' significances have changed

In [19]:
summary = logit_summary(X_train, y_train, numerical_features_list, categorical_features_list)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.32969247486066705
            Iterations: 593
            Function evaluations: 596
            Gradient evaluations: 593


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,26048.0
Model:,Logit,Df Residuals:,25966.0
Method:,MLE,Df Model:,81.0
Date:,"Tue, 04 Apr 2023",Pseudo R-squ.:,0.4038
Time:,19:50:30,Log-Likelihood:,-8587.8
converged:,True,LL-Null:,-14404.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2482,0.010,25.000,0.000,0.229,0.268
stand_scaler__Age,0.2614,0.024,11.008,0.000,0.215,0.308
stand_scaler__Capital Gain,2.3125,0.085,27.147,0.000,2.146,2.479
stand_scaler__Capital Loss,0.2496,0.016,15.283,0.000,0.218,0.282
onehot__Workclass_ Federal-gov,0.6451,,,,,
onehot__Workclass_ Local-gov,-0.0701,,,,,
onehot__Workclass_ Never-worked,-3.3088,11.757,-0.281,0.778,-26.352,19.734
onehot__Workclass_ Private,0.1183,,,,,
onehot__Workclass_ Self-emp-inc,0.4674,,,,,


According to Logit() results, all of numerical features are statistically significant. Some categoties in a couple of categotical features have to be clustered as they are insignificant. 
1. Workclasses representatives, that do not work or work without pay will deffinetely have less than 50k, so can become one cluster.
2. From Marital Status feature we can cluster together those, who have no partner and those who have
3. All categories of Occupation are insignificant, so could be removed from the model. But before, they will be left like this, as from the EDA we saw that this feature is significant
3. All categories of Relationship, Ethnic Group and Sex features are significant.
4. Most of countries have no impact on target, it's possible to cluster them by the part of the world. 

In [20]:
# feature_names = second.named_steps["columntransformer"].get_feature_names_out()
# coefs = second.named_steps["logisticregression"].coef_.flatten()
# zipped = zip(feature_names, coefs)
# df = pd.DataFrame(zipped, columns=["feature", "value"])
# df["abs_value"] = df["value"].apply(lambda x: abs(x))
# df["colors"] = df["value"].apply(lambda x: "green" if x > 0 else "red")
# df = df.sort_values("abs_value", ascending=False)

# fig, ax = plt.subplots(1, 1, figsize=(7, 12))
# sns.barplot(x="value",
#             y="feature",
#             data=df,
#            palette=df["colors"])
# ax.set_yticklabels(ax.get_yticklabels(), fontsize=7)
# ax.set_title("Top Features", fontsize=20)
# ax.set_ylabel("Feature Name", fontsize=15)
# ax.set_xlabel("Coef", fontsize=15)

## Clustering categories of features

In [None]:
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [22]:
X_clustered = X_train.copy()
X_clustered['Workclass'] = X_clustered['Workclass'].replace({' Without-pay': 'Non profitable', 
                                                             ' Never-worked': 'Non profitable', 
                                                             ' Self-emp-not-inc': 'Non profitable', 
                                                             ' Federal-gov': 'Government',  
                                                             ' State-gov': 'Government', 
                                                             ' Local-gov': 'Government'})
X_clustered['Relationship'] = X_clustered['Relationship'].replace({' Unmarried': ' Not-in-family', 
                                                                   ' Husband': ' Not-in-family',  
                                                                   ' Wife': ' Not-in-family', 
                                                                   ' Own-child': ' Not-in-family'})
X_clustered['Marital Status'] = X_clustered['Marital Status'].replace({' Widowed': 'Single',  
                                                                       ' Separated': 'Single',
                                                                       ' Married-spouse-absent': 'Single', 
                                                                       ' Never-married': 'Single', 
                                                                       ' Divorced': 'Single', 
                                                                       ' Married-civ-spouse': 'Married', 
                                                                       ' Married-AF-spouse': 'Married'})
X_clustered['Country'] = X_clustered['Country'].replace({' Mexico': 'Americas', 
                                                           ' Canada': 'Americas', 
                                                           ' Puerto-Rico': 'Americas', 
                                                           ' El-Salvador': 'Americas', 
                                                           ' Cuba': 'Americas', 
                                                           ' Jamaica': 'Americas', 
                                                           ' Dominican-Republic': 'Americas', 
                                                           ' Guatemala': 'Americas', 
                                                           ' Columbia': 'Americas', 
                                                           ' Haiti': 'Americas', 
                                                           ' Peru': 'Americas', 
                                                           ' Nicaragua': 'Americas', 
                                                           ' Ecuador': 'Americas', 
                                                           ' Trinadad&Tobago': 'Americas', 
                                                           ' Outlying-US(Guam-USVI-etc)': 'Americas', 
                                                           ' Honduras': 'Americas', 
                                                           ' India': 'Asia', 
                                                           ' China': 'Asia', 
                                                           ' South': 'Asia', 
                                                           ' Vietnam': 'Asia', 
                                                           ' Japan': 'Asia',
                                                           ' Taiwan': 'Asia', 
                                                           ' Thailand': 'Asia', 
                                                           ' Laos': 'Asia', 
                                                           ' Hong': 'Asia',
                                                           ' Philippines': 'Asia', 
                                                           ' Iran': 'Asia',
                                                           ' Cambodia': 'Asia',
                                                           ' Germany': 'Europe',
                                                           ' England': 'Europe', 
                                                           ' Italy': 'Europe',
                                                           ' Poland': 'Europe', 
                                                           ' Portugal': 'Europe', 
                                                           ' France': 'Europe', 
                                                           ' Greece': 'Europe', 
                                                           ' Ireland': 'Europe', 
                                                           ' Yugoslavia': 'Europe',
                                                           ' Hungary': 'Europe',
                                                           ' Scotland': 'Europe'})

In [23]:
X_clustered.head()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
28093,25,Private,131178,HS-grad,Single,Handlers-cleaners,Not-in-family,Black,Male,0,0,23,United-States
23915,24,?,311949,HS-grad,Single,?,Not-in-family,Asian-Pac-Islander,Female,0,0,45,?
652,35,Private,292472,Doctorate,Married,Prof-specialty,Not-in-family,Asian-Pac-Islander,Male,0,0,40,Asia
18684,59,Self-emp-inc,169982,Some-college,Married,Exec-managerial,Not-in-family,White,Male,15024,0,60,United-States
9477,19,?,369527,Some-college,Single,?,Not-in-family,White,Female,0,0,40,United-States


### Let's now apply the pipeline to updated dataset

In [24]:
get_LR_performance(X_clustered, y_train, numerical_features_list, categorical_features_list)

f1 score: mean = 0.77 | std = 0.0
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90     19753
        >50K       0.73      0.59      0.65      6295

    accuracy                           0.85     26048
   macro avg       0.80      0.76      0.77     26048
weighted avg       0.84      0.85      0.84     26048



**The performance for the minority class is now worse than before, so we can make a conclusion, that clustering categories in this way does not perform well**

Let's try to cluster 'Country' in another way

In [25]:
numerical_features_list = ['Age', 'final weight', 'Capital Gain', 'Capital Loss']
categorical_features_list = ['Marital Status', 'Occupation', 'Workclass',
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [26]:
X_clustered = X_train.copy()
X_clustered['Workclass'] = X_clustered['Workclass'].replace({' Without-pay': 'Non profitable', 
                                                             ' Never-worked': 'Non profitable', 
                                                             ' Self-emp-not-inc': 'Non profitable', 
                                                             ' Federal-gov': 'Government',  
                                                             ' State-gov': 'Government', 
                                                             ' Local-gov': 'Government'})
X_clustered['Relationship'] = X_clustered['Relationship'].replace({' Unmarried': ' Not-in-family', 
                                                                   ' Husband': ' Not-in-family',  
                                                                   ' Wife': ' Not-in-family', 
                                                                   ' Own-child': ' Not-in-family'})
X_clustered['Marital Status'] = X_clustered['Marital Status'].replace({' Widowed': 'Single',  
                                                                       ' Separated': 'Single',
                                                                       ' Married-spouse-absent': 'Single', 
                                                                       ' Never-married': 'Single', 
                                                                       ' Divorced': 'Single', 
                                                                       ' Married-civ-spouse': 'Married', 
                                                                       ' Married-AF-spouse': 'Married'})
X_clustered['Country'] = X_clustered['Country'].replace({' United-States': 'Developed', 
                                                         ' Mexico': 'Developing', 
                                                           ' Canada': 'Developed', 
                                                           ' Puerto-Rico': 'Developed', 
                                                           ' El-Salvador': 'Developing', 
                                                           ' Cuba': 'Developing', 
                                                           ' Jamaica': 'Developing', 
                                                           ' Dominican-Republic': 'Developing', 
                                                           ' Guatemala': 'Developing', 
                                                           ' Columbia': 'Developing', 
                                                           ' Haiti': 'Developing', 
                                                           ' Peru': 'Developing', 
                                                           ' Nicaragua': 'Developing', 
                                                           ' Ecuador': 'Developing', 
                                                           ' Trinadad&Tobago': 'Developing', 
                                                           ' Outlying-US(Guam-USVI-etc)': 'Developing', 
                                                           ' Honduras': 'Developing', 
                                                           ' India': 'Developing', 
                                                           ' China': 'Developing', 
                                                           ' South': 'Developing', 
                                                           ' Vietnam': 'Developing', 
                                                           ' Japan': 'Developed',
                                                           ' Taiwan': 'Developed', 
                                                           ' Thailand': 'Developing', 
                                                           ' Laos': 'Developing', 
                                                           ' Hong': 'Developed',
                                                           ' Philippines': 'Developing', 
                                                           ' Iran': 'Developing',
                                                           ' Cambodia': 'Developing',
                                                           ' Germany': 'Developed',
                                                           ' England': 'Developed', 
                                                           ' Italy': 'Developed',
                                                           ' Poland': 'Developing', 
                                                           ' Portugal': 'Developing', 
                                                           ' France': 'Developed', 
                                                           ' Greece': 'Developing', 
                                                           ' Ireland': 'Developing', 
                                                           ' Yugoslavia': 'Developing',
                                                           ' Hungary': 'Developing',
                                                           ' Scotland': 'Developed'})

In [27]:
get_LR_performance(X_clustered, y_train, numerical_features_list, categorical_features_list)

f1 score: mean = 0.77 | std = 0.01
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90     19753
        >50K       0.73      0.59      0.65      6295

    accuracy                           0.85     26048
   macro avg       0.80      0.76      0.78     26048
weighted avg       0.84      0.85      0.84     26048



In [28]:
summary = logit_summary(X_clustered, y_train, numerical_features_list, categorical_features_list)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.33716225090191854
            Iterations: 254
            Function evaluations: 257
            Gradient evaluations: 254


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,26048.0
Model:,Logit,Df Residuals:,26016.0
Method:,MLE,Df Model:,31.0
Date:,"Tue, 04 Apr 2023",Pseudo R-squ.:,0.3903
Time:,19:50:41,Log-Likelihood:,-8782.4
converged:,True,LL-Null:,-14404.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.2242,0.009,24.406,0.000,0.206,0.242
stand_scaler__Age,0.3088,0.022,14.137,0.000,0.266,0.352
stand_scaler__final weight,0.0687,0.020,3.500,0.000,0.030,0.107
stand_scaler__Capital Gain,2.3460,0.084,27.845,0.000,2.181,2.511
stand_scaler__Capital Loss,0.2598,0.016,15.969,0.000,0.228,0.292
onehot__Marital Status_Single,-2.4729,0.051,-48.404,0.000,-2.573,-2.373
onehot__Occupation_ Adm-clerical,0.1201,2.604,0.046,0.963,-4.984,5.224
onehot__Occupation_ Armed-Forces,-2.7487,5.011,-0.548,0.583,-12.571,7.074
onehot__Occupation_ Craft-repair,0.1239,2.604,0.048,0.962,-4.979,5.227


All categories of Workclass and Occupation variables are still insignifficant. Let's try to apply 'label encoder' to them instead of 'one hot encoder'

In [29]:
categorical_features_list = ['Marital Status','Relationship', 'Ethnic group', 'Country', 'Sex']
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss', 'Occupation', 'Workclass']

In [30]:
X_clustered = X_train.copy()

le  = LabelEncoder()

X_clustered['Workclass'] = le.fit_transform(X_clustered['Workclass'])
X_clustered['Occupation'] = le.fit_transform(X_clustered['Occupation'])

X_clustered['Relationship'] = X_clustered['Relationship'].replace({' Unmarried': ' Not-in-family', 
                                                                   ' Husband': ' Not-in-family',  
                                                                   ' Wife': ' Not-in-family', 
                                                                   ' Own-child': ' Not-in-family'})
X_clustered['Marital Status'] = X_clustered['Marital Status'].replace({' Widowed': 'Single',  
                                                                       ' Separated': 'Single',
                                                                       ' Married-spouse-absent': 'Single', 
                                                                       ' Never-married': 'Single', 
                                                                       ' Divorced': 'Single', 
                                                                       ' Married-civ-spouse': 'Married', 
                                                                       ' Married-AF-spouse': 'Married'})
X_clustered['Country'] = X_clustered['Country'].replace({' United-States': 'Developed', 
                                                         ' Mexico': 'Developing', 
                                                           ' Canada': 'Developed', 
                                                           ' Puerto-Rico': 'Developed', 
                                                           ' El-Salvador': 'Developing', 
                                                           ' Cuba': 'Developing', 
                                                           ' Jamaica': 'Developing', 
                                                           ' Dominican-Republic': 'Developing', 
                                                           ' Guatemala': 'Developing', 
                                                           ' Columbia': 'Developing', 
                                                           ' Haiti': 'Developing', 
                                                           ' Peru': 'Developing', 
                                                           ' Nicaragua': 'Developing', 
                                                           ' Ecuador': 'Developing', 
                                                           ' Trinadad&Tobago': 'Developing', 
                                                           ' Outlying-US(Guam-USVI-etc)': 'Developing', 
                                                           ' Honduras': 'Developing', 
                                                           ' India': 'Developing', 
                                                           ' China': 'Developing', 
                                                           ' South': 'Developing', 
                                                           ' Vietnam': 'Developing', 
                                                           ' Japan': 'Developed',
                                                           ' Taiwan': 'Developed', 
                                                           ' Thailand': 'Developing', 
                                                           ' Laos': 'Developing', 
                                                           ' Hong': 'Developed',
                                                           ' Philippines': 'Developing', 
                                                           ' Iran': 'Developing',
                                                           ' Cambodia': 'Developing',
                                                           ' Germany': 'Developed',
                                                           ' England': 'Developed', 
                                                           ' Italy': 'Developed',
                                                           ' Poland': 'Developing', 
                                                           ' Portugal': 'Developing', 
                                                           ' France': 'Developed', 
                                                           ' Greece': 'Developing', 
                                                           ' Ireland': 'Developing', 
                                                           ' Yugoslavia': 'Developing',
                                                           ' Hungary': 'Developing',
                                                           ' Scotland': 'Developed'})

In [31]:
X_clustered.sample()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
13685,24,4,246207,Bachelors,Single,7,Not-in-family,Black,Female,0,0,40,Developed


In [32]:
get_LR_performance(X_clustered, y_train, numerical_features_list, categorical_features_list)

f1 score: mean = 0.76 | std = 0.01
              precision    recall  f1-score   support

       <=50K       0.87      0.93      0.90     19753
        >50K       0.72      0.55      0.63      6295

    accuracy                           0.84     26048
   macro avg       0.79      0.74      0.76     26048
weighted avg       0.83      0.84      0.83     26048



In [33]:
summary = logit_summary(X_clustered, y_train, numerical_features_list, categorical_features_list)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3520276602630395
            Iterations: 102
            Function evaluations: 104
            Gradient evaluations: 102


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,26048.0
Model:,Logit,Df Residuals:,26033.0
Method:,MLE,Df Model:,14.0
Date:,"Tue, 04 Apr 2023",Pseudo R-squ.:,0.3634
Time:,19:50:46,Log-Likelihood:,-9169.6
converged:,True,LL-Null:,-14404.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.3107,0.008,39.434,0.000,0.295,0.326
stand_scaler__Age,0.3099,0.021,15.057,0.000,0.270,0.350
stand_scaler__Capital Gain,2.3649,0.082,28.860,0.000,2.204,2.526
stand_scaler__Capital Loss,0.2658,0.016,16.765,0.000,0.235,0.297
stand_scaler__Occupation,0.0860,0.019,4.525,0.000,0.049,0.123
stand_scaler__Workclass,-0.0060,0.019,-0.315,0.753,-0.043,0.031
onehot__Marital Status_Single,-2.4766,0.050,-49.788,0.000,-2.574,-2.379
onehot__Relationship_ Other-relative,-1.1590,0.229,-5.065,0.000,-1.607,-0.711
onehot__Ethnic group_ Asian-Pac-Islander,-1.9269,0.161,-11.937,0.000,-2.243,-1.610


'Workclass' feature is still insignifficant. Also 'Sex' feature stopped being significant

In [34]:
categorical_features_list = ['Marital Status','Relationship', 'Ethnic group', 'Country']
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss', 'Occupation']

In [35]:
get_LR_performance(X_clustered, y_train, numerical_features_list, categorical_features_list)

f1 score: mean = 0.76 | std = 0.0
              precision    recall  f1-score   support

       <=50K       0.87      0.93      0.90     19753
        >50K       0.72      0.56      0.63      6295

    accuracy                           0.84     26048
   macro avg       0.79      0.74      0.76     26048
weighted avg       0.83      0.84      0.83     26048



In [36]:
summary = logit_summary(X_clustered, y_train, numerical_features_list, categorical_features_list)
summary

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.35205040430465173
            Iterations: 92
            Function evaluations: 95
            Gradient evaluations: 92


0,1,2,3
Dep. Variable:,Income_ >50K,No. Observations:,26048.0
Model:,Logit,Df Residuals:,26035.0
Method:,MLE,Df Model:,12.0
Date:,"Tue, 04 Apr 2023",Pseudo R-squ.:,0.3634
Time:,19:50:51,Log-Likelihood:,-9170.2
converged:,True,LL-Null:,-14404.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ordinal__Education,0.3110,0.008,39.485,0.000,0.296,0.326
stand_scaler__Age,0.3109,0.021,15.136,0.000,0.271,0.351
stand_scaler__Capital Gain,2.3652,0.082,28.863,0.000,2.205,2.526
stand_scaler__Capital Loss,0.2660,0.016,16.769,0.000,0.235,0.297
stand_scaler__Occupation,0.0862,0.019,4.629,0.000,0.050,0.123
onehot__Marital Status_Single,-2.4964,0.046,-54.513,0.000,-2.586,-2.407
onehot__Relationship_ Other-relative,-1.1656,0.229,-5.092,0.000,-1.614,-0.717
onehot__Ethnic group_ Asian-Pac-Islander,-1.8970,0.159,-11.951,0.000,-2.208,-1.586
onehot__Ethnic group_ Black,-2.2717,0.133,-17.106,0.000,-2.532,-2.011


### All features are now significant

## Same model, but with Capital paremeters logged

In [37]:
# X_train, y_train = reset_xy(X, y)

In [38]:
# def capital_log():
#     X_train['Capital Gain'] = np.log(1+ X_train['Capital Gain'])
#     X_train['Capital Loss'] = np.log(1+ X_train['Capital Loss'])
    
# capital_log()

In [39]:
# num_no_capital = X_train[['Age', 'Hours per week']]
# scale_numerical(num_no_capital)

In [40]:
# X_train.head(2)

In [41]:
# encode_edu()
# X_train = dumm_categorical(categorical_features_df, X_train)

In [42]:
# X_train.head()

In [43]:
# model = sm.Logit(y_train, X_train).fit()
# print(model.summary())

In [44]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))
# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [45]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))

## Logistic regression with previous data transformation

In [46]:
# X_train, y_train = reset_xy(X, y)

For this model all parameters, that have huge imbalance in their values, are changed to binary with categories:
1. Most popular value or the feature
2. All other values together

In [47]:
# def balance_predictors():
#     X_train['Ethnic group'] = np.where(X_train['Ethnic group'] != ' White', 'Other', X_train['Ethnic group'])
#     X_train['Country'] = np.where(X_train['Country'] != ' United-States', 'Other', X_train['Country'])
#     X_train['Workclass'] = np.where(X_train['Workclass'] != ' Private', 'Other', X_train['Workclass'])
#     X_train['Marital Status'] = np.where(((X_train['Marital Status'] == ' Widowed') |
#                                           (X_train['Marital Status'] == ' Married-spouse-absent') |
#                                           (X_train['Marital Status'] == ' Separated')), 
#                                          'Other', X_train['Marital Status'])
#     X_train['Occupation'] = np.where(((X_train['Occupation'] == ' Adm-clerical') |
#                                       (X_train['Occupation'] == ' Armed-Forces') |
#                                       (X_train['Occupation'] == ' Craft-repair') |
#                                       (X_train['Occupation'] == ' Machine-op-inspct') |
#                                       (X_train['Occupation'] == ' Priv-house-serv') |
#                                       (X_train['Occupation'] == ' Transport-moving')), 
#                                      'Other', X_train['Occupation'])
    
# balance_predictors()
# X_train.sample(3)

In [48]:
# encode_edu()

# capital_log()

# scale_numerical(num_no_fw)

In [49]:
# X_train = dumm_categorical(categorical_features_df, X_train)

In [50]:
# model = sm.Logit(y_train, X_train).fit()
# print(model.summary())

In [51]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))

# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [52]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))

## Oversampling target

In [53]:
# count_class_0, count_class_1 = df['Income'].value_counts()

# y0 = df[df['Income'] == 0]
# y1 = df[df['Income'] == 1]

In [54]:
# print(y0.shape)
# print(y1.shape)

In [55]:
# y1_over = y1.sample(count_class_0, replace = True)

In [56]:
# df = pd.concat([y0, y1_over], axis = 0)

In [57]:
# df.shape

In [58]:
# X, y = reset_xy(df)

In [59]:
# balance_predictors()

In [60]:
# encode_edu()

In [61]:
# capital_log()

In [62]:
# num_no_capital = X[['Age', 'Hours per week']]
# scale_numerical(num_no_capital)

In [63]:
# categorical_features_df = X[['Workclass', 'Marital Status', 'Relationship', 'Ethnic group', 'Country']]
# X = dumm_categorical(categorical_features_df, X)

In [64]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)
# X_train.sample(3)

In [65]:
# model = sm.Logit(y_train, X_train).fit()
# print(model.summary())

In [66]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))

# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [67]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))