In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.utils import shuffle

Here the results from 'Adult EDA" file are going to be used

In [None]:
# %run "Adult EDA.ipynb"

In [50]:
adult_columns = [
    "Age",
    "Workclass",
    "final weight",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Ethnic group",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "Income",
]

df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", 
                 header = None, names = adult_columns)

In [51]:
df = df.drop(['Education-Num'], axis = 'columns')

In [52]:
_ = df['Income'].replace({' <=50K': 0, 
                             ' >50K': 1}, 
                            inplace = True)

In [70]:
X_df = df.drop(['Income'], axis = 'columns')
y_df = df['Income']

In [71]:
X, X_test, y, y_test = train_test_split(X_df, y_df, test_size= 0.2)

## 1. Features preprocessing

First all variables have to be transformed to numerical format to feed them to Logit function:

In [72]:
X_train = X.copy()
y_train = y.copy()

In [73]:
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss', 'Hours per week', 'final weight']
numerical_features_df = X_train[numerical_features_list]
numerical_features_df

Unnamed: 0,Age,Capital Gain,Capital Loss,Hours per week,final weight
29900,38,0,0,65,91039
415,35,0,0,50,92440
21771,27,0,0,50,399904
31137,35,0,0,50,185366
29980,20,0,0,55,203027
...,...,...,...,...,...
16947,34,4064,0,40,34848
30691,20,0,0,40,182342
26072,24,0,0,30,180931
22745,37,2105,0,50,154641


In [74]:
stand_scaler = StandardScaler()

In [75]:
def scale_numerical(numerical_features):
    X_train[numerical_features.columns.values] = stand_scaler.fit_transform(numerical_features.values)
    
scale_numerical(numerical_features_df)

In [76]:
X_train.sample()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
18197,2.07318,Local-gov,0.637326,HS-grad,Divorced,Protective-serv,Not-in-family,Black,Male,-0.145898,-0.21595,-1.648511,United-States


In this dataset we have only one feature, where the order matters - Education. Let's transform it using OrdinalEncoder. 

In [77]:
edu_encoder = OrdinalEncoder(categories=[[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                      ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                      ' Bachelors',' Masters',' Prof-school',' Doctorate']])

In [78]:
def encode_edu():
    X_train[['Education']] = edu_encoder.fit_transform(X_train[['Education']])
    
encode_edu()

For all the rest of categorical features the order does not matter, hense we can apply 'dummies' method to them

In [79]:
categorical_features_df = X_train[['Workclass', 'Marital Status', 'Occupation', 'Relationship', 'Ethnic group', 'Sex', 'Country']]

In [80]:
def dumm_categorical(categolical_df: pd.DataFrame, x_df:pd.DataFrame) -> pd.DataFrame:
    for col in categolical_df:
        dum_df = pd.get_dummies(x_df[col], drop_first=True, columns = x_df[col].unique(), prefix=col)
        x_df = x_df.drop(col, axis = 'columns')
        x_df = pd.concat([x_df, dum_df], axis=1, join='inner')
    return x_df
X_train = dumm_categorical(categorical_features_df, X_train)
X_train.sample()

Unnamed: 0,Age,final weight,Education,Capital Gain,Capital Loss,Hours per week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,...,Country_ Portugal,Country_ Puerto-Rico,Country_ Scotland,Country_ South,Country_ Taiwan,Country_ Thailand,Country_ Trinadad&Tobago,Country_ United-States,Country_ Vietnam,Country_ Yugoslavia
397,-1.36008,-0.501548,8.0,-0.145898,-0.21595,-0.436865,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [81]:
# X_train = shuffle(X_train)
# X_train.head()

In [82]:
X_train.head()

Unnamed: 0,Age,final weight,Education,Capital Gain,Capital Loss,Hours per week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,...,Country_ Portugal,Country_ Puerto-Rico,Country_ Scotland,Country_ South,Country_ Taiwan,Country_ Thailand,Country_ Trinadad&Tobago,Country_ United-States,Country_ Vietnam,Country_ Yugoslavia
29900,-0.045215,-0.942353,13.0,-0.145898,-0.21595,1.986428,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
415,-0.264359,-0.929003,7.0,-0.145898,-0.21595,0.774781,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
21771,-0.848744,2.000714,8.0,-0.145898,-0.21595,0.774781,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31137,-0.264359,-0.043544,12.0,-0.145898,-0.21595,0.774781,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
29980,-1.36008,0.124742,8.0,-0.145898,-0.21595,1.178663,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [83]:
y_train.head()

29900    1
415      1
21771    0
31137    0
29980    0
Name: Income, dtype: int64

## Initial model without changes in data

In [84]:
model = sm.Logit(y_train, X_train).fit()
print(model.summary())

         Current function value: 0.320336
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                 Income   No. Observations:                26048
Model:                          Logit   Df Residuals:                    25964
Method:                           MLE   Df Model:                           83
Date:                Wed, 29 Mar 2023   Pseudo R-squ.:                  0.4186
Time:                        23:23:34   Log-Likelihood:                -8344.1
converged:                      False   LL-Null:                       -14353.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                            coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
Age                                       0.3362      0.025     13.625      0.000       0.



In [None]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))

# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [None]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))

According to Logit Regression Results there are 2 statistically insignificant features: final weight and occupation. Let's remove them for the following models

## Same model, but without 'final weight'

In [85]:
def reset_xy (X, y):
    X_train = X.copy()
    y_train = y.copy()
    X_train = X_train.drop(['final weight'], axis = 'columns')
    return X_train, y_train

In [86]:
X_train, y_train = reset_xy(X, y)

In [87]:
num_no_fw = X_train[['Age', 'Capital Gain', 'Capital Loss', 'Hours per week']]
scale_numerical(num_no_fw)

In [88]:
encode_edu()
X_train = dumm_categorical(categorical_features_df, X_train)

In [89]:
model = sm.Logit(y_train, X_train).fit()
print(model.summary())

         Current function value: 0.320499
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                 Income   No. Observations:                26048
Model:                          Logit   Df Residuals:                    25965
Method:                           MLE   Df Model:                           82
Date:                Wed, 29 Mar 2023   Pseudo R-squ.:                  0.4183
Time:                        23:26:38   Log-Likelihood:                -8348.3
converged:                      False   LL-Null:                       -14353.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                            coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
Age                                       0.3320      0.025     13.485      0.000       0.



In [None]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))

# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [None]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))

The model still performs san in minority class detection

## Same model, but with Capital paremeters logged

In [90]:
X_train, y_train = reset_xy(X, y)

In [91]:
def capital_log():
    X_train['Capital Gain'] = np.log(1+ X_train['Capital Gain'])
    X_train['Capital Loss'] = np.log(1+ X_train['Capital Loss'])
    
capital_log()

In [92]:
num_no_capital = X_train[['Age', 'Hours per week']]
scale_numerical(num_no_capital)

In [93]:
X_train.head(2)

Unnamed: 0,Age,Workclass,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
29900,-0.045215,Private,Masters,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,1.986428,United-States
415,-0.264359,Private,12th,Divorced,Craft-repair,Not-in-family,White,Male,0.0,0.0,0.774781,United-States


In [94]:
encode_edu()
X_train = dumm_categorical(categorical_features_df, X_train)

In [95]:
X_train.head()

Unnamed: 0,Age,Education,Capital Gain,Capital Loss,Hours per week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,...,Country_ Portugal,Country_ Puerto-Rico,Country_ Scotland,Country_ South,Country_ Taiwan,Country_ Thailand,Country_ Trinadad&Tobago,Country_ United-States,Country_ Vietnam,Country_ Yugoslavia
29900,-0.045215,13.0,0.0,0.0,1.986428,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
415,-0.264359,7.0,0.0,0.0,0.774781,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
21771,-0.848744,8.0,0.0,0.0,0.774781,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
31137,-0.264359,12.0,0.0,0.0,0.774781,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
29980,-1.36008,8.0,0.0,0.0,1.178663,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [96]:
model = sm.Logit(y_train, X_train).fit()
print(model.summary())

         Current function value: 0.332342
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                 Income   No. Observations:                26048
Model:                          Logit   Df Residuals:                    25965
Method:                           MLE   Df Model:                           82
Date:                Wed, 29 Mar 2023   Pseudo R-squ.:                  0.3969
Time:                        23:27:28   Log-Likelihood:                -8656.9
converged:                      False   LL-Null:                       -14353.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                            coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
Age                                       0.3381      0.024     13.987      0.000       0.



In [None]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))
# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [None]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))

## Logistic regression with previous data transformation

In [97]:
X_train, y_train = reset_xy(X, y)

For this modela all parameters, that have huge imbalance in their values, are changes to binary with categories:
1. Most popular alue or the feature
2. All other values together

In [98]:
def balance_predictors():
    X_train['Ethnic group'] = np.where(X_train['Ethnic group'] != ' White', 'Other', X_train['Ethnic group'])
    X_train['Country'] = np.where(X_train['Country'] != ' United-States', 'Other', X_train['Country'])
    X_train['Workclass'] = np.where(X_train['Workclass'] != ' Private', 'Other', X_train['Workclass'])
    
balance_predictors()
X_train.sample(3)

Unnamed: 0,Age,Workclass,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
4416,21,Private,Some-college,Never-married,Adm-clerical,Own-child,White,Female,0,0,10,United-States
19947,34,Private,HS-grad,Divorced,Exec-managerial,Unmarried,White,Female,2977,0,50,United-States
18612,30,Private,Bachelors,Widowed,Prof-specialty,Unmarried,White,Female,0,0,55,United-States


In [99]:
encode_edu()

capital_log()

scale_numerical(num_no_fw)

In [100]:
X_train.sample()

Unnamed: 0,Age,Workclass,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
22269,0.246978,Private,12.0,Married-civ-spouse,Prof-specialty,Wife,White,Female,-0.145898,-0.21595,1.178663,United-States


In [101]:
X_train = dumm_categorical(categorical_features_df, X_train)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)

In [102]:
model = sm.Logit(y_train, X_train).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.323733
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:                 Income   No. Observations:                26048
Model:                          Logit   Df Residuals:                    26014
Method:                           MLE   Df Model:                           33
Date:                Wed, 29 Mar 2023   Pseudo R-squ.:                  0.4125
Time:                        23:28:02   Log-Likelihood:                -8432.6
converged:                       True   LL-Null:                       -14353.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                            coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
Age                                       0.3369     

In [None]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))

# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [None]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))

## Oversampling target

In [None]:
# count_class_0, count_class_1 = df['Income'].value_counts()

# y0 = df[df['Income'] == 0]
# y1 = df[df['Income'] == 1]

In [None]:
# print(y0.shape)
# print(y1.shape)

In [None]:
# y1_over = y1.sample(count_class_0, replace = True)

In [None]:
# df = pd.concat([y0, y1_over], axis = 0)

In [None]:
# df.shape

In [None]:
# X, y = reset_xy(df)

In [None]:
# balance_predictors()

In [None]:
# encode_edu()

In [None]:
# capital_log()

In [None]:
# num_no_capital = X[['Age', 'Hours per week']]
# scale_numerical(num_no_capital)

In [None]:
# categorical_features_df = X[['Workclass', 'Marital Status', 'Relationship', 'Ethnic group', 'Country']]
# X = dumm_categorical(categorical_features_df, X)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)
# X_train.sample(3)

In [None]:
# model = sm.Logit(y_train, X_train).fit()
# print(model.summary())

In [None]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))

# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [None]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))