In [51]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

Here the results from the 'Adult EDA' notebook are going to be used.

In [52]:
# %run "Adult EDA.ipynb"

In [53]:
# Column labels for the Adult data file, in file order. They are handed to
# ``read_csv`` below because the file is loaded with ``header=None``.
adult_columns = [
    "Age", "Workclass", "final weight", "Education", "Education-Num",
    "Marital Status", "Occupation", "Relationship", "Ethnic group", "Sex",
    "Capital Gain", "Capital Loss", "Hours per week", "Country", "Income",
]

# Load the raw data straight from the UCI repository.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    names=adult_columns,
)

In [54]:
df = df.drop(['Education-Num'], axis = 'columns')

## 1. Features preprocessing

First, all variables have to be transformed to a numerical format so that they can be fed to the Logit function:

In [55]:
# Encode the target as 0 (<=50K) / 1 (>50K).
# The result is assigned back to the column instead of calling
# ``replace(..., inplace=True)`` on ``df['Income']``: the in-place form acts on
# an intermediate Series and does not update ``df`` once pandas Copy-on-Write
# is active. ``map`` with a dict also yields a numeric column directly.
# The keys keep their leading space because that is how the labels appear in
# the loaded data. Any label missing from the mapping becomes NaN.
df['Income'] = df['Income'].map({' <=50K': 0,
                                 ' >50K': 1})

In [56]:
numerical_features_list = ['Age', 'Capital Gain', 'Capital Loss', 'Hours per week', 'final weight']
numerical_features_df = df[numerical_features_list]
numerical_features_df

Unnamed: 0,Age,Capital Gain,Capital Loss,Hours per week,final weight
0,39,2174,0,40,77516
1,50,0,0,13,83311
2,38,0,0,40,215646
3,53,0,0,40,234721
4,28,0,0,40,338409
...,...,...,...,...,...
32556,27,0,0,38,257302
32557,40,0,0,40,154374
32558,58,0,0,40,151910
32559,22,0,0,20,201490


In [57]:
X_train = df.drop(['Income'], axis = 'columns')
y_train = df['Income']

In [58]:
stand_scaler = StandardScaler()

In [59]:
def scale_numerical(numerical_features, target=None, scaler=None):
    """Standardise numeric columns and write them into a feature frame in place.

    Parameters
    ----------
    numerical_features : pd.DataFrame
        Frame holding the numeric columns to scale. Its column names select
        the columns that are overwritten in ``target``. Values are written by
        position, so it must have as many rows as ``target``.
    target : pd.DataFrame, optional
        Frame that receives the scaled columns. Defaults to the notebook-level
        ``X_train``, which keeps existing one-argument calls working.
    scaler : object with ``fit_transform``, optional
        Scaler to fit and apply. Defaults to the notebook-level
        ``stand_scaler``.

    Returns
    -------
    None
        ``target`` is modified in place.

    Notes
    -----
    The scaler is re-fitted on every call. The calls visible in this notebook
    happen before ``train_test_split``, so rows that later end up in the test
    set contribute to the scaling statistics.
    """
    if target is None:
        target = X_train
    if scaler is None:
        scaler = stand_scaler

    # Fit on the raw values first, then overwrite the same-named columns.
    scaled_values = scaler.fit_transform(numerical_features.values)
    target[list(numerical_features.columns)] = scaled_values
    
scale_numerical(numerical_features_df)

In [60]:
X_train.sample()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
28426,-0.775768,Private,-0.070436,HS-grad,Never-married,Other-service,Not-in-family,Black,Male,-0.14592,-0.21666,-0.035429,United-States


In this dataset there is only one feature where the order of the categories matters: Education. Let's transform it using OrdinalEncoder.

In [61]:
edu_encoder = OrdinalEncoder(categories=[[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                      ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                      ' Bachelors',' Masters',' Prof-school',' Doctorate']])

In [62]:
def encode_edu():
    """Ordinal-encode ``Education`` and store the codes in ``X_train``.

    Fits the notebook-level ``edu_encoder`` on the ``Education`` column of the
    notebook-level ``df`` and writes the resulting codes into the ``Education``
    column of the notebook-level ``X_train`` in place. Takes no arguments and
    returns ``None``.
    """
    education_codes = edu_encoder.fit_transform(df[['Education']])
    X_train[['Education']] = education_codes
    
encode_edu()

For the rest of the categorical features the order does not matter, hence we can one-hot (dummy) encode them.

In [63]:
categorical_features_df = X_train[['Workclass', 'Marital Status', 'Occupation', 'Relationship', 'Ethnic group', 'Sex', 'Country']]

In [64]:
def dumm_categorical(categolical_df: pd.DataFrame, x_df:pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the categorical columns of ``x_df``.

    Parameters
    ----------
    categolical_df : pd.DataFrame
        Only its column names are used: they select which columns of ``x_df``
        are replaced by dummy columns.
    x_df : pd.DataFrame
        Feature frame to encode. It is not modified.

    Returns
    -------
    pd.DataFrame
        The untouched columns of ``x_df`` in their original order, followed by
        the dummy columns of each encoded feature (named ``<column>_<value>``),
        with the first level of every feature dropped.

    Raises
    ------
    KeyError
        If a column named in ``categolical_df`` is missing from ``x_df``.
    """
    dummy_columns = list(categolical_df)
    if not dummy_columns:
        # Nothing to encode: hand the frame back unchanged.
        return x_df

    # A single call replaces the per-column drop/concat loop. The dtype is
    # pinned to an integer type because newer pandas versions default to bool
    # dummies, which downstream numeric model fitting does not accept.
    return pd.get_dummies(
        x_df,
        columns=dummy_columns,
        drop_first=True,
        dtype=np.uint8,
    )
X_train = dumm_categorical(categorical_features_df, X_train)
X_train.sample()

Unnamed: 0,Age,final weight,Education,Capital Gain,Capital Loss,Hours per week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,...,Country_ Portugal,Country_ Puerto-Rico,Country_ Scotland,Country_ South,Country_ Taiwan,Country_ Thailand,Country_ Trinadad&Tobago,Country_ United-States,Country_ Vietnam,Country_ Yugoslavia
6611,2.01011,-1.086179,8.0,-0.14592,-0.21666,-1.250276,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [65]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that the split,
# and every result derived from it, is reproducible across kernel restarts.
X_tr, X_test, y_tr, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [66]:
X_tr.shape

(26048, 85)

In [67]:
y_tr.shape

(26048,)

## Initial model without changes in data

In [68]:
model = sm.Logit(y_tr, X_tr).fit()
print(model.summary())

         Current function value: 0.318321
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                 Income   No. Observations:                26048
Model:                          Logit   Df Residuals:                    25964
Method:                           MLE   Df Model:                           83
Date:                Wed, 29 Mar 2023   Pseudo R-squ.:                  0.4199
Time:                        22:08:32   Log-Likelihood:                -8291.6
converged:                      False   LL-Null:                       -14293.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                            coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
Age                                       0.3289      0.025     13.290      0.000       0.



In [None]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))

# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [None]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))

According to the Logit regression results, there are two statistically insignificant features: final weight and occupation. Let's remove them for the following models. (Note that the next section drops only 'final weight'.)

## Same model, but without 'final weight'

In [45]:
def reset_xy (df):
    """Split a frame into predictors and target.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing at least the ``Income`` and ``final weight`` columns.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the ``Income`` and
        ``final weight`` columns and ``y`` is the ``Income`` column.
    """
    y = df['Income']
    X = df.drop(columns=['Income', 'final weight'])
    return X, y

In [46]:
X_train, y_train = reset_xy(df)

In [47]:
num_no_fw = X_train[['Age', 'Capital Gain', 'Capital Loss', 'Hours per week']]
scale_numerical(num_no_fw)

In [48]:
encode_edu()
X_train = dumm_categorical(categorical_features_df, X_train)

In [49]:
# Hold out 20% of the rows, with a fixed seed so the split is reproducible.
# NOTE: this rebinds X_train / y_train to the training portion, so re-running
# this cell without re-running the cells above would split already-split data.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train

Unnamed: 0,Age,Education,Capital Gain,Capital Loss,Hours per week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,...,Country_ Portugal,Country_ Puerto-Rico,Country_ Scotland,Country_ South,Country_ Taiwan,Country_ Thailand,Country_ Trinadad&Tobago,Country_ United-States,Country_ Vietnam,Country_ Yugoslavia
6666,1.423610,6.0,-0.14592,-0.21666,0.612489,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
9078,-1.142331,11.0,-0.14592,-0.21666,-0.035429,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
18988,-0.922393,8.0,-0.14592,-0.21666,-0.845327,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
10681,1.350297,5.0,-0.14592,-0.21666,-0.035429,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
30777,-1.435581,9.0,-0.14592,-0.21666,-2.627102,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13030,1.130359,8.0,-0.14592,-0.21666,-0.035429,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3261,-0.922393,8.0,-0.14592,-0.21666,-0.035429,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
27544,-1.142331,9.0,-0.14592,-0.21666,0.531499,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2854,-0.849080,9.0,-0.14592,-0.21666,-0.035429,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [50]:
model = sm.Logit(y_train, X_train).fit()
print(model.summary())

         Current function value: 0.320364
         Iterations: 35


LinAlgError: Singular matrix

In [None]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))

# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [None]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))

The model still performs poorly in minority class detection.

## Same model, but with Capital parameters log-transformed

In [None]:
X_train, y_train = reset_xy(df)

In [None]:
def capital_log(frame=None):
    """Replace the capital columns with ``log(1 + x)`` in place.

    Parameters
    ----------
    frame : pd.DataFrame, optional
        Frame whose ``Capital Gain`` and ``Capital Loss`` columns are
        transformed. Defaults to the notebook-level ``X_train``, which keeps
        existing zero-argument calls working.

    Returns
    -------
    None
        ``frame`` is modified in place.

    Notes
    -----
    The transform is not idempotent: calling it twice on the same frame
    applies the logarithm twice.
    """
    if frame is None:
        frame = X_train

    for column in ('Capital Gain', 'Capital Loss'):
        # log1p computes log(1 + x) without losing precision for small x.
        frame[column] = np.log1p(frame[column])
    
capital_log()

In [None]:
num_no_capital = X_train[['Age', 'Hours per week']]
scale_numerical(num_no_capital)

In [None]:
X_train.head(2)

In [None]:
encode_edu()
X_train = dumm_categorical(categorical_features_df, X_train)

In [None]:
X_train.head()

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)

In [None]:
model = sm.Logit(y_train, X_train).fit()
print(model.summary())

In [None]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))
# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [None]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))

## Logistic regression with previous data transformation

In [None]:
X_train, y_train = reset_xy(df)

For this model, all features whose values are heavily imbalanced are converted to binary features with two categories:
1. The most frequent value of the feature
2. All other values together

In [None]:
def balance_predictors(frame=None):
    """Collapse three categorical features to "dominant value vs. Other".

    For ``Ethnic group``, ``Country`` and ``Workclass``, every value other
    than the one listed below is replaced by the string ``'Other'``.

    Parameters
    ----------
    frame : pd.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``X_train``, which
        keeps existing zero-argument calls working.

    Returns
    -------
    None
        ``frame`` is modified in place.
    """
    if frame is None:
        frame = X_train

    # Value to keep per column; the leading space matches the labels as they
    # are compared against in the data.
    kept_value_by_column = {
        'Ethnic group': ' White',
        'Country': ' United-States',
        'Workclass': ' Private',
    }
    for column, kept_value in kept_value_by_column.items():
        frame[column] = np.where(frame[column] != kept_value, 'Other', frame[column])
    
balance_predictors()
X_train.sample(3)

In [None]:
encode_edu()

capital_log()

# Take the numeric columns from the current X_train rather than reusing
# ``num_no_fw`` from the earlier section: that frame holds the raw values, so
# scaling it would overwrite the log-transformed capital columns above and
# make this cell depend on stale state.
scale_numerical(X_train[['Age', 'Capital Gain', 'Capital Loss', 'Hours per week']])

In [None]:
X_train.sample()

In [None]:
X_train = dumm_categorical(categorical_features_df, X_train)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)

In [None]:
model = sm.Logit(y_train, X_train).fit()
print(model.summary())

In [None]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))

# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [None]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))

## Oversampling target

In [None]:
# count_class_0, count_class_1 = df['Income'].value_counts()

# y0 = df[df['Income'] == 0]
# y1 = df[df['Income'] == 1]

In [None]:
# print(y0.shape)
# print(y1.shape)

In [None]:
# y1_over = y1.sample(count_class_0, replace = True)

In [None]:
# df = pd.concat([y0, y1_over], axis = 0)

In [None]:
# df.shape

In [None]:
# X, y = reset_xy(df)

In [None]:
# balance_predictors()

In [None]:
# encode_edu()

In [None]:
# capital_log()

In [None]:
# num_no_capital = X[['Age', 'Hours per week']]
# scale_numerical(num_no_capital)

In [None]:
# categorical_features_df = X[['Workclass', 'Marital Status', 'Relationship', 'Ethnic group', 'Country']]
# X = dumm_categorical(categorical_features_df, X)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)
# X_train.sample(3)

In [None]:
# model = sm.Logit(y_train, X_train).fit()
# print(model.summary())

In [None]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))

# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [None]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))