In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from statsmodels.api import Logit
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.linear_model import LogisticRegression
from sklearn import set_config

Here the results from 'Adult EDA" file are going to be used

In [2]:
# %run "Adult EDA.ipynb"

In [3]:
adult_columns = [
    "Age",
    "Workclass",
    "final weight",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Ethnic group",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "Income",
]

df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", 
                 header = None, names = adult_columns)

In [4]:
df = df.drop(['Education-Num'], axis = 'columns')

In [5]:
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 
                             'Relationship', 'Ethnic group', 'Sex', 'Country']

In [6]:
for col in categorical_features_list:
    print(f'{col}\n{df[col].value_counts()}\n')

Workclass
 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: Workclass, dtype: int64

Marital Status
 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: Marital Status, dtype: int64

Occupation
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: Occupation, dtype: 

As in the whole dataset (X) we have only one 'Holand-Netherlands' value in 'Country' column, we have to process it separately, because in case if it appears in the test set, model will not be able to predict target for such a record. For the initial model, where there are no changes in data, this observation will be removed

In [7]:
print(df.shape)
df.drop(df.loc[df['Country']==' Holand-Netherlands'].index, inplace=True)
print(df.shape)

(32561, 14)
(32560, 14)


In [8]:
X_df = df.drop(['Income'], axis = 'columns')
y_df = df['Income']

In [9]:
X, X_test, y, y_test = train_test_split(X_df, y_df, test_size= 0.2)

## 1. Features preprocessing

First all variables have to be transformed to numerical format to feed them to Logit function:

In [10]:
X_train = X.copy()
y_train = y.copy()

In [11]:
X_train.head()

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country
14553,37,Self-emp-not-inc,218249,11th,Divorced,Prof-specialty,Unmarried,Black,Female,0,0,30,United-States
30006,43,State-gov,24763,Masters,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1887,45,United-States
20464,57,Self-emp-not-inc,275943,7th-8th,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,?
16225,57,Private,136107,9th,Married-civ-spouse,Craft-repair,Husband,Black,Male,0,0,40,United-States
12961,37,Private,117567,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,60,United-States


In this dataset we have only one feature, where the order matters - Education. Let's transform it using OrdinalEncoder. For all the rest of categorical features the order does not matter, hense we can apply OneHotEncoder() to them

## Initial model without changes in data

In [12]:
numerical_features_list = ['Age', 'final weight', 'Capital Gain', 'Capital Loss']

In [13]:
full_pipe = make_pipeline(ColumnTransformer(transformers = [
    ('ordinal', OrdinalEncoder(categories=[[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                      ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                      ' Bachelors',' Masters',' Prof-school',' Doctorate']]),
     make_column_selector(pattern = 'Education')),
    ('stand scaler', StandardScaler(), numerical_features_list),
    ('onehot', OneHotEncoder(dtype='int', drop='first'), categorical_features_list)],
    remainder='drop'),
                     LogisticRegression(max_iter=500))

In [14]:
set_config(display="diagram")
full_pipe

In [15]:
_ = full_pipe.fit(X_train, y_train)

In [16]:
y_pred = full_pipe.predict(X_train)
y_pred

array([' <=50K', ' >50K', ' <=50K', ..., ' <=50K', ' <=50K', ' <=50K'],
      dtype=object)

In [17]:
scores = cross_val_score(full_pipe, X_train, y_train, cv=5, scoring='f1_macro')

In [18]:
print(f"f1 score: mean = {round(np.mean(scores),2)} | std = {round(np.std(scores),2)}")

f1 score: mean = 0.78 | std = 0.01


In [19]:
print(classification_report(y_train, y_pred, target_names=y_train.unique()))

              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90     19767
        >50K       0.73      0.60      0.66      6281

    accuracy                           0.85     26048
   macro avg       0.81      0.76      0.78     26048
weighted avg       0.84      0.85      0.84     26048



## Same model, but without 'final weight'

In [20]:
numerical_features_list = ['Capital Gain', 'Capital Loss']

In [27]:
# Do I have to initialize the same code every time? When I don't run this code cell, cross_val_score 
# gives me the same result independently from that, which columns I include to numerical_features_list and to 
# categorical_features_list

full_pipe = make_pipeline(ColumnTransformer(transformers = [
    ('ordinal', OrdinalEncoder(categories=[[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',
                                      ' 12th',' HS-grad',' Some-college',' Assoc-voc',' Assoc-acdm', 
                                      ' Bachelors',' Masters',' Prof-school',' Doctorate']]),
     make_column_selector(pattern = 'Education')),
    ('stand scaler', StandardScaler(), numerical_features_list),
    ('onehot', OneHotEncoder(dtype='int', drop='first'), categorical_features_list)],
    remainder='drop'),
                     LogisticRegression(max_iter=500))

In [28]:
_ = full_pipe.fit(X_train, y_train)

In [29]:
y_pred = full_pipe.predict(X_train)
y_pred

array([' <=50K', ' >50K', ' <=50K', ..., ' <=50K', ' <=50K', ' <=50K'],
      dtype=object)

In [30]:
scores = cross_val_score(full_pipe, X_train, y_train, cv=5, scoring='f1_macro')

In [31]:
print(f"f1 score: mean = {round(np.mean(scores),2)} | std = {round(np.std(scores),2)}")

f1 score: mean = 0.78 | std = 0.01


In [32]:
print(classification_report(y_train, y_pred, target_names=y_train.unique()))

              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90     19767
        >50K       0.73      0.59      0.65      6281

    accuracy                           0.85     26048
   macro avg       0.80      0.76      0.78     26048
weighted avg       0.84      0.85      0.84     26048



In [None]:
# def reset_xy (X, y):
#     X_train = X.copy()
#     y_train = y.copy()
#     X_train = X_train.drop(['final weight'], axis = 'columns')
#     return X_train, y_train

In [None]:
# X_train, y_train = reset_xy(X, y)

In [None]:
# num_no_fw = X_train[['Age', 'Capital Gain', 'Capital Loss', 'Hours per week']]
# scale_numerical(num_no_fw)

In [None]:
# encode_edu()
# X_train = dumm_categorical(categorical_features_df, X_train)

In [None]:
# model = sm.Logit(y_train, X_train).fit()
# print(model.summary())

In [None]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))

# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [None]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))

The model still performs san in minority class detection

## Same model, but with Capital paremeters logged

In [None]:
# X_train, y_train = reset_xy(X, y)

In [None]:
# def capital_log():
#     X_train['Capital Gain'] = np.log(1+ X_train['Capital Gain'])
#     X_train['Capital Loss'] = np.log(1+ X_train['Capital Loss'])
    
# capital_log()

In [None]:
# num_no_capital = X_train[['Age', 'Hours per week']]
# scale_numerical(num_no_capital)

In [None]:
# X_train.head(2)

In [None]:
# encode_edu()
# X_train = dumm_categorical(categorical_features_df, X_train)

In [None]:
# X_train.head()

In [None]:
# model = sm.Logit(y_train, X_train).fit()
# print(model.summary())

In [None]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))
# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [None]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))

## Logistic regression with previous data transformation

In [None]:
# X_train, y_train = reset_xy(X, y)

For this modela all parameters, that have huge imbalance in their values, are changes to binary with categories:
1. Most popular alue or the feature
2. All other values together

In [None]:
# def balance_predictors():
#     X_train['Ethnic group'] = np.where(X_train['Ethnic group'] != ' White', 'Other', X_train['Ethnic group'])
#     X_train['Country'] = np.where(X_train['Country'] != ' United-States', 'Other', X_train['Country'])
#     X_train['Workclass'] = np.where(X_train['Workclass'] != ' Private', 'Other', X_train['Workclass'])
#     X_train['Marital Status'] = np.where(((X_train['Marital Status'] == ' Widowed') |
#                                           (X_train['Marital Status'] == ' Married-spouse-absent') |
#                                           (X_train['Marital Status'] == ' Separated')), 
#                                          'Other', X_train['Marital Status'])
#     X_train['Occupation'] = np.where(((X_train['Occupation'] == ' Adm-clerical') |
#                                       (X_train['Occupation'] == ' Armed-Forces') |
#                                       (X_train['Occupation'] == ' Craft-repair') |
#                                       (X_train['Occupation'] == ' Machine-op-inspct') |
#                                       (X_train['Occupation'] == ' Priv-house-serv') |
#                                       (X_train['Occupation'] == ' Transport-moving')), 
#                                      'Other', X_train['Occupation'])
    
# balance_predictors()
# X_train.sample(3)

In [None]:
# encode_edu()

# capital_log()

# scale_numerical(num_no_fw)

In [None]:
# X_train = dumm_categorical(categorical_features_df, X_train)

In [None]:
# model = sm.Logit(y_train, X_train).fit()
# print(model.summary())

In [None]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))

# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [None]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))

## Oversampling target

In [None]:
# count_class_0, count_class_1 = df['Income'].value_counts()

# y0 = df[df['Income'] == 0]
# y1 = df[df['Income'] == 1]

In [None]:
# print(y0.shape)
# print(y1.shape)

In [None]:
# y1_over = y1.sample(count_class_0, replace = True)

In [None]:
# df = pd.concat([y0, y1_over], axis = 0)

In [None]:
# df.shape

In [None]:
# X, y = reset_xy(df)

In [None]:
# balance_predictors()

In [None]:
# encode_edu()

In [None]:
# capital_log()

In [None]:
# num_no_capital = X[['Age', 'Hours per week']]
# scale_numerical(num_no_capital)

In [None]:
# categorical_features_df = X[['Workclass', 'Marital Status', 'Relationship', 'Ethnic group', 'Country']]
# X = dumm_categorical(categorical_features_df, X)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)
# X_train.sample(3)

In [None]:
# model = sm.Logit(y_train, X_train).fit()
# print(model.summary())

In [None]:
# y_hat = model.predict(X_test)
# prediction = list(map(round, y_hat))

# print('Actual values', list(y_test.values)[:10])
# print('Predictions :', prediction[:10])

In [None]:
# cm = confusion_matrix(y_test, prediction) 
# s = sns.heatmap(cm, annot = True)
# s.set(xlabel='Predict', ylabel='Truth')
# print(cm)
# print(classification_report(y_test, prediction))