In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix


In [2]:
data = pd.read_csv('adult.csv')
data

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [4]:
data.describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [5]:
def fix_workclass(row):
    if (row['workclass'] == '?'):
        return 'UnknownWC'
    else:
        return row['workclass']

        
data['workclass'] = data.apply(fix_workclass, axis=1)

oh_enc = OneHotEncoder()
ohe_workclass = oh_enc.fit_transform(data[['workclass']]).toarray()
ohe_workclass = pd.DataFrame(ohe_workclass, columns=oh_enc.categories_[0])

data = data.reset_index(drop=True)
data = data.join(ohe_workclass)

data.drop(['workclass'], axis=1, inplace=True)
data

Unnamed: 0,age,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,...,income,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,UnknownWC,Without-pay
0,25,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,...,<=50K,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,38,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,...,<=50K,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,28,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,...,>50K,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,44,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,...,>50K,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,18,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,...,<=50K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,...,<=50K,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
48838,40,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,...,>50K,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
48839,58,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,...,<=50K,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
48840,22,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,...,<=50K,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [6]:
oh_enc = OneHotEncoder()
ohe_education = oh_enc.fit_transform(data[['education']]).toarray()
ohe_education = pd.DataFrame(ohe_education, columns=oh_enc.categories_[0])

data = data.reset_index(drop=True)
data = data.join(ohe_education)

data.drop(['education'], axis=1, inplace=True)
data

Unnamed: 0,age,fnlwgt,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,...,9th,Assoc-acdm,Assoc-voc,Bachelors,Doctorate,HS-grad,Masters,Preschool,Prof-school,Some-college
0,25,226802,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,38,89814,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,28,336951,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,44,160323,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,18,103497,10,Never-married,?,Own-child,White,Female,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48838,40,154374,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
48839,58,151910,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
48840,22,201490,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [7]:
oh_enc = OneHotEncoder()
ohe_marital = oh_enc.fit_transform(data[['marital-status']]).toarray()
ohe_marital = pd.DataFrame(ohe_marital, columns=oh_enc.categories_[0])

data = data.reset_index(drop=True)
data = data.join(ohe_marital)

data.drop(['marital-status'], axis=1, inplace=True)
data

Unnamed: 0,age,fnlwgt,educational-num,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,...,Preschool,Prof-school,Some-college,Divorced,Married-AF-spouse,Married-civ-spouse,Married-spouse-absent,Never-married,Separated,Widowed
0,25,226802,7,Machine-op-inspct,Own-child,Black,Male,0,0,40,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,38,89814,9,Farming-fishing,Husband,White,Male,0,0,50,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,28,336951,12,Protective-serv,Husband,White,Male,0,0,40,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,44,160323,10,Machine-op-inspct,Husband,Black,Male,7688,0,40,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,18,103497,10,?,Own-child,White,Female,0,0,30,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,12,Tech-support,Wife,White,Female,0,0,38,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
48838,40,154374,9,Machine-op-inspct,Husband,White,Male,0,0,40,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
48839,58,151910,9,Adm-clerical,Unmarried,White,Female,0,0,40,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
48840,22,201490,9,Adm-clerical,Own-child,White,Male,0,0,20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [8]:
def fix_occupation(row):
    if (row['occupation'] == '?'):
        return 'UnknownOcc'
    else:
        return row['occupation']

        
data['occupation'] = data.apply(fix_occupation, axis=1)

oh_enc = OneHotEncoder()
ohe_occupation = oh_enc.fit_transform(data[['occupation']]).toarray()
ohe_occupation = pd.DataFrame(ohe_occupation, columns=oh_enc.categories_[0])

data = data.reset_index(drop=True)
data = data.join(ohe_occupation)

data.drop(['occupation'], axis=1, inplace=True)
data

Unnamed: 0,age,fnlwgt,educational-num,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,...,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving,UnknownOcc
0,25,226802,7,Own-child,Black,Male,0,0,40,United-States,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,38,89814,9,Husband,White,Male,0,0,50,United-States,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,28,336951,12,Husband,White,Male,0,0,40,United-States,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,44,160323,10,Husband,Black,Male,7688,0,40,United-States,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,18,103497,10,Own-child,White,Female,0,0,30,United-States,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,12,Wife,White,Female,0,0,38,United-States,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48838,40,154374,9,Husband,White,Male,0,0,40,United-States,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48839,58,151910,9,Unmarried,White,Female,0,0,40,United-States,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48840,22,201490,9,Own-child,White,Male,0,0,20,United-States,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
oh_enc = OneHotEncoder()
ohe_relationship = oh_enc.fit_transform(data[['relationship']]).toarray()
ohe_relationship = pd.DataFrame(ohe_relationship, columns=oh_enc.categories_[0])

data = data.reset_index(drop=True)
data = data.join(ohe_relationship)

data.drop(['relationship'], axis=1, inplace=True)
data

Unnamed: 0,age,fnlwgt,educational-num,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income,...,Sales,Tech-support,Transport-moving,UnknownOcc,Husband,Not-in-family,Other-relative,Own-child,Unmarried,Wife
0,25,226802,7,Black,Male,0,0,40,United-States,<=50K,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,38,89814,9,White,Male,0,0,50,United-States,<=50K,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,28,336951,12,White,Male,0,0,40,United-States,>50K,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,44,160323,10,Black,Male,7688,0,40,United-States,>50K,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,18,103497,10,White,Female,0,0,30,United-States,<=50K,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,12,White,Female,0,0,38,United-States,<=50K,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
48838,40,154374,9,White,Male,0,0,40,United-States,>50K,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
48839,58,151910,9,White,Female,0,0,40,United-States,<=50K,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
48840,22,201490,9,White,Male,0,0,20,United-States,<=50K,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [10]:
oh_enc = OneHotEncoder()
ohe_race = oh_enc.fit_transform(data[['race']]).toarray()
ohe_race = pd.DataFrame(ohe_race, columns=oh_enc.categories_[0])

data = data.reset_index(drop=True)
data = data.join(ohe_race)

data.drop(['race'], axis=1, inplace=True)
data

Unnamed: 0,age,fnlwgt,educational-num,gender,capital-gain,capital-loss,hours-per-week,native-country,income,Federal-gov,...,Not-in-family,Other-relative,Own-child,Unmarried,Wife,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
0,25,226802,7,Male,0,0,40,United-States,<=50K,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,38,89814,9,Male,0,0,50,United-States,<=50K,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,28,336951,12,Male,0,0,40,United-States,>50K,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,44,160323,10,Male,7688,0,40,United-States,>50K,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,18,103497,10,Female,0,0,30,United-States,<=50K,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,12,Female,0,0,38,United-States,<=50K,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
48838,40,154374,9,Male,0,0,40,United-States,>50K,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
48839,58,151910,9,Female,0,0,40,United-States,<=50K,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
48840,22,201490,9,Male,0,0,20,United-States,<=50K,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
oh_enc = OneHotEncoder()
ohe_gender = oh_enc.fit_transform(data[['gender']]).toarray()
ohe_gender = pd.DataFrame(ohe_gender, columns=oh_enc.categories_[0])

data = data.reset_index(drop=True)
data = data.join(ohe_gender)

data.drop(['gender'], axis=1, inplace=True)
data

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,native-country,income,Federal-gov,Local-gov,...,Own-child,Unmarried,Wife,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White,Female,Male
0,25,226802,7,0,0,40,United-States,<=50K,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,38,89814,9,0,0,50,United-States,<=50K,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,28,336951,12,0,0,40,United-States,>50K,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,44,160323,10,7688,0,40,United-States,>50K,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,18,103497,10,0,0,30,United-States,<=50K,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,12,0,0,38,United-States,<=50K,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
48838,40,154374,9,0,0,40,United-States,>50K,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
48839,58,151910,9,0,0,40,United-States,<=50K,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
48840,22,201490,9,0,0,20,United-States,<=50K,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [12]:
latam = ['Mexico', 'Puerto-Rico', 'El-Salvador', 'Cuba', 'Jamaica', 'Dominican-Republic', 'Guatemala', 'Columbia', 'Haiti', 'Nicaragua', 'Peru', 'Ecuador', 'Trinadad&Tobago', 'Honduras']
aspac = ['Philippines', 'India', 'China', 'Japan', 'Vietnam', 'Taiwan', 'Hong', 'Thailand', 'Cambodia', 'Laos', 'Iran']
w_europe = ['Germany', 'England', 'Italy', 'Portugal', 'France', 'Ireland', 'Scotland', 'Holand-Netherlands']
e_europe = ['Poland', 'Hungary', 'Greece', 'Yugoslavia']

def get_region(row):
    if row['native-country'] in latam:
        return 'LatAmerica'
    elif row['native-country'] in aspac:
        return 'Asia'
    elif row['native-country'] in w_europe:
        return 'WestEurope'
    elif row['native-country'] in e_europe:
        return 'EastEurope'
    elif row['native-country'] == '?':
        return 'UnknownReg'
    elif row['native-country'] == 'United-States':
        return 'US'
    elif row['native-country'] == 'Canada':
        return 'Canada'
    elif row['native-country'] == 'Outlying-US(Guam-USVI-etc)':
        return 'US'
    elif row['native-country'] == 'South':
        return 'South'

data['region'] = data.apply(get_region, axis=1)
data['region'].value_counts()

US            43855
LatAmerica     2072
Asia            981
UnknownReg      857
WestEurope      602
Canada          182
EastEurope      178
South           115
Name: region, dtype: int64

In [13]:
oh_enc = OneHotEncoder()
ohe_region = oh_enc.fit_transform(data[['region']]).toarray()
ohe_region = pd.DataFrame(ohe_region, columns=oh_enc.categories_[0])

data = data.reset_index(drop=True)
data = data.join(ohe_region)

data.drop(['native-country'], axis=1, inplace=True)
data.drop(['region'], axis=1, inplace=True)

data

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,Federal-gov,Local-gov,Never-worked,...,Female,Male,Asia,Canada,EastEurope,LatAmerica,South,US,UnknownReg,WestEurope
0,25,226802,7,0,0,40,<=50K,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,38,89814,9,0,0,50,<=50K,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,28,336951,12,0,0,40,>50K,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,44,160323,10,7688,0,40,>50K,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,18,103497,10,0,0,30,<=50K,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,12,0,0,38,<=50K,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48838,40,154374,9,0,0,40,>50K,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48839,58,151910,9,0,0,40,<=50K,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48840,22,201490,9,0,0,20,<=50K,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [14]:
data['income'].value_counts()

def get_income(row):
    if row['income'] == '>50K':
        return 1
    else:
        return 0
data['high_income'] = data.apply(get_income, axis=1)
data.drop(['income'], axis=1, inplace=True)
data
    

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,Federal-gov,Local-gov,Never-worked,Private,...,Male,Asia,Canada,EastEurope,LatAmerica,South,US,UnknownReg,WestEurope,high_income
0,25,226802,7,0,0,40,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1,38,89814,9,0,0,50,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,28,336951,12,0,0,40,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
3,44,160323,10,7688,0,40,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
4,18,103497,10,0,0,30,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,12,0,0,38,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
48838,40,154374,9,0,0,40,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
48839,58,151910,9,0,0,40,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
48840,22,201490,9,0,0,20,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [15]:
data.drop(['fnlwgt'], axis=1, inplace=True)
data.drop(['capital-gain'], axis=1, inplace=True)
data.drop(['capital-loss'], axis=1, inplace=True)



X = data.drop('high_income', axis=1)  
y = data['high_income'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 2020, stratify=y)

In [16]:
model = LogisticRegression()

model.fit(X, y)
print(model.score(X_train, y_train))

0.8351809061394015


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [17]:
y_pred = model.predict(X_test)

conf_matrix_baseline = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ['actual 0', 'actual 1'], columns = ['predicted 0', 'predicted 1'])
display(conf_matrix_baseline)
display('Logistic Regression recall score', recall_score(y_test, y_pred))        

Unnamed: 0,predicted 0,predicted 1
actual 0,10311,836
actual 1,1556,1950


'Logistic Regression recall score'

0.5561893896177981

In [18]:
from sklearn import svm

model_SVM = svm.SVC(kernel='linear')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 2020, stratify=y)

In [19]:
model_SVM.fit(X, y)
print(model_SVM.score(X_train, y_train))

0.8352101553131124


In [25]:
y_pred = model_SVM.predict(X_test)

conf_matrix_SVM = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ['actual 0', 'actual 1'], columns = ['predicted 0', 'predicted 1'])
display(conf_matrix_SVM)
display('SVM recall score', recall_score(y_test, y_pred))        

Unnamed: 0,predicted 0,predicted 1
actual 0,10374,773
actual 1,1618,1888


'SVM recall score'

0.5385054192812322