In [69]:
import pandas as pd
import numpy as np
from numpy.random import RandomState
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer, precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn_pandas import DataFrameMapper
import os

In [70]:
rs = RandomState(130917)

In [71]:
adult = pd.read_csv('data/Dataset.data',header=None, delimiter=r"\s+",)
# adult.info()
adult.columns = ('age', 'workclass', 'fnlwgt', 'education', 'education_num',
                'marital_statue', 'occupation', 'relationship', 'race',
                'sex','capital_gain', 'capital_loss','hours_per_week',
                'native_country','wage')
adult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8530 entries, 0 to 8529
Data columns (total 15 columns):
age               8530 non-null int64
workclass         8530 non-null object
fnlwgt            8530 non-null int64
education         8530 non-null object
education_num     8530 non-null int64
marital_statue    8529 non-null object
occupation        8529 non-null object
relationship      8529 non-null object
race              8529 non-null object
sex               8529 non-null object
capital_gain      8529 non-null float64
capital_loss      8529 non-null float64
hours_per_week    8529 non-null float64
native_country    8529 non-null object
wage              8529 non-null object
dtypes: float64(3), int64(3), object(9)
memory usage: 999.7+ KB


In [55]:
adult.isnull().values.any()
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_statue,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K


In [56]:
adult['wage'] = adult['wage'].map({'<=50K':-1,'>50K':1})

In [57]:
adult.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_statue,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage
8525,23,Private,130959,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,2407.0,0.0,6.0,Canada,-1.0
8526,51,Private,158746,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,1902.0,60.0,United-States,1.0
8527,29,Private,498833,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0.0,0.0,40.0,Nicaragua,-1.0
8528,46,Private,193188,Masters,14,Never-married,Exec-managerial,Unmarried,White,Male,0.0,0.0,40.0,United-States,-1.0
8529,29,Self-emp-inc,136277,HS-grad,9,,,,,,,,,,


In [58]:
y_all = adult['wage']
adult.drop('wage',axis=1,inplace=True)

In [59]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_statue,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States


In [60]:
adult.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,8530.0,8530.0,8530.0,8529.0,8529.0,8529.0
mean,38.693552,188229.2,10.060492,1128.986165,87.251847,40.436042
std,13.80345,105158.9,2.574321,7903.709643,401.969239,12.429715
min,17.0,13769.0,1.0,0.0,0.0,1.0
25%,28.0,115677.0,9.0,0.0,0.0,40.0
50%,37.0,177981.5,10.0,0.0,0.0,40.0
75%,48.0,237481.8,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,3175.0,99.0


In [61]:
adult.capital_gain.value_counts()

0.0        7820
15024.0      86
7688.0       68
7298.0       64
99999.0      49
5013.0       28
5178.0       27
3103.0       26
4386.0       19
2174.0       17
3325.0       16
4650.0       12
27828.0      11
594.0        11
8614.0       11
10520.0      10
2202.0        8
6849.0        8
2354.0        7
4787.0        7
1055.0        7
3137.0        7
13550.0       7
3674.0        7
3908.0        6
2907.0        6
5455.0        6
1506.0        6
4416.0        6
2105.0        5
           ... 
11678.0       1
1731.0        1
1831.0        1
1264.0        1
5060.0        1
1424.0        1
3942.0        1
7896.0        1
6097.0        1
5556.0        1
3887.0        1
15831.0       1
34095.0       1
2993.0        1
2329.0        1
1409.0        1
41310.0       1
7978.0        1
1471.0        1
1455.0        1
25236.0       1
2635.0        1
914.0         1
9562.0        1
7262.0        1
3471.0        1
2346.0        1
3273.0        1
3781.0        1
2062.0        1
Name: capital_gain, Leng

In [19]:
adult.capital_loss.value_counts()

0.0       8134
1902.0      57
1887.0      42
1977.0      36
2415.0      14
1485.0      13
1876.0      12
1590.0      11
1848.0      10
1740.0       9
2444.0       8
2339.0       8
1719.0       8
1408.0       8
1504.0       7
2205.0       7
1602.0       7
1721.0       7
1741.0       7
1980.0       7
2002.0       6
2057.0       6
2001.0       6
1669.0       6
1672.0       5
1628.0       5
1579.0       5
1974.0       4
1762.0       4
2258.0       4
          ... 
1340.0       2
625.0        2
1092.0       2
2377.0       2
1510.0       2
2472.0       2
2231.0       2
2246.0       2
1726.0       1
1825.0       1
2603.0       1
2392.0       1
1411.0       1
3175.0       1
1617.0       1
2174.0       1
1651.0       1
2238.0       1
653.0        1
323.0        1
1735.0       1
1870.0       1
2547.0       1
2042.0       1
1138.0       1
1258.0       1
1944.0       1
2282.0       1
1911.0       1
2457.0       1
Name: capital_loss, Length: 69, dtype: int64

In [20]:
adult.drop('capital_gain', axis = 1, inplace=True)
adult.drop('capital_loss', axis = 1, inplace=True)

In [24]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_statue,occupation,relationship,race,sex,hours_per_week,native_country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,40.0,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,50.0,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,40.0,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,40.0,United-States
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,30.0,United-States


In [26]:
adult.workclass.unique()

array(['Private', 'Local-gov', '?', 'Self-emp-not-inc', 'Federal-gov',
       'State-gov', 'Self-emp-inc', 'Without-pay'], dtype=object)

In [29]:
adult.education.unique()

array(['11th', 'HS-grad', 'Assoc-acdm', 'Some-college', '10th',
       'Prof-school', '7th-8th', 'Bachelors', 'Masters', 'Doctorate',
       '5th-6th', 'Assoc-voc', '9th', '12th', '1st-4th', 'Preschool'],
      dtype=object)

In [30]:
adult.marital_statue.unique()

array(['Never-married', 'Married-civ-spouse', 'Widowed', 'Divorced',
       'Separated', 'Married-spouse-absent', 'Married-AF-spouse', nan],
      dtype=object)

In [31]:
adult.relationship.unique()

array(['Own-child', 'Husband', 'Not-in-family', 'Unmarried', 'Wife',
       'Other-relative', nan], dtype=object)

In [32]:
adult.race.unique()

array(['Black', 'White', 'Asian-Pac-Islander', 'Other',
       'Amer-Indian-Eskimo', nan], dtype=object)

In [33]:
adult.sex.unique()

array(['Male', 'Female', nan], dtype=object)

In [34]:
adult.native_country.unique()

array(['United-States', '?', 'Peru', 'Guatemala', 'Mexico',
       'Dominican-Republic', 'Ireland', 'Germany', 'Philippines',
       'Thailand', 'Haiti', 'El-Salvador', 'Puerto-Rico', 'Vietnam',
       'South', 'Columbia', 'Japan', 'India', 'Cambodia', 'Poland',
       'Laos', 'England', 'Cuba', 'Taiwan', 'Italy', 'Canada', 'Portugal',
       'China', 'Nicaragua', 'Honduras', 'Iran', 'Scotland', 'Jamaica',
       'Ecuador', 'Yugoslavia', 'Hungary', 'Hong', 'Greece',
       'Trinadad&Tobago', 'Outlying-US(Guam-USVI-etc)', 'France', nan],
      dtype=object)

In [62]:
len(adult.native_country.unique())

42

In [63]:
adult = pd.get_dummies(adult, columns=[
    "workclass", "education", "marital_statue", "occupation", "relationship",
    "race", "sex", "native_country",
])



In [72]:
adult.shape

(8530, 15)

In [73]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_statue,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K


In [75]:
pd.value_counts(pd.Series(y_all))

-1.0    6535
 1.0    1994
Name: wage, dtype: int64

In [86]:
adult.isnull().values.any()
adult = adult.dropna()
y_all.isnull().values.any()
y_all = y_all.dropna()

In [88]:
adult.isnull().values.any()

False

In [89]:
X_train, X_test, y_train, y_test = train_test_split(
    adult, y_all, test_size=0.25, stratify=y_all, random_state=rs,
)

In [97]:
standard_scaler_cols = ["age", "fnlwgt", "education_num", "hours_per_week",]
other_cols = list(set(adult.columns) - set(standard_scaler_cols))
mapper = DataFrameMapper(
    [([col,], StandardScaler(),) for col in standard_scaler_cols] +
    [(col, None,) for col in other_cols]
)

In [98]:
other_cols

['capital_gain',
 'marital_statue',
 'occupation',
 'relationship',
 'workclass',
 'education',
 'wage',
 'native_country',
 'race',
 'capital_loss',
 'sex']

In [99]:
clf = LogisticRegression(random_state=rs,)
pipeline = Pipeline([
    ("scale", mapper,),
    ("logit", clf,)
])

In [102]:
mapper

DataFrameMapper(default=False, df_out=False,
        features=[(['age'], StandardScaler(copy=True, with_mean=True, with_std=True)), (['fnlwgt'], StandardScaler(copy=True, with_mean=True, with_std=True)), (['education_num'], StandardScaler(copy=True, with_mean=True, with_std=True)), (['hours_per_week'], StandardScaler(copy=True, with_mean=True, with_st...e), ('wage', None), ('native_country', None), ('race', None), ('capital_loss', None), ('sex', None)],
        input_df=False, sparse=False)

In [103]:


strat_kfold = StratifiedKFold(10, random_state=rs,)
estimator = GridSearchCV(
    pipeline,
    param_grid={
        "logit__C": np.power(10, np.arange(-4.0, 5.0)),
        "logit__class_weight": ["balanced", None,],
    },
    scoring=make_scorer(roc_auc_score),
    cv=strat_kfold,
)

