In [1]:
import pandas as pd
from io import StringIO
import sys

# https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
df = pd.read_csv('adult.data', header=None)

In [2]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [3]:
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship',
              'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class-label']

In [4]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class-label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null object
class-label       32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
class-label       0
dtype: int64

In [7]:
import numpy as np

In [8]:
np.unique(df['class-label'])

array([' <=50K', ' >50K'], dtype=object)

In [9]:
class_label_mapping = {' <=50K': -1, ' >50K': 1}

In [10]:
df['class-label'] = df['class-label'].map(class_label_mapping)

In [11]:
df['class-label'].values

array([-1, -1, -1, ..., -1, -1,  1], dtype=int64)

In [12]:
y = df['class-label'].values
df.drop('class-label', axis=1, inplace=True,)

In [13]:
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [14]:
df['capital-gain'].value_counts()

0        29849
15024      347
7688       284
7298       246
99999      159
5178        97
3103        97
4386        70
5013        69
8614        55
3325        53
2174        48
10520       43
4064        42
4650        41
14084       41
20051       37
3137        37
27828       34
594         34
3908        32
2829        31
13550       27
6849        27
14344       26
1055        25
2885        24
3411        24
4787        23
2176        23
         ...  
9562         4
1086         4
25124        4
1424         3
2961         3
7896         3
4687         3
2936         3
1173         3
5721         3
2009         3
6360         3
41310        2
6723         2
3456         2
2993         2
401          2
11678        2
2062         2
18481        2
7978         1
1639         1
2538         1
2387         1
5060         1
4931         1
1455         1
6097         1
22040        1
1111         1
Name: capital-gain, Length: 119, dtype: int64

In [15]:
df['capital-loss'].value_counts()

0       31042
1902      202
1977      168
1887      159
1848       51
1485       51
2415       49
1602       47
1740       42
1590       40
1876       39
1672       34
1564       25
2258       25
1669       24
1741       24
2001       24
1980       23
1719       22
2002       21
2051       21
1408       21
1579       20
2377       20
1721       18
1504       18
1974       18
2339       17
2179       15
1628       15
        ...  
323         3
4356        3
2267        3
3683        2
1755        2
2352        2
1648        2
1138        2
810         2
1735        2
2238        2
2754        2
3004        2
3900        2
974         2
2149        2
1816        2
3770        2
2080        1
2489        1
2282        1
2163        1
155         1
2467        1
1844        1
1411        1
1539        1
2472        1
1944        1
2201        1
Name: capital-loss, Length: 92, dtype: int64

In [16]:
# df.drop('capital-gain', axis=1, inplace=True)
# df.drop('capital-loss', axis=1, inplace=True)

In [17]:
df['age'] = df['age'].astype(float)
df['fnlwgt'] = df['fnlwgt'].astype(float)
df['education-num'] = df['education-num'].astype(float)
df['hours-per-week'] = df['hours-per-week'].astype(float)

In [18]:
df.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
32556,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States
32557,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States
32558,58.0,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States
32559,22.0,Private,201490.0,HS-grad,9.0,Never-married,Adm-clerical,Own-child,White,Male,0,0,20.0,United-States
32560,52.0,Self-emp-inc,287927.0,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40.0,United-States


In [19]:
df['workclass'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [20]:
df['education'].unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)

In [21]:
df['marital-status'].unique()

array([' Never-married', ' Married-civ-spouse', ' Divorced',
       ' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
       ' Widowed'], dtype=object)

In [22]:
df['occupation'].unique()

array([' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
       ' Prof-specialty', ' Other-service', ' Sales', ' Craft-repair',
       ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
       ' Tech-support', ' ?', ' Protective-serv', ' Armed-Forces',
       ' Priv-house-serv'], dtype=object)

In [23]:
df['relationship'].unique()

array([' Not-in-family', ' Husband', ' Wife', ' Own-child', ' Unmarried',
       ' Other-relative'], dtype=object)

In [24]:
df['race'].unique()

array([' White', ' Black', ' Asian-Pac-Islander', ' Amer-Indian-Eskimo',
       ' Other'], dtype=object)

In [25]:
df['sex'].unique()

array([' Male', ' Female'], dtype=object)

In [26]:
df['native-country'].unique()

array([' United-States', ' Cuba', ' Jamaica', ' India', ' ?', ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands'], dtype=object)

In [27]:
df = df.replace(' ?', 'NaN')

In [28]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40.0,United-States
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13.0,United-States
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40.0,United-States
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40.0,United-States
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40.0,Cuba
5,37.0,Private,284582.0,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40.0,United-States
6,49.0,Private,160187.0,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16.0,Jamaica
7,52.0,Self-emp-not-inc,209642.0,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45.0,United-States
8,31.0,Private,45781.0,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50.0,United-States
9,42.0,Private,159449.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40.0,United-States


In [29]:
df = pd.get_dummies(df, columns=[
    'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race',
    'sex', 'native-country',
])

In [30]:
df.shape

(32561, 108)

In [31]:
df

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,native-country_NaN
0,39.0,77516.0,13.0,2174,0,40.0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50.0,83311.0,13.0,0,0,13.0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38.0,215646.0,9.0,0,0,40.0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,53.0,234721.0,7.0,0,0,40.0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,28.0,338409.0,13.0,0,0,40.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,37.0,284582.0,14.0,0,0,40.0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
6,49.0,160187.0,5.0,0,0,16.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,52.0,209642.0,9.0,0,0,45.0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8,31.0,45781.0,14.0,14084,0,50.0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
9,42.0,159449.0,13.0,5178,0,40.0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [32]:
from sklearn.preprocessing import Imputer
imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
X = imputed_data



In [33]:
pd.value_counts(pd.Series(y))

-1    24720
 1     7841
dtype: int64

In [34]:
X

array([[3.90000e+01, 7.75160e+04, 1.30000e+01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [5.00000e+01, 8.33110e+04, 1.30000e+01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.80000e+01, 2.15646e+05, 9.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [5.80000e+01, 1.51910e+05, 9.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [2.20000e+01, 2.01490e+05, 9.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [5.20000e+01, 2.87927e+05, 9.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00]])

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)

In [37]:
from libs import transform

In [38]:
import importlib

In [39]:
importlib.reload(transform)

<module 'libs.transform' from 'C:\\Users\\Lucas\\Documents\\python-machine-learning\\Kaggle\\Machine Learning on Adult Dataset\\libs\\transform.py'>

In [40]:
std = transform.Standardize()

In [41]:
std.fit(X_train)

<libs.transform.Standardize at 0x18c05536748>

In [42]:
X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

In [43]:
norm = transform.Normalize()

In [44]:
norm.fit(X_train)

<libs.transform.Normalize at 0x18c0554ecc0>

In [45]:
X_train_norm = norm.transform(X_train)
X_test_norm = norm.transform(X_test)

# SGDClassifier

In [46]:
from sklearn.linear_model import SGDClassifier

In [47]:
sgd = SGDClassifier()

In [48]:
sgd.fit(X_train, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [49]:
y_pred = sgd.predict(X_test)

In [50]:
from libs import metrics

In [51]:
importlib.reload(metrics)

<module 'libs.metrics' from 'C:\\Users\\Lucas\\Documents\\python-machine-learning\\Kaggle\\Machine Learning on Adult Dataset\\libs\\metrics.py'>

In [52]:
accuracy = metrics.accuracy(y_pred, y_test)

In [53]:
accuracy

0.7844201044119152

## Standardization

In [54]:
X_train_std

array([[-0.08316629,  5.10870615, -0.08422137, ..., -0.08484483,
        -0.08484483, -0.08484483],
       [-0.08335812, 11.64887896, -0.08436525, ..., -0.08484483,
        -0.08484483, -0.08484483],
       [-0.08326221, 29.15961473, -0.08426933, ..., -0.08484483,
        -0.08484483, -0.08484483],
       ...,
       [-0.08331016, 10.62266671, -0.08417341, ..., -0.08484483,
        -0.08484483, -0.08484483],
       [-0.08383771,  9.78301234, -0.08441321, ..., -0.08484483,
        -0.08484483, -0.08484483],
       [-0.08134387,  9.09409099, -0.08412546, ..., -0.08484483,
        -0.08484483, -0.08484483]])

In [55]:
X_test_std

array([[-0.08340608,  0.86698406, -0.08436525, ..., -0.08484483,
        -0.08484483, -0.08484483],
       [-0.08393362,  7.04387256, -0.08441321, ..., -0.08484483,
        -0.08484483, -0.08484483],
       [-0.08244691, 15.34670568, -0.08441321, ..., -0.08484483,
        -0.08484483, -0.08484483],
       ...,
       [-0.08302241, 10.09407004, -0.08436525, ..., -0.08484483,
        -0.08484483, -0.08484483],
       [-0.08201529,  1.99079151, -0.08436525, ..., -0.08484483,
        -0.08484483, -0.08484483],
       [-0.08254283,  8.12058493, -0.08426933, ..., -0.08484483,
        -0.08484483, -0.08484483]])

In [56]:
sgd.fit(X_train_std, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [57]:
y_pred = sgd.predict(X_test_std)

In [58]:
accuracy = metrics.accuracy(y_pred, y_test)

In [59]:
accuracy

0.7942471081994062

## Normalization

In [60]:
X_train_norm

array([[2.35737066e-05, 7.29390687e-02, 8.75594815e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.08795687e-05, 1.64790312e-01, 6.73534473e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.22266376e-05, 4.10713913e-01, 8.08241368e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.15531031e-05, 1.50378021e-01, 9.42948262e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.41442239e-05, 1.38585780e-01, 6.06181026e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.91680165e-05, 1.28910457e-01, 1.01030171e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [61]:
X_test_norm

array([[2.02060342e-05, 1.33676387e-02, 6.73534473e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.27971550e-05, 1.00116858e-01, 6.06181026e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.36767237e-05, 2.16723187e-01, 6.06181026e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.55943100e-05, 1.42954324e-01, 6.73534473e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.97385339e-05, 2.91505720e-02, 6.73534473e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.23296547e-05, 1.15238381e-01, 8.08241368e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [62]:
sgd.fit(X_train_norm, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [63]:
y_pred = sgd.predict(X_test_norm)

In [64]:
accuracy= metrics.accuracy(y_pred, y_test)

In [65]:
accuracy

0.7592384072064694