In [24]:
import os, mglearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

adult_path=os.path.join(mglearn.datasets.DATA_PATH, "adult.data")
data=pd.read_csv(
    adult_path,header=None,index_col=False,
    names=['age','workclass','fnlwgt','education','education-num','marital-status', 'occupation','relationship',
          'race','gender','capital-gain','capital-loss','hours-per-week','native-country','income'])
data=data[['age','workclass','education','gender','hours-per-week','occupation','income']]
display(data.head())

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K


In [5]:
print(data.gender.value_counts())

 Male      21790
 Female    10771
Name: gender, dtype: int64


In [8]:
print(data.occupation.value_counts())

 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: occupation, dtype: int64


In [10]:
print(data)

       age          workclass      education   gender  hours-per-week  \
0       39          State-gov      Bachelors     Male              40   
1       50   Self-emp-not-inc      Bachelors     Male              13   
2       38            Private        HS-grad     Male              40   
3       53            Private           11th     Male              40   
4       28            Private      Bachelors   Female              40   
5       37            Private        Masters   Female              40   
6       49            Private            9th   Female              16   
7       52   Self-emp-not-inc        HS-grad     Male              45   
8       31            Private        Masters   Female              50   
9       42            Private      Bachelors     Male              40   
10      37            Private   Some-college     Male              80   
11      30          State-gov      Bachelors     Male              40   
12      23            Private      Bachelors   Fema

In [11]:
#get_dummies function automatically converts categorical data (or string type) into one hot encoding format

In [13]:
print('orig feats: \n {}'.format(list(data.columns)))
data_dummies=pd.get_dummies(data)
print('feats after get dummies: \n {}'.format(list(data_dummies.columns)))

orig feats: 
 ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income']
feats after get dummies: 
 ['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'gender_ Female', 'gender_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'o

In [14]:
#note that continuous features such as 'age' are unchanged

In [20]:
data_dummies.head()

Unnamed: 0,age,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,income_ <=50K,income_ >50K
0,39,40,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,50,13,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,38,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,53,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,28,40,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [23]:
features=data_dummies.loc[:,'age':'occupation_ Transport-moving']
X=features.values
y=data_dummies['income_ >50K'].values
print('X shape: {} , y shape: {}'.format(X.shape,y.shape))

X shape: (32561, 44) , y shape: (32561,)


In [25]:
X_train,X_test,y_train,y_test=train_test_split(X,y, random_state=0)
lr=LogisticRegression()
lr.fit(X_train,y_train)
print('train score: {}'.format(lr.score(X_train,y_train)))
print('test score: {}'.format(lr.score(X_test,y_test)))

train score: 0.8138001638001638
test score: 0.8087458543176514
