In [1]:
import numpy as np
import pandas as pd

In [2]:
adult = pd.read_csv('adult.csv')
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


data cleaning

In [3]:
adult.income.unique()

array(['<=50K', '>50K'], dtype=object)

In [4]:
adult = adult.assign(Income = (adult['income'] == '>50K').astype(int))
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,Income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K,0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K,0
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K,0


In [5]:
missing = [col for col in adult.columns if adult[col].any() == '?']
missing

['workclass', 'occupation']

In [6]:
adult = adult.replace('?', np.nan)
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,Income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K,0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K,0
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K,0


preprocessing

In [7]:
y = adult['Income']

In [8]:
feature_cols = adult.columns.drop(['income', 'Income'])
X = adult[feature_cols]

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [11]:
cat_cols = [col for col in X_train.columns if X_train[col].nunique()<10 and X_train[col].dtype == 'object']
cat_cols

['workclass', 'marital.status', 'relationship', 'race', 'sex']

In [12]:
num_cols = [col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64']]
num_cols

['age',
 'fnlwgt',
 'education.num',
 'capital.gain',
 'capital.loss',
 'hours.per.week']

In [13]:
my_cols = num_cols + cat_cols

In [14]:
X_train_new = X_train[my_cols].copy()
X_train_new.head()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,workclass,marital.status,relationship,race,sex
15282,41,208330,10,0,0,51,Private,Married-civ-spouse,Husband,White,Male
24870,25,191921,13,0,0,25,Local-gov,Never-married,Own-child,White,Male
18822,25,180212,9,0,0,40,Private,Never-married,Unmarried,Black,Female
26404,53,123092,9,0,0,40,Private,Widowed,Not-in-family,White,Female
7842,24,122272,13,0,0,40,Private,Never-married,Own-child,White,Female


In [15]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [16]:
num_transform = SimpleImputer(strategy='constant')

In [17]:
cat_transform = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('one_hot', OneHotEncoder(handle_unknown = 'ignore'))
])

In [18]:
preprocess = ColumnTransformer(transformers=[
    ('categorical', cat_transform, cat_cols),
    ('numerical', num_transform, num_cols)
])

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [22]:
model = RandomForestRegressor(n_estimators=100)

In [23]:
my_pip = Pipeline(steps=[('preprocessor', preprocess),('model', model)], verbose = True)

model fitting

In [24]:
my_pip.fit(X_train,y_train)

[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.5s
[Pipeline] ............. (step 2 of 2) Processing model, total=   6.6s


Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('categorical',
                                                  Pipeline(memory=None,
                                                           steps=[('impute',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='most_frequent',
                                               

evaluation

In [25]:
preds = my_pip.predict(X_test)
score = mean_absolute_error(y_test, preds)
print('MAE:', score)

MAE: 0.1907358762328822
