In [1]:
class color:
    """ANSI escape sequences used to colorize, bold, or underline printed text.

    Prefix a string with one of these constants and append ``END`` to reset
    the terminal back to its default styling.
    """

    # Foreground colors
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset all colors and attributes
    END = '\033[0m'

In [2]:
import pandas as pd
import numpy as np
# Column names for the data; the UCI dataset does not ship with a header row.
headers = ["age", "workclass", "fnlwgt", "education", "education-num",
           "marital-status", "occupation", "relationship", "race",
           "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country",
           "income"]
# Load the data from the GitHub repo, treating the literal string "null" as NaN.
url = "https://raw.githubusercontent.com/vbloise3/WhizLabsML/master/CensusIncome/CensusIncomeDataset.csv"
# on_bad_lines="warn" drops malformed rows and reports them. It replaces the
# deprecated error_bad_lines=False (FutureWarning since pandas 1.3, removed in
# pandas 2.0), which mapped to this same "warn" behavior.
df = pd.read_csv(url, on_bad_lines="warn", header=None, names=headers, na_values="null")
# Preview the first 10 rows of the frame.
df.head(10)



  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [5]:
# (number of rows, number of columns) of the loaded frame
df.shape

(32561, 15)

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


## Split dataset into features and target

In [3]:
# Features: every column except the label.
features_df = df.drop(columns='income')
# Target: select the label by name instead of dropping columns 0-13 by position,
# so the split stays correct if columns are ever added or reordered. The double
# brackets keep target_df a one-column DataFrame, as before.
target_df = df[['income']]

In [9]:
# Column dtypes: `object` columns are the categorical (string) features
features_df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object

## Extract categorical features

In [11]:
# Independent copy of the string-typed (object dtype) columns only, so later
# edits to it do not touch features_df
categorical_featuresDf = features_df.select_dtypes(include='object').copy()
categorical_featuresDf

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States


## Which features contain nulls?

In [13]:
# Boolean mask: True for rows where at least one categorical column is missing
rows_with_nan_mask = categorical_featuresDf.isna().any(axis=1)
categorical_features_NaN = categorical_featuresDf[rows_with_nan_mask]
categorical_features_NaN

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
14,Private,Assoc-voc,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,
27,,Some-college,Married-civ-spouse,,Husband,Asian-Pac-Islander,Male,South
38,Private,Some-college,Married-civ-spouse,Sales,Husband,White,Male,
51,Private,HS-grad,Never-married,Other-service,Own-child,White,Female,
61,,7th-8th,Married-spouse-absent,,Not-in-family,White,Male,
...,...,...,...,...,...,...,...,...
32530,,Bachelors,Married-civ-spouse,,Wife,White,Female,United-States
32531,,Bachelors,Never-married,,Not-in-family,Asian-Pac-Islander,Female,United-States
32539,,Doctorate,Married-civ-spouse,,Husband,White,Male,United-States
32541,,HS-grad,Separated,,Not-in-family,Black,Female,United-States


In [16]:
# Names of the categorical columns that contain at least one missing value
nan_column_mask = categorical_featuresDf.isnull().any(axis=0)
categorical_featuresDf.columns[nan_column_mask].tolist()

['workclass', 'occupation', 'native-country']

# Impute the NaN values using scikit-learn's SimpleImputer class

In [5]:
from sklearn.impute import SimpleImputer
# Replace each missing value with the most frequent value of its own column.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# The three categorical columns that contain NaN. SimpleImputer computes its
# statistic per column, so one fit_transform over all three is equivalent to
# fitting and transforming each column separately.
columns_with_nan = ['workclass', 'occupation', 'native-country']
features_df[columns_with_nan] = imputer.fit_transform(features_df[columns_with_nan])

# Recreate the dataframe of only the categorical features
categorical_featuresDf = features_df.select_dtypes(include=['object']).copy()

# Recheck to find any null value entries in the categorical features
categorical_features_NaN = categorical_featuresDf[categorical_featuresDf.isnull().any(axis=1)]
# Which features have a NaN value? (an empty list means imputation succeeded)
categorical_featuresDf.columns[categorical_featuresDf.isna().any()].tolist()

In [6]:
# Loop form of the per-column imputation: fit the imputer on one column at a
# time, then write the filled values back into features_df.
feature_headers = ['workclass','occupation','native-country']
for header in feature_headers:
    # Double brackets keep the 2-D shape the imputer expects; ravel() flattens
    # the (n, 1) result back to a 1-D column.
    features_df[header] = imputer.fit(features_df[[header]]).transform(features_df[[header]]).ravel()

# Rebuild the categorical-only view from the imputed features
categorical_featuresDf = features_df.select_dtypes(include=['object']).copy()

# Rows that still have a missing categorical value
rows_still_missing = categorical_featuresDf.isnull().any(axis=1)
categorical_features_NaN = categorical_featuresDf[rows_still_missing]
# Columns that still contain a NaN
categorical_featuresDf.columns[categorical_featuresDf.isna().any()].tolist()

[]

In [9]:
# Use binary encoding for the sex feature; first count how many rows carry each value
from sklearn.preprocessing import LabelBinarizer
df.sex.value_counts()

 Male      21790
 Female    10771
Name: sex, dtype: int64

In [11]:
# Binarize the sex column; label_results is indexed below as a 2-D array with
# the codes in its first column.
label_results = LabelBinarizer().fit_transform(categorical_featuresDf.sex)
# Assign the code array directly (positional) instead of wrapping it in a
# one-column DataFrame, which relied on index alignment to line the rows up.
categorical_featuresDf['sex_code'] = label_results[:, 0]
# Select columns with a list, not a set: a set has no defined order, and pandas
# deprecated set indexers in 1.5 and rejects them in 2.0.
categorical_featuresDf[['sex', 'sex_code']].head(15)

Unnamed: 0,sex_code,sex
0,1,Male
1,1,Male
2,1,Male
3,1,Male
4,0,Female
5,0,Female
6,0,Female
7,1,Male
8,0,Female
9,1,Male


In [13]:
from sklearn.preprocessing import LabelEncoder
# Label-encode workclass: each distinct string value is mapped to an integer code.
categorical_featuresDf['workclass_code'] = LabelEncoder().fit_transform(categorical_featuresDf.workclass)
# Select columns with a list, not a set: a set has no defined order, and pandas
# deprecated set indexers in 1.5 and rejects them in 2.0.
categorical_featuresDf[['workclass', 'workclass_code']].head(15)

Unnamed: 0,workclass_code,workclass
0,6,State-gov
1,5,Self-emp-not-inc
2,3,Private
3,3,Private
4,3,Private
5,3,Private
6,3,Private
7,5,Self-emp-not-inc
8,3,Private
9,3,Private


In [14]:
# Use one-hot encoding on workclass; first list its distinct values and their row counts
categorical_featuresDf.workclass.value_counts()

 Private             24532
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

In [15]:
# One-hot encode workclass: the column is replaced by one 0/1 indicator column
# per distinct value (workclass_<value>); the result is only previewed, not stored
pd.get_dummies(categorical_featuresDf, columns=['workclass']).head()

Unnamed: 0,education,marital-status,occupation,relationship,race,sex,native-country,sex_code,workclass_code,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay
0,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,1,6,0,0,0,0,0,0,1,0
1,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,1,5,0,0,0,0,0,1,0,0
2,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,1,3,0,0,0,1,0,0,0,0
3,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,1,3,0,0,0,1,0,0,0,0
4,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,0,3,0,0,0,1,0,0,0,0
