# 1. Imputation Country

Generate data sets to predict for missing values in native_country

In [125]:
import pandas as pd, numpy as np

In [126]:
col_names = ['age','workclass','fnlwgt','education', 'education_num','marital_status',
             'occupation','relationship','race','sex','capital_gain','capital_loss', 
             'hours_per_week','native_country','50k']

In [127]:
df_train_raw = pd.read_csv('census-income.data.csv', names = col_names)

In [128]:
df_train_raw.replace(' ?', np.nan, inplace=True)

In [129]:
df_train_raw.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# Encode Target Column 

In [130]:
train_target = pd.get_dummies(df_train_raw).iloc[:,-1]

In [131]:
train_target.head()

0    0
1    0
2    0
3    0
4    0
Name: 50k_ >50K, dtype: uint8

# Unbalanced Data

In [132]:
train_target.value_counts()

0    24720
1     7841
Name: 50k_ >50K, dtype: int64

# Continuous Columns 

age, education_num, fnlwgt, capital_gain, capital_loss, hours_per_week

In [133]:
train_features_raw = df_train_raw.iloc[:,:-1]

In [134]:
len(train_features_raw.columns)

14

### Education

In [135]:
train_features_raw.native_country.isnull().values.any()

True

In [136]:
missing_country = list(train_features_raw.native_country[train_features_raw.native_country.isnull()].index)

In [53]:
native_country_int, native_country_str = pd.factorize(train_features_raw.native_country)
native_country_int = pd.Series(native_country_int)
native_country_or = train_features_raw.native_country

In [137]:
len(missing_country)

583

In [54]:
native_country_int = native_country_int.replace(-1, np.nan)

# Continuous

In [55]:
df_continuous = pd.concat([train_features_raw.age,
           train_features_raw.fnlwgt,
           train_features_raw.capital_gain,
           train_features_raw.capital_loss,
           train_features_raw.hours_per_week], axis=1)

# Categorical Columns 0's and 1's (No NA's)

In [56]:
train_features_raw = df_train_raw.iloc[:,:-1]

In [57]:
train_features_raw.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


### Workclass (contains missing values!)

In [58]:
workclass = pd.get_dummies(train_features_raw.workclass) # no ?
workclass.iloc[:,0].replace(1, np.nan, inplace=True)
workclass.head()

len(workclass.columns)

8

### marital

In [59]:
marital_status = pd.get_dummies(train_features_raw.marital_status)
marital_status.head()

len(marital_status.columns)

7

### Occupation (contains missing values!)

In [60]:
occupation = pd.get_dummies(train_features_raw.occupation) # no ?
occupation.iloc[:,0].replace(1, np.nan, inplace=True)
occupation.head()

len(occupation.columns)

14

### Relationship 

In [61]:
relationship = pd.get_dummies(train_features_raw.relationship)
relationship.head()

len(relationship.columns)

6

### Race

In [62]:
race = pd.get_dummies(train_features_raw.iloc[:,8])
race.head()

len(race.columns)

5

### Sex

In [63]:
sex = pd.get_dummies(train_features_raw.iloc[:,9])
sex.head() # Male = 1, Female = 0 

Unnamed: 0,Female,Male
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0


# Put dataset together

In [77]:
df1 = df_continuous.merge(workclass, left_index=True, right_index = True)

In [78]:
df2 = df1.merge(marital_status, left_index=True, right_index = True)

In [79]:
df3 = df2.merge(occupation, left_index=True, right_index = True)

In [80]:
df4 = df3.merge(relationship, left_index=True, right_index = True)

In [81]:
df5 = df4.merge(race, left_index=True, right_index = True)

In [82]:
df_w_nans = df5.merge(sex, left_index=True, right_index = True)

In [83]:
len(df_w_nans.columns)

47

### Should we include education as hot encoded?

In [84]:
df_w_nans.insert(loc=47, column = '>50k', value =train_target)

In [85]:
df_w_nans.insert(loc=48, column = 'native_country_int', value =native_country_int)

In [86]:
df_w_nans.insert(loc=49, column = 'native_country_or', value =native_country_or)

In [87]:
df_w_nans.head()

Unnamed: 0,age,fnlwgt,capital_gain,capital_loss,hours_per_week,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,...,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White,Female,Male,>50k,native_country_int,native_country_or
0,39,77516,2174,0,40,0.0,0,0,0,0,...,0,0,0,0,1,0,1,0,0.0,United-States
1,50,83311,0,0,13,0.0,0,0,0,0,...,0,0,0,0,1,0,1,0,0.0,United-States
2,38,215646,0,0,40,0.0,0,0,1,0,...,0,0,0,0,1,0,1,0,0.0,United-States
3,53,234721,0,0,40,0.0,0,0,1,0,...,0,0,1,0,0,0,1,0,0.0,United-States
4,28,338409,0,0,40,0.0,0,0,1,0,...,0,0,1,0,0,1,0,0,1.0,Cuba


# Drop Instances of NaN

Variables that contain Missing Values: Employer, Profession, Nationality

In [88]:
df_no_nans = df_w_nans.dropna()
df_no_nans.head()

Unnamed: 0,age,fnlwgt,capital_gain,capital_loss,hours_per_week,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,...,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White,Female,Male,>50k,native_country_int,native_country_or
1,50,83311,0,0,13,0.0,0,0,0,0,...,0,0,0,0,1,0,1,0,0.0,United-States
2,38,215646,0,0,40,0.0,0,0,1,0,...,0,0,0,0,1,0,1,0,0.0,United-States
3,53,234721,0,0,40,0.0,0,0,1,0,...,0,0,1,0,0,0,1,0,0.0,United-States
4,28,338409,0,0,40,0.0,0,0,1,0,...,0,0,1,0,0,1,0,0,1.0,Cuba
5,37,284582,0,0,40,0.0,0,0,1,0,...,0,0,0,0,1,1,0,0,0.0,United-States


In [92]:
len(df_no_nans.columns)

50

In [90]:
'{} instances are lost when we drop all NaNs'.format(len(df_w_nans) - len(df_no_nans))

'4931 instances are lost when we drop all NaNs'

In [91]:
df_no_nans = df_no_nans.rename(index=str, columns={0: "native_country"})
df_no_nans = df_no_nans.rename(columns=lambda x: x.strip())

In [104]:
df_no_nans.native_country_int = df_no_nans.native_country_int.astype(int)
df_no_nans['Federal-gov'] = df_no_nans['Federal-gov'].astype(int)
df_no_nans['Adm-clerical'] = df_no_nans['Adm-clerical'].astype(int)

In [107]:
df_no_nans.head()

Unnamed: 0,age,fnlwgt,capital_gain,capital_loss,hours_per_week,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,...,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White,Female,Male,>50k,native_country_int,native_country_or
1,50,83311,0,0,13,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,United-States
2,38,215646,0,0,40,0,0,0,1,0,...,0,0,0,0,1,0,1,0,0,United-States
3,53,234721,0,0,40,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,United-States
4,28,338409,0,0,40,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,Cuba
5,37,284582,0,0,40,0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,United-States


In [108]:
for i in df_no_nans.columns:
    print(i)

age
fnlwgt
capital_gain
capital_loss
hours_per_week
Federal-gov
Local-gov
Never-worked
Private
Self-emp-inc
Self-emp-not-inc
State-gov
Without-pay
Divorced
Married-AF-spouse
Married-civ-spouse
Married-spouse-absent
Never-married
Separated
Widowed
Adm-clerical
Armed-Forces
Craft-repair
Exec-managerial
Farming-fishing
Handlers-cleaners
Machine-op-inspct
Other-service
Priv-house-serv
Prof-specialty
Protective-serv
Sales
Tech-support
Transport-moving
Husband
Not-in-family
Other-relative
Own-child
Unmarried
Wife
Amer-Indian-Eskimo
Asian-Pac-Islander
Black
Other
White
Female
Male
>50k
native_country_int
native_country_or


In [109]:
len(df_no_nans.columns)

50

In [110]:
df_no_nans.to_csv('train_country_predict.csv')

In [120]:
df_nans = df_w_nans.iloc[missing_country]

In [122]:
df_nans.to_csv('test_country_predict.csv')