# Cleaning Test Data

In [1]:
import pandas as pd, numpy as np

In [2]:
col_names = ['age','workclass','fnlwgt','education', 'education_num','marital_status',
             'occupation','relationship','race','sex','capital_gain','capital_loss', 
             'hours_per_week','native_country','50k']

In [3]:
df_test_raw_w_nans = pd.read_csv('census-income.test.csv', names = col_names)

In [4]:
df_test_raw = df_test_raw_w_nans.replace(' ?', np.nan)

In [5]:
#df_test_raw = df_test_raw_w_nans.dropna()

In [6]:
df_test_raw.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,50k
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K.


# Encode Target Column 

In [7]:
test_target = pd.get_dummies(df_test_raw).iloc[:,-1]

In [8]:
test_target.head()

0    0
1    0
2    1
3    1
4    0
Name: 50k_ >50K., dtype: uint8

# Unbalanced Data

In [9]:
test_target.value_counts()

0    12435
1     3846
Name: 50k_ >50K., dtype: int64

# Continuous Columns 

age, education_num, fnlwgt, capital_gain, capital_loss, hours_per_week

In [10]:
test_features_raw = df_test_raw.iloc[:,:-1]

In [11]:
len(test_features_raw.columns)

14

In [12]:
df_continuous = pd.concat([test_features_raw.age,
           test_features_raw.fnlwgt,
           test_features_raw.capital_gain,
           test_features_raw.capital_loss,
           test_features_raw.hours_per_week], axis=1)

# Categorical Columns 0's and 1's (No NA's)

In [13]:
test_features_raw = df_test_raw.iloc[:,:-1]

In [14]:
test_features_raw.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States


# native_country

One less column than train data

In [15]:
test_features_raw.native_country.isnull().any()

True

# Impute for native country

In [16]:
test_features_raw.native_country = test_features_raw.native_country.replace(np.nan, ' United-States')

In [17]:
native_country = pd.get_dummies(test_features_raw.native_country)
native_country.head()

len(native_country.columns)

40

In [18]:
missing_country = pd.Series(np.zeros(len(native_country)).astype(int))

In [19]:
native_country.insert(loc = 15, column = ' Holand-Netherlands', value=missing_country)

In [20]:
native_country.iloc[:,15] = int(0)

In [21]:
native_country.isnull().any()

 Cambodia                      False
 Canada                        False
 China                         False
 Columbia                      False
 Cuba                          False
 Dominican-Republic            False
 Ecuador                       False
 El-Salvador                   False
 England                       False
 France                        False
 Germany                       False
 Greece                        False
 Guatemala                     False
 Haiti                         False
 Honduras                      False
 Holand-Netherlands            False
 Hong                          False
 Hungary                       False
 India                         False
 Iran                          False
 Ireland                       False
 Italy                         False
 Jamaica                       False
 Japan                         False
 Laos                          False
 Mexico                        False
 Nicaragua                     False
 

In [22]:
len(native_country.columns)

41

### workclass [missing a column: 'never-worked']

In [23]:
test_features_raw.workclass.isnull().any()

True

In [24]:
test_features_raw.workclass = test_features_raw.workclass.replace(np.nan, ' Private')

In [25]:
workclass = pd.get_dummies(test_features_raw.workclass) 

len(workclass.columns)

8

In [26]:
#missing_workclass = pd.Series(np.zeros(len(workclass)).astype(int))
#workclass.insert(loc = 2, column = ' Never-worked', value=missing_workclass)
#workclass.iloc[:,2] = int(0)

"""Imputing native_country nans returns ' Never-worked' to us"""

"Imputing native_country nans returns ' Never-worked' to us"

In [27]:
workclass.head()

Unnamed: 0,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay
0,0,0,0,1,0,0,0,0
1,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0
4,0,0,0,1,0,0,0,0


### occupation (Come back to this later)

In [39]:
occupation = pd.read_csv('occupation_imputed.csv', names = ['occupation'], index_col = 0)

In [41]:
occupation = pd.get_dummies(occupation)

In [44]:
occupation.columns = [' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
       ' Prof-specialty', ' Other-service', ' Sales', ' Craft-repair',
       ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
       ' Tech-support', ' Protective-serv', ' Armed-Forces',
       ' Priv-house-serv']

In [45]:
occupation.head()

Unnamed: 0,Adm-clerical,Exec-managerial,Handlers-cleaners,Prof-specialty,Other-service,Sales,Craft-repair,Transport-moving,Farming-fishing,Machine-op-inspct,Tech-support,Protective-serv,Armed-Forces,Priv-house-serv
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0


### marital

In [46]:
marital_status = pd.get_dummies(test_features_raw.marital_status)
marital_status.head()

len(marital_status.columns)

7

### relationship 

In [47]:
relationship = pd.get_dummies(test_features_raw.relationship)
relationship.head()

len(relationship.columns)

6

### race

In [48]:
race = pd.get_dummies(test_features_raw.iloc[:,8])
race.head()

len(race.columns)

5

### sex

In [49]:
sex = pd.get_dummies(test_features_raw.sex)
sex.head() # Male = 1, Female = 0 

Unnamed: 0,Female,Male
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0


# Put dataset together

In [64]:
df1 = df_continuous.merge(marital_status, left_index=True, right_index = True)

In [65]:
df2 = df1.merge(relationship, left_index=True, right_index = True)

In [66]:
df3 = df2.merge(race, left_index=True, right_index = True)

In [67]:
df4 = df3.merge(sex, left_index=True, right_index = True)

In [68]:
df5 = df4.merge(native_country, left_index=True, right_index = True)

In [69]:
df6 = df5.merge(workclass, left_index=True, right_index = True)

In [73]:
df_test_imputed = df6.merge(occupation, left_index=True, right_index = True)

In [74]:
len(df_test_imputed.columns)

88

### Should we include education as hot encoded?

df8 = df7.merge(education, left_index=True, right_index = True)

In [75]:
df_test_imputed.insert(loc=88, column = '>50k', value =test_target)
len(df_w_nans.columns)

89

In [77]:
df_test_imputed.to_csv('test_simple_imputation.csv')