# Cleaning Test Data

In [1]:
import pandas as pd, numpy as np

In [2]:
# Column names applied to the census-income file, in file order.
# The last entry, '50k', is the income label column.
col_names = [
    'age', 'workclass', 'fnlwgt',
    'education', 'education_num',
    'marital_status', 'occupation', 'relationship',
    'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week',
    'native_country',
    '50k',
]

In [3]:
# Load the test file, supplying the column names ourselves. Missing values are
# written in the file as the literal token ' ?' (leading space included), so
# let the parser convert that token to NaN while reading; the frame then
# really does contain NaNs, as its name says.
df_test_raw_w_nans = pd.read_csv('census-income.test.csv', names=col_names, na_values=' ?')

In [4]:
# Missing values appear as the string ' ?' (note the leading space). Turn them
# into NaN so that dropna() can find them. Re-running this cell is harmless.
df_test_raw_w_nans= df_test_raw_w_nans.replace(' ?', np.nan)

In [5]:
# Drop every row that has a NaN in any column. The surviving rows keep their
# original row labels, so the index now has gaps (e.g. label 4 is gone).
df_test_raw = df_test_raw_w_nans.dropna()

In [6]:
df_test_raw.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,50k
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K.


# Encode Target Column 

In [7]:
# Binary income label: 1 where the '50k' column equals ' >50K.' (values keep
# the file's leading space), 0 otherwise. It is built directly from the label
# column instead of one-hot encoding the entire frame and relying on the
# ' >50K.' dummy happening to be the last column. The Series name and uint8
# dtype match what the dummy column displayed.
test_target = (df_test_raw['50k'] == ' >50K.').astype('uint8').rename('50k_ >50K.')

In [8]:
test_target.head()

0    0
1    0
2    1
3    1
5    0
Name: 50k_ >50K., dtype: uint8

# Unbalanced Data

In [9]:
test_target.value_counts()

0    11360
1     3700
Name: 50k_ >50K., dtype: int64

# Continuous Columns 

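age, education_num, fnlwgt, capital_gain, capital_loss, hours_per_week

Note: `education_num` is listed here as continuous, but it is not included in `df_continuous` below, so it does not appear in the final dataset.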

In [10]:
# Feature frame: every column except the last one ('50k', the income label).
test_features_raw = df_test_raw.iloc[:,:-1]

In [11]:
len(test_features_raw.columns)

14

In [12]:
# Numeric feature columns that are carried through unchanged.
# education_num is not part of this selection.
df_continuous = test_features_raw[
    ['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week']
]

# Categorical Columns: one-hot encoded as 0s and 1s (no NAs)

In [13]:
# Repeats the earlier test_features_raw assignment (all columns except the
# '50k' label); re-running it produces the same frame.
test_features_raw = df_test_raw.iloc[:,:-1]

In [14]:
test_features_raw.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States


# native_country

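The test set has one fewer `native_country` category than the training data: ` Holand-Netherlands` does not appear in the test rows, so a zero-filled column for it is added below.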

In [15]:
df_test_raw_w_nans.native_country.isnull().any()

True

In [16]:
# One-hot encode native_country: one indicator column per country that occurs
# in the cleaned test rows. No mid-cell .head() here: only the last expression
# of a cell is displayed, so an earlier one would just be discarded.
native_country = pd.get_dummies(test_features_raw.native_country)

len(native_country.columns)

40

In [17]:
# Zero-filled column for the country that never occurs in the test rows.
# It must carry native_country's own index: dropna() left gaps in the row
# labels, and DataFrame.insert() aligns a Series by label, so a default
# 0..n-1 index would produce NaN for every row whose label is not in 0..n-1.
missing_country = pd.Series(0, index=native_country.index, dtype=int)

In [18]:
# Add the ' Holand-Netherlands' indicator at column position 15, i.e. directly
# after ' Honduras'. insert() modifies native_country in place.
# NOTE(review): the other get_dummies outputs in this notebook are in sorted
# order, which would place ' Holand-Netherlands' before ' Honduras' (position
# 14) - confirm this position against the training frame's column order.
native_country.insert(loc = 15, column = ' Holand-Netherlands', value=missing_country)

In [84]:
# Overwrite the whole column at position 15 (' Holand-Netherlands') with the
# integer 0, so it is all zeros no matter how insert() aligned the Series.
native_country.iloc[:,15] = int(0)

In [96]:
native_country.isnull().any()

 Cambodia                      False
 Canada                        False
 China                         False
 Columbia                      False
 Cuba                          False
 Dominican-Republic            False
 Ecuador                       False
 El-Salvador                   False
 England                       False
 France                        False
 Germany                       False
 Greece                        False
 Guatemala                     False
 Haiti                         False
 Honduras                      False
 Holand-Netherlands            False
 Hong                          False
 Hungary                       False
 India                         False
 Iran                          False
 Ireland                       False
 Italy                         False
 Jamaica                       False
 Japan                         False
 Laos                          False
 Mexico                        False
 Nicaragua                     False
 

In [19]:
len(native_country.columns)

41

### workclass (the test rows are missing one category: ' Never-worked')

In [20]:
# Indicator columns for each workclass value found in the cleaned test rows,
# followed by how many such columns there are.
workclass = pd.get_dummies(test_features_raw['workclass'])

workclass.shape[1]

7

In [22]:
# Zero-filled column for the workclass value that never occurs in the test
# rows. It is given workclass's own index because DataFrame.insert() aligns a
# Series by row label; with a default 0..n-1 index, the gapped labels left by
# dropna() would leave NaN for every row whose label is not in 0..n-1.
missing_workclass = pd.Series(0, index=workclass.index, dtype=int)

In [23]:
# Add the ' Never-worked' indicator at column position 2, between ' Local-gov'
# and ' Private'. insert() modifies workclass in place.
workclass.insert(loc = 2, column = ' Never-worked', value=missing_workclass)

In [71]:
# Overwrite the whole column at position 2 (' Never-worked') with the integer
# 0, so it is all zeros no matter how insert() aligned the Series.
workclass.iloc[:,2] = int(0)

In [72]:
workclass.head()

Unnamed: 0,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay
0,0,0,0,1,0,0,0,0
1,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0
5,0,0,0,1,0,0,0,0


### occupation

In [25]:
# One-hot encode occupation. There is no ' ?' column because rows with missing
# values were dropped before encoding. No mid-cell .head(): only the last
# expression of a cell is displayed.
occupation = pd.get_dummies(test_features_raw.occupation)

len(occupation.columns)

14

In [26]:
list(occupation.columns)

[' Adm-clerical',
 ' Armed-Forces',
 ' Craft-repair',
 ' Exec-managerial',
 ' Farming-fishing',
 ' Handlers-cleaners',
 ' Machine-op-inspct',
 ' Other-service',
 ' Priv-house-serv',
 ' Prof-specialty',
 ' Protective-serv',
 ' Sales',
 ' Tech-support',
 ' Transport-moving']

### marital_status

In [27]:
# One-hot encode marital_status, then show how many indicator columns resulted.
# No mid-cell .head(): only the last expression of a cell is displayed.
marital_status = pd.get_dummies(test_features_raw.marital_status)

len(marital_status.columns)

7

### relationship 

In [28]:
# One-hot encode relationship, then show how many indicator columns resulted.
# No mid-cell .head(): only the last expression of a cell is displayed.
relationship = pd.get_dummies(test_features_raw.relationship)

len(relationship.columns)

6

### race

In [29]:
# One-hot encode race. The column is selected by name, like the other
# categorical features, rather than by position 8, so the cell keeps working
# if the column order ever changes. No mid-cell .head(): only the last
# expression of a cell is displayed.
race = pd.get_dummies(test_features_raw.race)

len(race.columns)

5

### sex

In [30]:
# One-hot encode sex into two indicator columns, Female and Male.
sex = pd.get_dummies(test_features_raw.sex)
sex.head() # the first rows shown are all male: Male = 1, Female = 0

Unnamed: 0,Female,Male
0,0,1
1,0,1
2,0,1
3,0,1
5,0,1


# Put dataset together

In [97]:
# Start assembling the final frame: attach the marital_status indicators to the
# continuous columns, matching rows by index label. Both frames derive from
# test_features_raw, so they share the same row labels.
df1 = df_continuous.merge(marital_status, left_index=True, right_index = True)

In [98]:
# Add the relationship indicator columns, matching rows by index label.
df2 = df1.merge(relationship, left_index=True, right_index = True)

In [99]:
# Add the race indicator columns, matching rows by index label.
df3 = df2.merge(race, left_index=True, right_index = True)

In [100]:
# Add the two sex indicator columns, matching rows by index label.
df4 = df3.merge(sex, left_index=True, right_index = True)

In [101]:
# Add the native_country indicators (including the inserted
# ' Holand-Netherlands' column), matching rows by index label.
df5 = df4.merge(native_country, left_index=True, right_index = True)

In [102]:
# Add the workclass indicators (including the inserted ' Never-worked' column),
# matching rows by index label.
df6 = df5.merge(workclass, left_index=True, right_index = True)

In [103]:
# Last feature merge: add the occupation indicators. The result holds every
# feature column of the cleaned test set; education is not included.
df_no_nans = df6.merge(occupation, left_index=True, right_index = True)

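### Should we include education as one-hot encoded?

It is not included for now. If it is added, it would be merged in the same way as the other encoded columns, e.g. `df8 = df7.merge(education, left_index=True, right_index = True)` (note that `education` and `df7` are not defined in this notebook).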

In [104]:
# Append the binary income label as the final column. The position is computed
# from the current width instead of being hard-coded (it was 88), so the label
# stays last even if the set of feature columns changes. insert() aligns
# test_target to the frame by row label and modifies df_no_nans in place.
df_no_nans.insert(loc=len(df_no_nans.columns), column = '>50k', value =test_target)

In [105]:
len(df_no_nans.columns)

89

In [106]:
df_no_nans.head()

Unnamed: 0,age,fnlwgt,capital_gain,capital_loss,hours_per_week,Divorced,Married-AF-spouse,Married-civ-spouse,Married-spouse-absent,Never-married,...,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving,>50k
0,25,226802,0,0,40,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
1,38,89814,0,0,50,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28,336951,0,0,40,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
3,44,160323,7688,0,40,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
5,34,198693,0,0,30,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


In [107]:
# Write the assembled test frame to disk. The row index is written as well
# (to_csv default), so the original, gapped row labels become the first column.
# NOTE(review): the file name says "imputation", but this notebook drops rows
# with missing values rather than imputing them - confirm the intended name.
df_no_nans.to_csv('test_simple_imputation.csv')