# Mode_Imputation_Test_Data

Replace all missing values with the most frequent value (the mode) of the column.

In [1]:
import pandas as pd, numpy as np

In [2]:
# Column labels passed to read_csv below, in file order; the last entry
# ('50k') is the column later encoded as the target.
col_names = (
    'age workclass fnlwgt education education_num marital_status '
    'occupation relationship race sex capital_gain capital_loss '
    'hours_per_week native_country 50k'
).split()

In [3]:
# Load the test split, labelling the columns with col_names.
# NOTE(review): values keep their leading space (e.g. ' ?', ' Private');
# later cells rely on that exact spelling.
df_test_raw = pd.read_csv('census-income.test.csv', names = col_names)

In [13]:
# Turn the ' ?' placeholder (note the leading space) into a real NaN.
df_test_raw = df_test_raw.replace(' ?', np.nan)

In [14]:
# Preview the first rows after the ' ?' placeholders were replaced with NaN.
df_test_raw.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,50k
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K.


# Encode Target Column 

In [15]:
# Binary target: 1 where the income label is '>50K.', otherwise 0.
# Only the label column is encoded; one-hot encoding the whole frame just to
# pick its last column depends on column ordering and on the dummy dtype of
# the installed pandas version. The Series name matches the dummy column
# name produced by get_dummies.
test_target = (
    df_test_raw['50k']
    .str.strip()
    .eq('>50K.')
    .astype('uint8')
    .rename('50k_ >50K.')
)

In [16]:
# Preview the encoded target values.
test_target.head()

0    0
1    0
2    1
3    1
4    0
Name: 50k_ >50K., dtype: uint8

# Unbalanced Data

In [17]:
# Count of rows per target class.
test_target.value_counts()

0    12435
1     3846
Name: 50k_ >50K., dtype: int64

# Continuous Columns 

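age, education_num, fnlwgt, capital_gain, capital_loss, hours_per_week

Note: `education_num` is listed here but is not selected into `df_continuous` below, so it does not appear in the final dataset.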

In [18]:
# Feature frame: every column except the last one (the target label).
test_features_raw = df_test_raw.iloc[:, :df_test_raw.shape[1] - 1]

In [19]:
# Number of feature columns once the target column is excluded.
len(test_features_raw.columns)

14

In [20]:
# Numeric columns carried over unchanged into the final frame.
continuous_cols = ['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week']
df_continuous = pd.concat(
    [test_features_raw[col] for col in continuous_cols],
    axis=1,
)

# Categorical Columns: One-Hot Encoded as 0's and 1's (No NA's)

In [21]:
# Feature frame: every column except the last one (the target label).
# Take an explicit copy because the cells below fill missing values in this
# frame; writing into a bare iloc slice of df_test_raw is ambiguous
# (view vs. copy) and triggers pandas' chained-assignment warnings.
test_features_raw = df_test_raw.iloc[:, :-1].copy()

In [22]:
# Preview the feature frame before any imputation or encoding.
test_features_raw.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States


### native_country 

In [105]:
# Frequency of each native_country value; value_counts() leaves NaN out by
# default, so missing rows are not listed.
test_features_raw.native_country.value_counts()

 United-States                 14936
 Mexico                          308
 Philippines                      97
 Puerto-Rico                      70
 Germany                          69
 Canada                           61
 India                            51
 El-Salvador                      49
 China                            47
 Cuba                             43
 England                          37
 South                            35
 Dominican-Republic               33
 Italy                            32
 Haiti                            31
 Portugal                         30
 Japan                            30
 Poland                           27
 Columbia                         26
 Jamaica                          25
 Guatemala                        24
 Greece                           20
 Vietnam                          19
 Ecuador                          17
 Iran                             16
 Nicaragua                        15
 Peru                             15
 

In [106]:
# Mode imputation for native_country: fill NaN with the column's most
# frequent value, computed from the data instead of a hard-coded label.
# The result is assigned back to the frame because an inplace replace on an
# attribute-accessed column is not guaranteed to write through to the frame.
native_country_mode = test_features_raw['native_country'].mode().iloc[0]
test_features_raw = test_features_raw.assign(
    native_country=test_features_raw['native_country'].fillna(native_country_mode)
)

In [107]:
# One indicator column per native_country value present in the data.
country_values = test_features_raw['native_country']
native_country = pd.get_dummies(country_values)
native_country.head()

Unnamed: 0,Cambodia,Canada,China,Columbia,Cuba,Dominican-Republic,Ecuador,El-Salvador,England,France,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [108]:
# All-zero integer column with one entry per row of the feature frame.
n_rows = len(test_features_raw)
missing_country = pd.Series(np.zeros(shape=n_rows, dtype=int))

In [109]:
# get_dummies only creates columns for values present in the data. Add an
# all-zero ' Holand-Netherlands' indicator when it is absent. The guard
# keeps this cell re-runnable (inserting an existing column raises
# ValueError), and the position is derived from the sorted dummy columns
# rather than a hard-coded index.
holand_col = ' Holand-Netherlands'
if holand_col not in native_country.columns:
    holand_pos = int(native_country.columns.searchsorted(holand_col))
    native_country.insert(loc=holand_pos, column=holand_col, value=missing_country)

In [110]:
# Number of native_country indicator columns after the insert above.
len(native_country.columns)

41

### workclass 

In [195]:
# Frequency of each workclass value; value_counts() leaves NaN out by default.
# NOTE(review): the raw preview above shows ' Private' rows, so ' Private'
# should appear in this output on a fresh run - confirm.
test_features_raw.workclass.value_counts()

 Self-emp-not-inc    13494
 Local-gov            1043
 State-gov             683
 Self-emp-inc          579
 Federal-gov           472
 Without-pay             7
 Never-worked            3
Name: workclass, dtype: int64

In [196]:
# Mode imputation for workclass: fill NaN with the column's most frequent
# value, computed from the data rather than hard-coded so the fill value
# cannot drift from the actual mode. The result is assigned back to the
# frame because an inplace replace on an attribute-accessed column is not
# guaranteed to write through to the frame.
workclass_mode = test_features_raw['workclass'].mode().iloc[0]
test_features_raw = test_features_raw.assign(
    workclass=test_features_raw['workclass'].fillna(workclass_mode)
)

In [197]:
# All-zero integer column with one entry per row of the feature frame.
row_count = len(test_features_raw)
missing_workclass = pd.Series(np.zeros(shape=row_count, dtype=int))

In [198]:
# One indicator column per workclass value present in the data.
workclass_values = test_features_raw['workclass']
workclass = pd.get_dummies(workclass_values)

In [199]:
# get_dummies only creates columns for values present in the data. Add an
# all-zero ' Private' indicator only when that column is absent: an
# unconditional insert raises ValueError whenever ' Private' already exists
# (including on a re-run of this cell). The position is derived from the
# sorted dummy columns rather than a hard-coded index.
private_col = ' Private'
if private_col not in workclass.columns:
    private_pos = int(workclass.columns.searchsorted(private_col))
    workclass.insert(loc=private_pos, column=private_col, value=missing_workclass)

In [200]:
# Preview the workclass indicator columns.
workclass.head()

Unnamed: 0,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay
0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0
4,0,0,0,0,0,1,0,0


In [201]:
# Number of workclass indicator columns.
len(workclass.columns)

8

### occupation

In [202]:
# Frequency of each occupation value; value_counts() leaves NaN out by default.
test_features_raw.occupation.value_counts()

 Prof-specialty       2998
 Exec-managerial      2020
 Craft-repair         2013
 Sales                1854
 Adm-clerical         1841
 Other-service        1628
 Machine-op-inspct    1020
 Transport-moving      758
 Handlers-cleaners     702
 Tech-support          518
 Farming-fishing       496
 Protective-serv       334
 Priv-house-serv        93
 Armed-Forces            6
Name: occupation, dtype: int64

In [203]:
# Mode imputation for occupation: fill NaN with the column's most frequent
# value, computed from the data instead of a hard-coded label. The result is
# assigned back to the frame because an inplace replace on an
# attribute-accessed column is not guaranteed to write through to the frame.
occupation_mode = test_features_raw['occupation'].mode().iloc[0]
test_features_raw = test_features_raw.assign(
    occupation=test_features_raw['occupation'].fillna(occupation_mode)
)

In [204]:
# One indicator column per occupation value; NaN was filled in the previous
# cell, so no row is left without an indicator.
occupation_values = test_features_raw['occupation']
occupation = pd.get_dummies(occupation_values)

occupation.head()

Unnamed: 0,Adm-clerical,Armed-Forces,Craft-repair,Exec-managerial,Farming-fishing,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [205]:
# Number of occupation indicator columns.
len(occupation.columns)

14

### marital_status

In [206]:
# One indicator column per marital_status value.
marital_values = test_features_raw['marital_status']
marital_status = pd.get_dummies(marital_values)
marital_status.head()

# Only this last expression is displayed: the number of indicator columns.
marital_status.shape[1]

7

### relationship 

In [207]:
# One indicator column per relationship value.
relationship_values = test_features_raw['relationship']
relationship = pd.get_dummies(relationship_values)
relationship.head()

# Only this last expression is displayed: the number of indicator columns.
relationship.shape[1]

6

### race

In [208]:
# One indicator column per race value. The column is selected by name, like
# the other categorical cells, instead of by the positional index 8, which
# would silently pick a different column if the column order ever changed.
race = pd.get_dummies(test_features_raw['race'])
race.head()

# Only this last expression is displayed: the number of indicator columns.
len(race.columns)

5

### sex

In [209]:
# One indicator column per sex value. The column is selected by name, like
# the other categorical cells, instead of by the positional index 9, which
# would silently pick a different column if the column order ever changed.
sex = pd.get_dummies(test_features_raw['sex'])
sex.head()  # two complementary indicator columns: Female and Male

Unnamed: 0,Female,Male
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0


In [210]:
# Re-check of the native_country indicator column count (this cell inspects
# native_country, not sex, despite sitting in the sex section).
len(native_country.columns)

41

# Put dataset together

In [211]:
# Index-aligned join: continuous columns + sex indicators.
df1 = pd.merge(df_continuous, sex, left_index=True, right_index=True)

In [212]:
# Index-aligned join: add the race indicators.
df2 = pd.merge(df1, race, left_index=True, right_index=True)

In [213]:
# Index-aligned join: add the relationship indicators.
df3 = pd.merge(df2, relationship, left_index=True, right_index=True)

In [214]:
# Index-aligned join: add the marital_status indicators.
df4 = pd.merge(df3, marital_status, left_index=True, right_index=True)

In [215]:
# Index-aligned join: add the native_country indicators.
df5 = pd.merge(df4, native_country, left_index=True, right_index=True)

In [216]:
# Index-aligned join: add the workclass indicators.
df6 = pd.merge(df5, workclass, left_index=True, right_index=True)

In [217]:
# Index-aligned join: add the occupation indicators, completing the feature frame.
df_mode = pd.merge(df6, occupation, left_index=True, right_index=True)

In [218]:
# Number of columns in the assembled feature frame (target not yet added).
len(df_mode.columns)

88

# Insert Target Variable

In [219]:
# Append the target as the last column. The position is taken from the
# current column count instead of a hard-coded 88, and the guard keeps the
# cell re-runnable (inserting an existing column raises ValueError).
if '>50k' not in df_mode.columns:
    df_mode.insert(loc=len(df_mode.columns), column='>50k', value=test_target)

In [220]:
# Number of columns after the target column was added.
len(df_mode.columns)

89

In [221]:
# Drop the leading/trailing whitespace that the raw category labels carry
# into the dummy column names.
df_mode.columns = list(map(str.strip, df_mode.columns))

In [228]:
# Preview the final frame: continuous columns, indicator columns, target.
df_mode.head()

Unnamed: 0,age,fnlwgt,capital_gain,capital_loss,hours_per_week,Female,Male,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,...,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving,>50k
0,25,226802,0,0,40,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,0
1,38,89814,0,0,50,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28,336951,0,0,40,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,44,160323,7688,0,40,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,1
4,18,103497,0,0,30,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [229]:
# Write the final frame to disk. With to_csv's defaults the row index is
# written as an extra, unnamed first column.
# NOTE(review): confirm whoever reads this file expects that index column.
df_mode.to_csv('test_mode_imp.csv')

In [222]:
# Actual column names of the final frame, as a plain list.
b = df_mode.columns.tolist()

In [223]:
# Expected column names, in order, for the final frame; compared against
# df_mode's actual columns (`b`) in the cells below.
# NOTE(review): presumably the column order of the matching training file -
# confirm the source of this list.
a = ['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week',
       'Female', 'Male', 'Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black',
       'Other', 'White', 'Husband', 'Not-in-family', 'Other-relative',
       'Own-child', 'Unmarried', 'Wife', 'Divorced', 'Married-AF-spouse',
       'Married-civ-spouse', 'Married-spouse-absent', 'Never-married',
       'Separated', 'Widowed', 'Cambodia', 'Canada', 'China', 'Columbia',
       'Cuba', 'Dominican-Republic', 'Ecuador', 'El-Salvador', 'England',
       'France', 'Germany', 'Greece', 'Guatemala', 'Haiti',
       'Holand-Netherlands', 'Honduras', 'Hong', 'Hungary', 'India', 'Iran',
       'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico', 'Nicaragua',
       'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines', 'Poland',
       'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan', 'Thailand',
       'Trinadad&Tobago', 'United-States', 'Vietnam', 'Yugoslavia',
       'Federal-gov', 'Local-gov', 'Never-worked', 'Private', 'Self-emp-inc',
       'Self-emp-not-inc', 'State-gov', 'Without-pay', 'Adm-clerical',
       'Armed-Forces', 'Craft-repair', 'Exec-managerial', 'Farming-fishing',
       'Handlers-cleaners', 'Machine-op-inspct', 'Other-service',
       'Priv-house-serv', 'Prof-specialty', 'Protective-serv', 'Sales',
       'Tech-support', 'Transport-moving', '>50k']

In [224]:
# Print every position where the expected names (`a`) and the actual names
# (`b`) disagree. zip() stops at the shorter list, so a length difference is
# reported separately instead of raising IndexError (a longer than b) or
# going unnoticed (b longer than a).
for i, (expected_name, actual_name) in enumerate(zip(a, b)):
    if expected_name != actual_name:
        print(i)
if len(a) != len(b):
    print('length mismatch: len(a) = {}, len(b) = {}'.format(len(a), len(b)))

In [227]:
# True only when both lists hold the same names in the same order.
a == b

True