# Feature Engineering:

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read the prepared train and test tables from the data/ directory.
onehot_train, onehot_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/onehot_train', 'data/onehot_test')
)

## One-Hot Encoding

In [3]:
# Inspect column dtypes: the object-typed columns (e.g. health_insurance, race)
# are the ones pd.get_dummies expands into indicator columns in the next step.
onehot_train.dtypes

respondent_id                    int64
h1n1_concern                   float64
h1n1_knowledge                 float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_h1n1               float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance                object
opinion_h1n1_vacc_effective    float64
opinion_h1n1_risk              float64
opinion_h1n1_sick_from_vacc    float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                        int64
education                      float64
race                            object
sex                      

### Create dummy (indicator) features for every nominal data column:

In [4]:
# One-hot encode every object-typed (nominal) column; numeric columns pass
# through unchanged.
encoded_train = pd.get_dummies(onehot_train)
encoded_test = pd.get_dummies(onehot_test)

# get_dummies only creates indicator columns for the categories present in the
# frame it is given, so encoding train and test independently can yield
# different column sets/orders. Align the test frame to the training layout so
# a model fitted on the training features can score the test frame directly.
#
# Columns that exist only in the raw training frame (presumably the vaccine
# labels -- confirm against the test CSV) are excluded so they are not
# fabricated in the test set.
train_only_cols = onehot_train.columns.difference(onehot_test.columns)
shared_feature_cols = encoded_train.columns.drop(train_only_cols, errors='ignore')

# Categories unseen in test become all-zero indicator columns; categories seen
# only in test (which no model trained on encoded_train could use) are dropped.
encoded_test = encoded_test.reindex(columns=shared_feature_cols, fill_value=0)

In [5]:
# List the post-encoding columns to confirm the object columns were replaced by
# per-category indicator columns (e.g. health_insurance_*, race_*).
encoded_train.columns

Index(['respondent_id', 'h1n1_concern', 'h1n1_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'sex', 'income_poverty', 'marital_status', 'rent_or_own',
       'employment_status', 'census_msa', 'household_adults',
       'household_children', 'h1n1_vaccine', 'seasonal_vaccine',
       'health_insurance_0.0', 'health_insurance_1.0',
       'health_insurance_unknown', 'race_Black', 'race_Hispanic',
       'race_Other or Multiple', 'race_White', 'hhs_geo_region_atmpeygn',
       'hhs_

In [6]:
# Write both encoded tables to data/ as CSV, omitting the row index.
encoded_train.to_csv(path_or_buf='data/ML_ready_train', index=False)
encoded_test.to_csv(path_or_buf='data/ML_ready_test', index=False)

In [7]:
# Targets are kept as single-column DataFrames: y1 = seasonal flu vaccine,
# y2 = H1N1 vaccine.
y1 = encoded_train.loc[:, ['seasonal_vaccine']]
y2 = encoded_train.loc[:, ['h1n1_vaccine']]

# Feature matrix: everything except the row identifier and the two targets.
X = encoded_train.drop(
    ['respondent_id', 'seasonal_vaccine', 'h1n1_vaccine'],
    axis=1,
)

## Train/Test Split for Seasonal Vaccine:

In [8]:
# Hold out 20% of the rows for evaluating the seasonal-vaccine model; the fixed
# seed keeps the partition reproducible across runs.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X,
    y1,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Sanity-check the seasonal split sizes (features/targets, train/test).
tuple(part.shape for part in (X1_train, X1_test, y1_train, y1_test))

((21365, 93), (5342, 93), (21365, 1), (5342, 1))

## Train/Test Split for H1N1 Vaccine:

In [10]:
# Hold out 20% of the rows for evaluating the H1N1-vaccine model, using the
# same test fraction and seed as the seasonal split.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X,
    y2,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Sanity-check the H1N1 split sizes (features/targets, train/test).
tuple(part.shape for part in (X2_train, X2_test, y2_train, y2_test))

((21365, 93), (5342, 93), (21365, 1), (5342, 1))

## Saving ML-ready data:

In [12]:
# Persist every split as CSV (no index column): the four seasonal-vaccine files
# first, then the four H1N1 files.
split_outputs = (
    (X1_train, 'data/X_seasonal_train'),
    (X1_test, 'data/X_seasonal_test'),
    (y1_train, 'data/y_seasonal_train'),
    (y1_test, 'data/y_seasonal_test'),
    (X2_train, 'data/X_H1N1_train'),
    (X2_test, 'data/X_H1N1_test'),
    (y2_train, 'data/y_H1N1_train'),
    (y2_test, 'data/y_H1N1_test'),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, index=False)