In [28]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Show up to 100 columns so wide frames are not truncated when displayed.
pd.options.display.max_columns = 100

# Import data

In [26]:
# Load every file keyed by respondent_id so features and labels align on the
# same index and the test frame has the same columns as the training frame.
# NOTE(review): assumes test_set_features.csv also contains a respondent_id
# column, like the two training files — confirm.
train_features = pd.read_csv('training_set_features.csv', index_col="respondent_id")
test_features = pd.read_csv('test_set_features.csv', index_col="respondent_id")
train_labels = pd.read_csv('training_set_labels.csv', index_col="respondent_id")

# Explore data

In [27]:
# Preview the first five rows of the training features.
train_features.head(n=5)

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,55 - 64 Years,< 12 Years,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,4.0,4.0,4.0,2.0,4.0,35 - 44 Years,12 Years,White,Male,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,,1.0,0.0,0.0,,3.0,1.0,1.0,4.0,1.0,2.0,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,,3.0,3.0,5.0,5.0,4.0,1.0,65+ Years,12 Years,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,3.0,3.0,2.0,3.0,1.0,4.0,45 - 54 Years,Some College,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [7]:
# respondent_id is the DataFrame index (set via index_col when the CSV is read),
# so it is no longer a column: train_features['respondent_id'] raises KeyError.
# Check uniqueness on the index instead.
train_features.index.is_unique

True

Concern is probably a highly important feature.

In [9]:
# Frequency of each h1n1_concern level (missing values are not counted).
train_features.loc[:, 'h1n1_concern'].value_counts()

2.0    10575
1.0     8153
3.0     4591
0.0     3296
Name: h1n1_concern, dtype: int64

In [10]:
# Frequency of each h1n1_knowledge level (missing values are not counted).
train_features.loc[:, 'h1n1_knowledge'].value_counts()

1.0    14598
2.0     9487
0.0     2506
Name: h1n1_knowledge, dtype: int64

Correlation between concern and knowledge.

In [12]:
# Pairwise Pearson correlation between the knowledge and concern scores.
(
    train_features
    .loc[:, ['h1n1_knowledge', 'h1n1_concern']]
    .corr(method='pearson')
)

Unnamed: 0,h1n1_knowledge,h1n1_concern
h1n1_knowledge,1.0,0.062522
h1n1_concern,0.062522,1.0


# Variables

Overview variables:

1) Categorical var (numbered):
- h1n1_concern
- h1n1_knowledge
- opinion_h1n1_vacc_effective
- opinion_h1n1_risk
- opinion_h1n1_sick_from_vacc
- opinion_seas_vacc_effective
- opinion_seas_risk
- opinion_seas_sick_from_vacc

2) Categorical var (text):
- age_group	55 - 64 Years
- education	< 12 Years
- race	White
- sex	Female
- income_poverty	Below Poverty
- marital_status	Not Married
- rent_or_own	Own
- employment_status	Not in Labor Force
- hhs_geo_region	oxchjgsf
- census_msa	Non-MSA
- household_adults	0 (numeric count, not text)
- household_children	0 (numeric count, not text)
- employment_industry	NaN
- employment_occupation	NaN

3) Binary:
- behavioral_antiviral_meds	0
- behavioral_avoidance	0
- behavioral_face_mask	0
- behavioral_wash_hands	0
- behavioral_large_gatherings	0
- behavioral_outside_home	1
- behavioral_touch_face	1
- doctor_recc_h1n1	0
- doctor_recc_seasonal	0
- chronic_med_condition	0
- child_under_6_months	0
- health_worker	0
- health_insurance	1

In [13]:
# Contingency table: concern level (rows) vs. knowledge level (columns),
# without row/column totals.
pd.crosstab(
    index=train_features['h1n1_concern'],
    columns=train_features['h1n1_knowledge'],
    margins=False,
)

h1n1_knowledge,0.0,1.0,2.0
h1n1_concern,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,663,1534,1071
1.0,581,4871,2673
2.0,639,5913,3994
3.0,574,2257,1739


# Simple model with binary variables

In [18]:
# Select the 13 binary features by name rather than by position.
# With respondent_id used as the index, the positional slice iloc[:, 3:16]
# is shifted by one: it drops behavioral_antiviral_meds and picks up
# opinion_h1n1_vacc_effective, which is not binary.
binary_cols = [
    'behavioral_antiviral_meds',
    'behavioral_avoidance',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
    'behavioral_touch_face',
    'doctor_recc_h1n1',
    'doctor_recc_seasonal',
    'chronic_med_condition',
    'child_under_6_months',
    'health_worker',
    'health_insurance',
]
training_feat_simple = train_features[binary_cols]
training_feat_simple.head()

Unnamed: 0,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance
0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,,1.0,0.0,0.0,
3,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,
4,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,


In [19]:
# Number of missing values per column of the simple feature set.
training_feat_simple.isna().sum(axis=0)

behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
dtype: int64

In [24]:
# Total number of rows, for comparison with the missing-value counts.
training_feat_simple.shape[0]

26707