In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [3]:
df = pd.read_csv('training_set_features.csv')
df.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [5]:
dfc= df.copy()
dfc.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [7]:
colms= ['employment_occupation','respondent_id', 'employment_industry', 'rent_or_own', 'marital_status', 'sex', 'race', 'education', 'opinion_seas_risk', 'opinion_xyz_risk', 'behavioral_touch_face', 'behavioral_outside_home', 'xyz_knowledge']
dfc= dfc.drop(colms, axis=1)
dfc.shape

(26707, 23)

In [9]:
obj_dfc = dfc.select_dtypes(include=['object']).copy()
obj_dfc.head()

Unnamed: 0,age_group,income_poverty,employment_status,hhs_geo_region,census_msa
0,55 - 64 Years,Below Poverty,Not in Labor Force,oxchjgsf,Non-MSA
1,35 - 44 Years,Below Poverty,Employed,bhuqouqj,"MSA, Not Principle City"
2,18 - 34 Years,"<= $75,000, Above Poverty",Employed,qufhixun,"MSA, Not Principle City"
3,65+ Years,Below Poverty,Not in Labor Force,lrircsnp,"MSA, Principle City"
4,45 - 54 Years,"<= $75,000, Above Poverty",Employed,qufhixun,"MSA, Not Principle City"


In [11]:
obj_dfc['employment_status'].fillna('Employed', inplace=True)

In [13]:
obj_dfc["employment_status"].value_counts()

employment_status
Employed              15023
Not in Labor Force    10231
Unemployed             1453
Name: count, dtype: int64

In [15]:
obj_dfc["income_poverty"].value_counts()

income_poverty
<= $75,000, Above Poverty    12777
> $75,000                     6810
Below Poverty                 2697
Name: count, dtype: int64

In [17]:
from sklearn.preprocessing import OrdinalEncoder

ord_enc = OrdinalEncoder()
obj_dfc["age_group"] = ord_enc.fit_transform(obj_dfc[["age_group"]])
obj_dfc["income_poverty"] = ord_enc.fit_transform(obj_dfc[["income_poverty"]])
obj_dfc["employment_status"] = ord_enc.fit_transform(obj_dfc[["employment_status"]])
obj_dfc["hhs_geo_region"] = ord_enc.fit_transform(obj_dfc[["hhs_geo_region"]])
obj_dfc["census_msa"] = ord_enc.fit_transform(obj_dfc[["census_msa"]])
obj_dfc.head()

Unnamed: 0,age_group,income_poverty,employment_status,hhs_geo_region,census_msa
0,3.0,2.0,1.0,8.0,2.0
1,1.0,2.0,0.0,1.0,0.0
2,0.0,0.0,0.0,9.0,0.0
3,4.0,2.0,1.0,5.0,1.0
4,2.0,0.0,0.0,9.0,0.0


In [19]:
obj_dfc["employment_status"].value_counts()

employment_status
0.0    15023
1.0    10231
2.0     1453
Name: count, dtype: int64

In [21]:
obj_dfc["income_poverty"].value_counts()

income_poverty
0.0    12777
1.0     6810
2.0     2697
Name: count, dtype: int64

In [37]:
if obj_dfc["employment_status"].any()==0.0:
     obj_dfc['income_poverty'].fillna(0.0, inplace=True)
if obj_dfc["employment_status"].any()==1.0:
     obj_dfc['income_poverty'].fillna(1.0, inplace=True)
if obj_dfc["employment_status"].any()==2.0:
     obj_dfc['income_poverty'].fillna(2.0, inplace=True)

obj_dfc.isnull().sum()

age_group            0
income_poverty       0
employment_status    0
hhs_geo_region       0
census_msa           0
dtype: int64

In [39]:
a=['age_group' ,'income_poverty' ,'employment_status' ,'hhs_geo_region' ,'census_msa']
dfc= dfc.drop(a,axis = 1)

In [41]:
dfc=dfc.join(obj_dfc)
dfc.head()

Unnamed: 0,xyz_concern,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,doctor_recc_xyz,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,...,opinion_xyz_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_sick_from_vacc,household_adults,household_children,age_group,income_poverty,employment_status,hhs_geo_region,census_msa
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,0.0,0.0,3.0,2.0,1.0,8.0,2.0
1,3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,4.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,,,1.0,0.0,...,1.0,4.0,2.0,2.0,0.0,0.0,0.0,0.0,9.0,0.0
3,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,5.0,5.0,1.0,0.0,0.0,4.0,2.0,1.0,5.0,1.0
4,2.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,2.0,3.0,4.0,1.0,0.0,2.0,0.0,0.0,9.0,0.0


In [43]:
dfc['xyz_concern'].fillna(2, inplace=True)

In [45]:
dfc.isnull().sum()

xyz_concern                        0
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_sick_from_vacc      537
household_adults                 249
household_children               249
age_group                          0
income_poverty                     0
employment_status                  0
hhs_geo_region                     0
census_msa                         0
dtype: int64

In [48]:
dfc["xyz_concern"].value_counts()

xyz_concern
2.0    10667
1.0     8153
3.0     4591
0.0     3296
Name: count, dtype: int64

In [52]:
if dfc["xyz_concern"].any()==0.0:
     dfc['doctor_recc_xyz'].fillna(0.0, inplace=True)
     dfc['doctor_recc_seasonal'].fillna(0.0, inplace=True)
if dfc["xyz_concern"].any()==1.0:
     dfc['doctor_recc_xyz'].fillna(0.0, inplace=True)
     dfc['doctor_recc_seasonal'].fillna(0.0, inplace=True)
if dfc["xyz_concern"].any()==2.0:
     dfc['doctor_recc_xyz'].fillna(1.0, inplace=True)
     dfc['doctor_recc_seasonal'].fillna(1.0, inplace=True)
if dfc["xyz_concern"].any()==3.0:
     dfc['doctor_recc_xyz'].fillna(1.0, inplace=True)
     dfc['doctor_recc_seasonal'].fillna(1.0, inplace=True)

dfc.isnull().sum()

xyz_concern                        0
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
doctor_recc_xyz                    0
doctor_recc_seasonal               0
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_sick_from_vacc      537
household_adults                 249
household_children               249
age_group                          0
income_poverty                     0
employment_status                  0
hhs_geo_region                     0
census_msa                         0
dtype: int64

In [56]:
if dfc["income_poverty"].any()==0.0:
     dfc['health_insurance'].fillna(0.0, inplace=True)
if dfc["income_poverty"].any()==1.0:
     dfc['health_insurance'].fillna(1.0, inplace=True)
if dfc["income_poverty"].any()==2.0:
     dfc['health_insurance'].fillna(0.0, inplace=True)

dfc.isnull().sum()

xyz_concern                      0
behavioral_antiviral_meds       71
behavioral_avoidance           208
behavioral_face_mask            19
behavioral_wash_hands           42
behavioral_large_gatherings     87
doctor_recc_xyz                  0
doctor_recc_seasonal             0
chronic_med_condition          971
child_under_6_months           820
health_worker                  804
health_insurance                 0
opinion_xyz_vacc_effective     391
opinion_xyz_sick_from_vacc     395
opinion_seas_vacc_effective    462
opinion_seas_sick_from_vacc    537
household_adults               249
household_children             249
age_group                        0
income_poverty                   0
employment_status                0
hhs_geo_region                   0
census_msa                       0
dtype: int64

In [58]:
dfc = dfc.fillna(df.mode().iloc[0])
dfc.isnull().sum()

xyz_concern                    0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
doctor_recc_xyz                0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_xyz_vacc_effective     0
opinion_xyz_sick_from_vacc     0
opinion_seas_vacc_effective    0
opinion_seas_sick_from_vacc    0
household_adults               0
household_children             0
age_group                      0
income_poverty                 0
employment_status              0
hhs_geo_region                 0
census_msa                     0
dtype: int64

In [60]:
data=pd.read_csv('training_set_labels.csv')
data= data.drop('respondent_id', axis=1)
data.head()

Unnamed: 0,xyz_vaccine,seasonal_vaccine
0,0,0
1,0,1
2,0,0
3,0,1
4,0,0


In [84]:
dfc['xyz_vaccine']=""
dfc['seasonal_vaccine']=""
dfc.head()

Unnamed: 0,xyz_concern,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,doctor_recc_xyz,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,...,opinion_seas_sick_from_vacc,household_adults,household_children,age_group,income_poverty,employment_status,hhs_geo_region,census_msa,xyz_vaccine,seasonal_vaccine
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,3.0,2.0,1.0,8.0,2.0,,
1,3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,,
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2.0,2.0,0.0,0.0,0.0,0.0,9.0,0.0,,
3,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,4.0,2.0,1.0,5.0,1.0,,
4,2.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,4.0,1.0,0.0,2.0,0.0,0.0,9.0,0.0,,


In [106]:
X1= dfc.drop(['doctor_recc_seasonal','opinion_seas_vacc_effective','opinion_seas_sick_from_vacc','xyz_vaccine','seasonal_vaccine'], axis=1)
y1=data.xyz_vaccine


In [110]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1,y1, test_size=0.2)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

model.fit(X1_train, y1_train)
model.score(X1_test, y1_test)

0.8292774241856983

In [68]:
# from sklearn.neighbors import KNeighborsClassifier
# model2 = KNeighborsClassifier(n_neighbors=4)
# model2.fit(X1_train, y1_train)
# model2.score(X1_test, y1_test)

0.7929614376637963

In [70]:
# from sklearn.svm import SVC  
# clf = SVC(kernel='linear') 
 
# clf.fit(X1, y1) 
# clf.score(X1_test, y1_test)

0.8023212280044927

In [112]:
X2= dfc.drop(['opinion_xyz_vacc_effective','opinion_xyz_sick_from_vacc','xyz_concern','doctor_recc_xyz','xyz_vaccine','seasonal_vaccine'], axis=1)
y2=data.seasonal_vaccine


In [114]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2,y2, test_size=0.2)

from sklearn.linear_model import LogisticRegression
model2 = LogisticRegression()

model2.fit(X2_train, y2_train)
model2.score(X2_test, y2_test)

0.742418569824036

In [82]:
# pred_prob1 = model.predict_proba(X1_test)
# pred_prob2 = model2.predict_proba(X2_test)

# from sklearn.metrics import roc_auc_score

# auc_score1 = roc_auc_score(y_test, pred_prob1)
# auc_score2 = roc_auc_score(y_test, pred_prob2)

# print(auc_score1, auc_score2)

0.5035887964836041 0.4984615047848642


In [118]:
pr1= model.predict_proba(X1)[:,1]
pr2= model2.predict_proba(X2)[:,1]

result= pd.DataFrame({'xyz_vaccine':pr1, 'seasonal_vaccine':pr2})
result

Unnamed: 0,xyz_vaccine,seasonal_vaccine
0,0.061648,0.077037
1,0.215429,0.204436
2,0.048467,0.196504
3,0.073284,0.924353
4,0.047648,0.141935
...,...,...
26702,0.063241,0.564792
26703,0.599566,0.861603
26704,0.130163,0.602732
26705,0.030113,0.015871


In [122]:
result = result.join(df['respondent_id'])
result.head()

Unnamed: 0,xyz_vaccine,seasonal_vaccine,respondent_id
0,0.061648,0.077037,0
1,0.215429,0.204436,1
2,0.048467,0.196504,2
3,0.073284,0.924353,3
4,0.047648,0.141935,4


In [124]:
result.to_csv('result.csv', index=False)