In [2]:
!pip install pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
from helper_functions import number_na, visualize_unique_values, trans_str_to_int
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
from sklearn.metrics import accuracy_score, hamming_loss, log_loss, f1_score, roc_auc_score, roc_curve, auc, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import warnings
warnings.filterwarnings("ignore")



### Viewing Data, handling categorical data, Imputing missing values

##### Initial overview of data


In [3]:
# Load the survey data and preview the first rows of the feature columns.
# Floats are displayed with three decimals from here on.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

df = pd.read_csv("H1N1_Flu_Vaccines.csv")
print(df.shape)

# Every column except the last two is kept as a feature column.
features = df.iloc[:, :-2]
features.head()

(26707, 38)


Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [4]:
# Split the column names into feature names and target names.
all_columns = list(df.columns)

# The last two columns are the targets; everything before them is a feature.
labels_list = all_columns[-2:]
features_list = all_columns[:-2]

# Show the feature names without the first column (35 names),
# then the two target names.
print(features_list[1:])
print('\n')
print(labels_list)

['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa', 'household_adults', 'household_children', 'employment_industry', 'employment_occupation']


['h1n1_vaccine', 'seasonal_vaccine']


In [25]:
# Initial class balance of the two target variables.

# The two target columns sit at positions 36 and 37 of the frame.
targets = df.iloc[:, 36:38]

h1n1_label, seasonal_label = labels_list[0], labels_list[1]
count_h1n1_vaccine = df[h1n1_label].value_counts()
count_seasonal_vaccine = df[seasonal_label].value_counts()

# Encoding of the targets: 0 = No, 1 = Yes
print(count_h1n1_vaccine)
print(count_seasonal_vaccine)

print("\n")

# Share of respondents in class 0 (did not get the vaccine), in percent
total_rows = len(df)
print(f"didn't get h1n1 vaccine {(count_h1n1_vaccine[0])/total_rows*100:.2f}%")
print(f"didn't get seasonal vaccine {(count_seasonal_vaccine[0])/total_rows*100:.2f}%")



0    21033
1     5674
Name: h1n1_vaccine, dtype: int64
0    14272
1    12435
Name: seasonal_vaccine, dtype: int64


didn't get h1n1 vaccine 78.75%
didn't get seasonal vaccine 53.44%


Note that the class distribution for H1N1 is uneven (roughly 79% of respondents did not get the vaccine). This needs to be dealt with when training models, by oversampling the training sets.


#### Checking dtypes in the data set and null-counts via the helper_function

In [5]:
# Print each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [7]:
# Initial summary statistics, transposed so that each column of df is one row
print(df.describe().T)

                                count      mean      std   min      25%  \
respondent_id               26707.000 13353.000 7709.791 0.000 6676.500   
h1n1_concern                26615.000     1.618    0.910 0.000    1.000   
h1n1_knowledge              26591.000     1.263    0.618 0.000    1.000   
behavioral_antiviral_meds   26636.000     0.049    0.216 0.000    0.000   
behavioral_avoidance        26499.000     0.726    0.446 0.000    0.000   
behavioral_face_mask        26688.000     0.069    0.253 0.000    0.000   
behavioral_wash_hands       26665.000     0.826    0.379 0.000    1.000   
behavioral_large_gatherings 26620.000     0.359    0.480 0.000    0.000   
behavioral_outside_home     26625.000     0.337    0.473 0.000    0.000   
behavioral_touch_face       26579.000     0.677    0.468 0.000    0.000   
doctor_recc_h1n1            24547.000     0.220    0.414 0.000    0.000   
doctor_recc_seasonal        24547.000     0.330    0.470 0.000    0.000   
chronic_med_condition    

In [8]:
# Count fully duplicated rows (rows identical to an earlier row in every column)
df.duplicated().sum()

0

In [9]:
# Summarise the missing values per column with the number_na helper from
# helper_functions.py.
#
# The result is stored under the name `number_na` (later cells read it as a
# table), which shadows the imported function of the same name. Importing the
# helper under an alias inside this cell keeps the cell re-runnable: without
# it, a second run would try to call the result table and fail.
from helper_functions import number_na as count_na

check_na = df  # plain alias of df, not a copy
number_na = count_na(check_na)
number_na

Unnamed: 0,Count of NaN,Percentage (%) of NaN
respondent_id,0,0.0
h1n1_concern,92,0.344
h1n1_knowledge,116,0.434
behavioral_antiviral_meds,71,0.266
behavioral_avoidance,208,0.779
behavioral_face_mask,19,0.071
behavioral_wash_hands,42,0.157
behavioral_large_gatherings,87,0.326
behavioral_outside_home,82,0.307
behavioral_touch_face,128,0.479


Notice that certain columns have a very high percentage of NaNs. These need to be taken care of.

In [10]:
# Keep only the rows of the NaN summary whose missing share exceeds 20%
na_share = number_na['Percentage (%) of NaN']
more_20_na = number_na.loc[na_share > 20]
more_20_na

Unnamed: 0,Count of NaN,Percentage (%) of NaN
health_insurance,12274,45.958
employment_industry,13330,49.912
employment_occupation,13470,50.436


In [11]:
# Fill in missing values for the columns that are NOT listed in more_20_na
# (i.e. those with at most 20% NaNs); the remaining columns are skipped here.
#
# The result of fillna() is assigned back to the column instead of calling
# df[column].fillna(..., inplace=True): the in-place form acts on an
# intermediate Series, is deprecated in pandas 2.x and leaves df unchanged
# when pandas Copy-on-Write is active.

# categorical (object) columns: fill with the mode
categ = df.select_dtypes(include='object').columns
print(categ)

for column in categ:
    if column not in more_20_na.index:
        df[column] = df[column].fillna(df[column].mode()[0])

# numerical columns: fill with the median; respondent_id is skipped explicitly.
# NOTE: this selection also contains the two target columns (see the printed
# index); they have no missing values in this data set, so they are unchanged.
numer = df.select_dtypes(include=['int64','float64']).columns
print(numer)

for column in numer:
    if column not in more_20_na.index and column != 'respondent_id':
        df[column] = df[column].fillna(df[column].median())

# check again for missing values
df.isnull().sum()

Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa', 'employment_industry', 'employment_occupation'],
      dtype='object')
Index(['respondent_id', 'h1n1_concern', 'h1n1_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
       'household_children', 'h1n1_vaccine', 'seasonal_vaccine'],
      dtype='object')


respondent_id                      0
h1n1_concern                       0
h1n1_knowledge                     0
behavioral_antiviral_meds          0
behavioral_avoidance               0
behavioral_face_mask               0
behavioral_wash_hands              0
behavioral_large_gatherings        0
behavioral_outside_home            0
behavioral_touch_face              0
doctor_recc_h1n1                   0
doctor_recc_seasonal               0
chronic_med_condition              0
child_under_6_months               0
health_worker                      0
health_insurance               12274
opinion_h1n1_vacc_effective        0
opinion_h1n1_risk                  0
opinion_h1n1_sick_from_vacc        0
opinion_seas_vacc_effective        0
opinion_seas_risk                  0
opinion_seas_sick_from_vacc        0
age_group                          0
education                          0
race                               0
sex                                0
income_poverty                     0
m

In [12]:
# Imputation for the columns with more than 20% NaNs:
# health_insurance, employment_industry, employment_occupation

# health_insurance: make a new category, 2 = missing info.
# The filled Series is assigned back to the column; calling fillna with
# inplace=True on df['health_insurance'] is chained assignment, which is
# deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['health_insurance'] = df['health_insurance'].fillna(2)

# Rows whose employment_status is 'Unemployed' or 'Not in Labor Force' get
# "unemployed" as both employment_industry and employment_occupation.
not_working = df['employment_status'].isin(['Unemployed', 'Not in Labor Force'])
df.loc[not_working, ['employment_industry', 'employment_occupation']] = 'unemployed'

# remaining missing values per column
df.isnull().sum()

respondent_id                     0
h1n1_concern                      0
h1n1_knowledge                    0
behavioral_antiviral_meds         0
behavioral_avoidance              0
behavioral_face_mask              0
behavioral_wash_hands             0
behavioral_large_gatherings       0
behavioral_outside_home           0
behavioral_touch_face             0
doctor_recc_h1n1                  0
doctor_recc_seasonal              0
chronic_med_condition             0
child_under_6_months              0
health_worker                     0
health_insurance                  0
opinion_h1n1_vacc_effective       0
opinion_h1n1_risk                 0
opinion_h1n1_sick_from_vacc       0
opinion_seas_vacc_effective       0
opinion_seas_risk                 0
opinion_seas_sick_from_vacc       0
age_group                         0
education                         0
race                              0
sex                               0
income_poverty                    0
marital_status              

In [15]:
# Fill the final missing values in employment_industry and
# employment_occupation with the mode.
#
# health_insurance is excluded by name rather than by its position in
# more_20_na.index, so the loop does not depend on the row order of that table.
remaining_cols = [col for col in more_20_na.index if col != 'health_insurance']

for column in remaining_cols:
    # assign back instead of inplace=True on df[column] (chained assignment
    # is deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write)
    df[column] = df[column].fillna(df[column].mode()[0])

# check again missing values
df.isnull().sum()

respondent_id                  0
h1n1_concern                   0
h1n1_knowledge                 0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_h1n1               0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_h1n1_vacc_effective    0
opinion_h1n1_risk              0
opinion_h1n1_sick_from_vacc    0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_status              0
hhs_geo_re