<a href="https://colab.research.google.com/github/mohanpartha/ML_preprocessing/blob/master/flu_all.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [97]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
df_train = pd.read_csv('/content/drive/My Drive/data/flu/training_set_features.csv',sep=',')
df_test = pd.read_csv('/content/drive/My Drive/data/flu/test_set_features.csv',sep=',')
df_y = pd.read_csv('/content/drive/My Drive/data/flu/training_set_labels.csv',sep=',')

In [99]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [0]:
df_train.set_index('respondent_id',inplace=True)

In [0]:
discard_cols = ["employment_industry", "employment_occupation", "health_insurance"]

In [0]:
df_train.drop(columns = discard_cols, inplace=True)

In [103]:
df_train['h1n1_concern'].value_counts()

2.0    10575
1.0     8153
3.0     4591
0.0     3296
Name: h1n1_concern, dtype: int64

In [104]:
df_train['child_under_6_months'].value_counts()

0.0    23749
1.0     2138
Name: child_under_6_months, dtype: int64

In [105]:
df_train['age_group'].value_counts()

65+ Years        6843
55 - 64 Years    5563
45 - 54 Years    5238
18 - 34 Years    5215
35 - 44 Years    3848
Name: age_group, dtype: int64

In [106]:
df_train['opinion_h1n1_vacc_effective'].value_counts()

4.0    11683
5.0     7166
3.0     4723
2.0     1858
1.0      886
Name: opinion_h1n1_vacc_effective, dtype: int64

In [0]:
fill_na_values = df_train.mode().iloc[0]

In [108]:
fill_na_values

h1n1_concern                                           2
h1n1_knowledge                                         1
behavioral_antiviral_meds                              0
behavioral_avoidance                                   1
behavioral_face_mask                                   0
behavioral_wash_hands                                  1
behavioral_large_gatherings                            0
behavioral_outside_home                                0
behavioral_touch_face                                  1
doctor_recc_h1n1                                       0
doctor_recc_seasonal                                   0
chronic_med_condition                                  0
child_under_6_months                                   0
health_worker                                          0
opinion_h1n1_vacc_effective                            4
opinion_h1n1_risk                                      2
opinion_h1n1_sick_from_vacc                            2
opinion_seas_vacc_effective    

In [0]:
df_train = df_train.fillna(fill_na_values)

In [0]:
dummy_cols = ["age_group", "education", 
              "race", "sex", 
              "income_poverty", "marital_status",
             "rent_or_own", "employment_status",
              "hhs_geo_region", "census_msa",
              "h1n1_concern", "h1n1_knowledge",
              "opinion_h1n1_vacc_effective",
              "opinion_h1n1_risk","opinion_h1n1_sick_from_vacc",
              "opinion_seas_vacc_effective","opinion_seas_risk"
             ]

In [0]:
df_X = pd.concat([df_train.drop(columns=dummy_cols), 
                pd.get_dummies(df_train[dummy_cols]),], axis=1)

In [0]:
logreg_all_h1n1 = LogisticRegression(solver='liblinear', random_state=3)
logreg_all_season = LogisticRegression(solver='liblinear', random_state=4)

from sklearn.metrics import accuracy_score, confusion_matrix

In [0]:
X_train, X_valid, y_train, y_valid = train_test_split(df_X, df_y, test_size=0.2)

In [114]:
print(X_train.shape, X_valid.shape, y_train, y_valid)

(21365, 60) (5342, 60)        respondent_id  h1n1_vaccine  seasonal_vaccine
20649          20649             0                 0
20687          20687             0                 0
1130            1130             0                 0
22662          22662             0                 0
9800            9800             0                 0
...              ...           ...               ...
4368            4368             0                 0
17836          17836             1                 0
12981          12981             1                 1
1755            1755             0                 0
780              780             1                 1

[21365 rows x 3 columns]        respondent_id  h1n1_vaccine  seasonal_vaccine
10902          10902             0                 0
21239          21239             0                 1
5320            5320             0                 0
11820          11820             1                 1
7491            7491             0                

In [115]:
logreg_all_h1n1.fit(X_train, y_train["h1n1_vaccine"])
logreg_all_season.fit(X_train, y_train["seasonal_vaccine"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=4, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
def compute_accuracy(model, X, y):
    y_pred = model.predict(X)
    return accuracy_score(y, y_pred)

In [117]:
print("H1N1 - Train", 
      compute_accuracy(logreg_all_h1n1, X_train, y_train["h1n1_vaccine"]))

print("H1N1 - Valid", 
      compute_accuracy(logreg_all_h1n1, X_valid, y_valid["h1n1_vaccine"]))

print("Seas - Train", 
      compute_accuracy(logreg_all_season, X_train, y_train["seasonal_vaccine"]))

print("Seas - Valid", 
      compute_accuracy(logreg_all_season, X_valid, y_valid["seasonal_vaccine"]))

H1N1 - Train 0.8349169201965831
H1N1 - Valid 0.8354548858105578
Seas - Train 0.7758483501053124
Seas - Valid 0.7852864095844253


In [0]:
# Prepare test data
df_test = pd.read_csv('/content/drive/My Drive/data/flu/test_set_features.csv',sep=',')
df_test.set_index("respondent_id", inplace=True)

In [129]:
df_test.columns

Index(['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')

In [131]:
# Drop columns
df_test = df_test.drop(columns=discard_cols)
df_test = df_test.fillna(fill_na_values)

KeyError: ignored

In [0]:
df_X_test = pd.concat([df_test.drop(columns=dummy_cols),
           pd.get_dummies(df_test[dummy_cols]),
          ], axis=1)

In [133]:
df_X_test.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 26708 entries, 26707 to 53414
Data columns (total 60 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   behavioral_antiviral_meds                 26708 non-null  float64
 1   behavioral_avoidance                      26708 non-null  float64
 2   behavioral_face_mask                      26708 non-null  float64
 3   behavioral_wash_hands                     26708 non-null  float64
 4   behavioral_large_gatherings               26708 non-null  float64
 5   behavioral_outside_home                   26708 non-null  float64
 6   behavioral_touch_face                     26708 non-null  float64
 7   doctor_recc_h1n1                          26708 non-null  float64
 8   doctor_recc_seasonal                      26708 non-null  float64
 9   chronic_med_condition                     26708 non-null  float64
 10  child_under_6_months          

In [0]:
df_X_test['age_group_35 - 44 Years'].value_counts()

In [0]:
y_pred_h1n1 = logreg_all_h1n1.predict_proba(df_X_test)

In [0]:
y_pred_seas = logreg_all_season.predict_proba(df_X_test)

In [139]:
logreg_all_h1n1.predict(df_X_test)

array([0, 0, 0, ..., 0, 0, 1])

In [140]:
y_pred_h1n1

array([[0.91751318, 0.08248682],
       [0.95353344, 0.04646656],
       [0.54362198, 0.45637802],
       ...,
       [0.87489936, 0.12510064],
       [0.93385685, 0.06614315],
       [0.47220463, 0.52779537]])

In [141]:
# Prob of H1N1 Vaccine
y_pred_h1n1[:, 1]

array([0.08248682, 0.04646656, 0.45637802, ..., 0.12510064, 0.06614315,
       0.52779537])

In [142]:
# Prob of Seas Vaccine
y_pred_seas[:, 1]

array([0.23579964, 0.05021137, 0.63485446, ..., 0.19962665, 0.37634272,
       0.46393961])

In [0]:
sub_df = pd.read_csv('/content/drive/My Drive/data/flu/submission_format.csv')

In [146]:
sub_df

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.5,0.7
1,26708,0.5,0.7
2,26709,0.5,0.7
3,26710,0.5,0.7
4,26711,0.5,0.7
...,...,...,...
26703,53410,0.5,0.7
26704,53411,0.5,0.7
26705,53412,0.5,0.7
26706,53413,0.5,0.7


In [0]:
results_df = pd.DataFrame({
    "respondent_id": df_X_test.index,
    "h1n1_vaccine": y_pred_h1n1[:, 1],
    "seasonal_vaccine": y_pred_seas[:, 1]
})

In [0]:
results_df.to_csv("/content/drive/My Drive/data/flu/submission_v1_rf.csv", index=False)

In [0]:
imp_constant = SimpleImputer(strategy='constant', fill_value='missing')
ohe = OneHotEncoder()

In [0]:
imp_ohe = make_pipeline(imp_constant, ohe)
vect = CountVectorizer()
imp = SimpleImputer()

In [0]:
ct = make_column_transformer(
    (imp_ohe, features),
    (imp, features),
    remainder='passthrough')

In [0]:
logreg = LogisticRegression(solver='liblinear', random_state=11)

In [0]:
df_test = df_test[features]

In [22]:
df_test.columns

Index(['age_group', 'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_large_gatherings',
       'behavioral_outside_home', 'behavioral_touch_face',
       'behavioral_wash_hands', 'census_msa', 'child_under_6_months',
       'chronic_med_condition', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'education', 'employment_industry', 'employment_occupation',
       'employment_status', 'h1n1_concern', 'h1n1_knowledge',
       'health_insurance', 'health_worker', 'hhs_geo_region',
       'household_adults', 'household_children', 'income_poverty',
       'marital_status', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
       'opinion_h1n1_vacc_effective', 'opinion_seas_risk',
       'opinion_seas_sick_from_vacc', 'opinion_seas_vacc_effective', 'race',
       'rent_or_own', 'sex'],
      dtype='object')

In [0]:
pipe = make_pipeline(ct, logreg)
pipe.fit(df_X, df_y_h1n1)
pipe.predict(df_test)

In [48]:
df_test.columns

Index(['age_group', 'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_large_gatherings',
       'behavioral_outside_home', 'behavioral_touch_face',
       'behavioral_wash_hands', 'census_msa', 'child_under_6_months',
       'chronic_med_condition', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'education', 'employment_industry', 'employment_occupation',
       'employment_status', 'h1n1_concern', 'h1n1_knowledge',
       'health_insurance', 'health_worker', 'hhs_geo_region',
       'household_adults', 'household_children', 'income_poverty',
       'marital_status', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
       'opinion_h1n1_vacc_effective', 'opinion_seas_risk',
       'opinion_seas_sick_from_vacc', 'opinion_seas_vacc_effective', 'race',
       'rent_or_own', 'sex'],
      dtype='object')

In [49]:
df_train.columns

Index(['respondent_id', 'h1n1_concern', 'h1n1_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')

In [0]:
from sklearn.model_selection import cross_val_score
%time cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()