In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline


In [2]:
sub_format = pd.read_csv('data/submission_format.csv') 
sub_format

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.5,0.7
1,26708,0.5,0.7
2,26709,0.5,0.7
3,26710,0.5,0.7
4,26711,0.5,0.7
...,...,...,...
26703,53410,0.5,0.7
26704,53411,0.5,0.7
26705,53412,0.5,0.7
26706,53413,0.5,0.7


In [3]:
tr_features = pd.read_csv('data/training_set_features.csv')

In [5]:
tr_labels =pd.read_csv('data/training_set_labels.csv')
tr_labels

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0
...,...,...,...
26702,26702,0,0
26703,26703,0,0
26704,26704,0,1
26705,26705,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   respondent_id     26707 non-null  int64
 1   h1n1_vaccine      26707 non-null  int64
 2   seasonal_vaccine  26707 non-null  int64
dtypes: int64(3)
memory usage: 626.1 KB


In [12]:
te_features=pd.read_csv('data/test_set_features.csv')

In [14]:
tr_features.dtypes

respondent_id                    int64
h1n1_concern                   float64
h1n1_knowledge                 float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_h1n1               float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_h1n1_vacc_effective    float64
opinion_h1n1_risk              float64
opinion_h1n1_sick_from_vacc    float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                      

In [17]:
tr_features.education.value_counts()

College Graduate    10097
Some College         7043
12 Years             5797
< 12 Years           2363
Name: education, dtype: int64

In [19]:
tr_features.age_group.value_counts()

65+ Years        6843
55 - 64 Years    5563
45 - 54 Years    5238
18 - 34 Years    5215
35 - 44 Years    3848
Name: age_group, dtype: int64

In [30]:
len(tr_features.count())

respondent_id                  26707
h1n1_concern                   26615
h1n1_knowledge                 26591
behavioral_antiviral_meds      26636
behavioral_avoidance           26499
behavioral_face_mask           26688
behavioral_wash_hands          26665
behavioral_large_gatherings    26620
behavioral_outside_home        26625
behavioral_touch_face          26579
doctor_recc_h1n1               24547
doctor_recc_seasonal           24547
chronic_med_condition          25736
child_under_6_months           25887
health_worker                  25903
health_insurance               14433
opinion_h1n1_vacc_effective    26316
opinion_h1n1_risk              26319
opinion_h1n1_sick_from_vacc    26312
opinion_seas_vacc_effective    26245
opinion_seas_risk              26193
opinion_seas_sick_from_vacc    26170
age_group                      26707
education                      25300
race                           26707
sex                            26707
income_poverty                 22284
m

In [31]:
len(tr_features)

26707

In [32]:
(tr_features.isna().sum())/len(tr_features)

respondent_id                  0.000000
h1n1_concern                   0.003445
h1n1_knowledge                 0.004343
behavioral_antiviral_meds      0.002658
behavioral_avoidance           0.007788
behavioral_face_mask           0.000711
behavioral_wash_hands          0.001573
behavioral_large_gatherings    0.003258
behavioral_outside_home        0.003070
behavioral_touch_face          0.004793
doctor_recc_h1n1               0.080878
doctor_recc_seasonal           0.080878
chronic_med_condition          0.036358
child_under_6_months           0.030704
health_worker                  0.030104
health_insurance               0.459580
opinion_h1n1_vacc_effective    0.014640
opinion_h1n1_risk              0.014528
opinion_h1n1_sick_from_vacc    0.014790
opinion_seas_vacc_effective    0.017299
opinion_seas_risk              0.019246
opinion_seas_sick_from_vacc    0.020107
age_group                      0.000000
education                      0.052683
race                           0.000000


In [37]:
tr_features.health_insurance.value_counts()

1.0    12697
0.0     1736
Name: health_insurance, dtype: int64

In [42]:
tr_features['health_insurance'].fillna(value=0).value_counts()

0.0    14010
1.0    12697
Name: health_insurance, dtype: int64

In [80]:
tr_features

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,
26703,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea
26704,26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,,Not Married,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,
26705,26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg


In [None]:
pipe = Pipeline (steps =[('ohe',OneHotEncoder()),
                        ('ss',Standard)])

In [76]:
from sklearn.base import BaseEstimator, TransformerMixin

class RemoveNaTransformer(TransformerMixin):
    
    def __init__(self):
        super().__init__()
        
    def fit(self, *args, **kwargs):
        return "passthrough"
    
    
    def fit_transform(self, *args, **kwargs):
        return self.transform(*args, **kwargs)
    
    def transform(self, X, y=None, **fit_params):
        X.
        print(X)
        return X

In [109]:
GrabNumeric = FunctionTransformer(lambda df: df.select_dtypes(include=['float', 'int']))

In [110]:
GrabObject = FunctionTransformer(lambda df: df.select_dtypes(include=['object']))

In [136]:
def ohe(df):
    cols = df.select_dtypes(include=['object']).columns
    temp_df = df
    for col in cols:
        encoder = OneHotEncoder(handle_unknown="ignore")
        fitted_arr = encoder.fit_transform(temp_df[[col]]).toarray()
        col_names = [f"{col}_{f}" for f in encoder.get_feature_names_out()]
        encoded_values = pd.DataFrame(fitted_arr, columns=col_names)
        temp_df = temp_df.join(encoded_values)
    return temp_df
OneHotEncoderWithColNames = FunctionTransformer(ohe)

In [142]:
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

subpipe_num = Pipeline(steps=[('Grab_numeric',GrabNumeric),
                              ('num_impute', SimpleImputer()),
                              ('ss', StandardScaler())])


subpipe_cat = Pipeline(steps=[# ('Grab_object',GrabObject),
                              # ('cat_impute', SimpleImputer(strategy='most_frequent')),
                              ('ohe', OneHotEncoderWithColNames)])


In [144]:
ct = ColumnTransformer(transformers = [
    ('subpipe_num', subpipe_num, tr_features.columns),
   #  ('subpipe_cat', subpipe_cat, tr_features.columns)
])

In [145]:
ct.fit(tr_features)
array = ct.transform(tr_features)

In [148]:
subpipe_cat.transform(tr_features)

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,employment_occupation_employment_occupation_rcertsgn,employment_occupation_employment_occupation_tfqavkke,employment_occupation_employment_occupation_ukymxvdu,employment_occupation_employment_occupation_uqqtjvyb,employment_occupation_employment_occupation_vlluhbov,employment_occupation_employment_occupation_xgwztkwe,employment_occupation_employment_occupation_xqwwgdyp,employment_occupation_employment_occupation_xtkaffoo,employment_occupation_employment_occupation_xzmlyyjv,employment_occupation_employment_occupation_nan
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
26703,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26704,26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
26705,26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [140]:
pd.DataFrame(array)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,139,140,141,142,143,144,145,146,147,148
0,-1.73199,-0.680609,-2.04693,-0.226911,-1.63255,-0.272298,-2.17758,-0.749009,1.4038,0.691971,...,0,0,0,0,0,0,0,0,0,1
1,-1.73186,1.52028,1.19565,-0.226911,0.617345,-0.272298,0.459948,-0.749009,1.4038,0.691971,...,0,0,0,0,0,1,0,0,0,0
2,-1.73173,-0.680609,-0.425641,-0.226911,0.617345,-0.272298,-2.17758,-0.749009,-0.714548,-1.45211,...,0,0,0,0,0,0,0,1,0,0
3,-1.7316,-0.680609,-0.425641,-0.226911,0.617345,-0.272298,0.459948,1.33946,-0.714548,-1.45211,...,0,0,0,0,0,0,0,0,0,1
4,-1.73147,0.419835,-0.425641,-0.226911,0.617345,-0.272298,0.459948,1.33946,-0.714548,0.691971,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,1.73147,0.419835,-2.04693,-0.226911,0.617345,-0.272298,-2.17758,-0.749009,1.4038,-1.45211,...,0,0,0,0,0,0,0,0,0,1
26703,1.7316,-0.680609,1.19565,-0.226911,0.617345,-0.272298,0.459948,-0.749009,-0.714548,-1.45211,...,0,0,0,0,0,0,0,0,0,0
26704,1.73173,0.419835,1.19565,-0.226911,0.617345,3.67506,0.459948,1.33946,-0.714548,0.691971,...,0,0,0,0,0,0,0,0,0,1
26705,1.73186,-0.680609,-0.425641,-0.226911,-1.63255,-0.272298,-2.17758,-0.749009,-0.714548,0,...,0,0,0,0,0,0,0,0,0,0
