In [1]:
#imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    VotingClassifier,
    StackingClassifier,
)
from sklearn.metrics import plot_confusion_matrix, recall_score,accuracy_score, precision_score, f1_score
from sklearn.metrics import plot_roc_curve, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

In [2]:
#load the data 
# Both tables use respondent_id as their row index.
tr_features = pd.read_csv("data/training_set_features.csv", index_col="respondent_id")
tr_labels = pd.read_csv("data/training_set_labels.csv", index_col="respondent_id")

In [3]:
#class balance check
# Number of rows per value of the seasonal_vaccine label.
tr_labels.seasonal_vaccine.value_counts()

0    14272
1    12435
Name: seasonal_vaccine, dtype: int64

In [4]:
#.info

In [5]:
#.describe

In [6]:
#.shape

In [7]:
#.dtypes

In [8]:
#miss_value%

In [9]:
#feature exploring 

In [10]:
# Split the feature names by dtype: object columns are treated as categorical,
# every other dtype as numeric.
is_object_col = tr_features.dtypes == 'object'
num_cols = tr_features.columns[~is_object_col].values
cat_cols = tr_features.columns[is_object_col].values

In [11]:
#column mapping function 
def col_value_mapper(col, mapping):
    """Return a function that recodes column `col` of a DataFrame through `mapping`.

    The returned callable takes a DataFrame and returns a new DataFrame (via
    `assign`) in which every value of `col` is replaced by `mapping.get(value)`;
    the input frame is left untouched.

    NOTE(review): the next cell defines `col_value_mapper` again, and that later
    definition is the one in effect after Run All - keep only one of the two.
    """
    lookup = mapping.get
    return lambda frame: frame.assign(**{col: frame[col].apply(lookup)})

In [12]:
#column mapping function 
def col_value_mapper(col, mapping):
    """Build a DataFrame -> DataFrame function that recodes one column.

    Args:
      col: name of the column to recode.
      mapping: dict-like; each value in `col` is replaced by `mapping.get(value)`.

    Returns:
      A function that accepts a DataFrame and returns a copy (via `assign`)
      with `col` recoded; the input frame is not modified.
    """
    def update_df(df):
        recoded = df[col].apply(mapping.get)
        return df.assign(**{col: recoded})

    return update_df

In [None]:
# Raw age brackets as they appear in the data (missing values included).
# The mapped preview that used to be here needed `age_mean_map`, which is only
# defined in a later cell, so it raised NameError on Restart & Run All; the
# recoded column is already previewed after that definition.
tr_features["age_group"].value_counts(dropna=False)

In [15]:
#age_group map and function wrapper
# Numeric stand-in for each age_group label (presumably a representative age
# for the bracket - confirm the chosen values with the author).
age_mean_map = {
    '18 - 34 Years': 26,
    '35 - 44 Years': 40,
    '45 - 54 Years': 50,
    '55 - 64 Years': 60,
    '65+ Years': 70,
}
# Wrapped in a FunctionTransformer so the recoding can be used as a pipeline step.
age_group_transformer = FunctionTransformer(
    col_value_mapper("age_group", age_mean_map)
)

In [16]:
# Sanity check: apply the recoding directly and show the resulting column.
col_value_mapper("age_group", age_mean_map)(tr_features)["age_group"]

respondent_id
0        60
1        40
2        26
3        70
4        50
         ..
26702    70
26703    26
26704    60
26705    26
26706    70
Name: age_group, Length: 26707, dtype: int64

In [None]:
#income_poverty and function wrapper
# Integer codes 1-3 for the three income_poverty labels.
income_map = {
    'Below Poverty': 1,
    '<= $75,000, Above Poverty': 2,
    '> $75,000': 3,
}
# Wrapped in a FunctionTransformer so the recoding can be used as a pipeline step.
income_transformer = FunctionTransformer(
    col_value_mapper("income_poverty", income_map)
)

In [None]:
#Pipelines and column transformer assigning
# Columns recoded label -> number by their own transformers instead of being one-hot encoded.
seperate_processing_cols = ["age_group", "income_poverty"]

# Columns to leave out of the model entirely (currently none).
high_missing_cols = []
excluded_cols = set(high_missing_cols) | set(seperate_processing_cols)
cat_cols = [x for x in cat_cols if x not in excluded_cols]
num_cols = [x for x in num_cols if x not in excluded_cols]

# Recode age_group and income_poverty to numbers, then scale and mean-impute.
# Step names are kept exactly as they were (typo included) because they are
# part of parameter keys such as 'age_group_trasnformer__...'.
num_to_cat_preprocessing = Pipeline(steps=[
    ('age_group_trasnformer', age_group_transformer),
    ('income_transformer', income_transformer),
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='mean'))
])

# Numeric columns: scale, then fill missing values with the column mean.
numeric_preprocessing = Pipeline(steps=[
    ('standard_scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy='mean'))
])

# Categorical columns: missing values become their own 'missing' category,
# then every category is one-hot encoded.
catergoric_preprocessing = Pipeline(steps=[
    ('simple_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # handle_unknown='ignore': a category that is absent from a training fold
    # but present in the validation/eval data is encoded as all zeros instead
    # of raising during cross-validation.
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Output column order follows the transformer order below:
# recoded columns, then numeric columns, then the one-hot columns.
total_preprocessing = ColumnTransformer(
    transformers=[
        ('num_to_cat_preprocessing', num_to_cat_preprocessing, seperate_processing_cols),
        ('numeric', numeric_preprocessing, num_cols),
        ('categoric', catergoric_preprocessing, cat_cols),
    ],
    remainder="drop"
)

In [None]:
#train test split
# `X` and `y` were never defined in the notebook, so this cell raised NameError
# on a fresh kernel. The target is the same column the split is stratified on.
X = tr_features
y = tr_labels['seasonal_vaccine']

# train_test_split pairs rows by position, so both tables must list the
# respondents in the same order.
assert X.index.equals(y.index), "features and labels are not aligned on respondent_id"

X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=0.33,
    shuffle=True,
    stratify=y,
    random_state=42,  # fixed seed so the split (and every score below) is reproducible
)

In [None]:
#scoring function (cross_val)
class ModelWithCV():
    '''Structure to save the model and more easily see its crossvalidation

    Attributes:
      model: estimator (or pipeline) passed to `cross_val_score`.
      name: label used in the printed summary.
      X, y: default data used when `cross_validate` gets no explicit data.
      cv_results: per-fold scores; None until `cross_validate` has run.
      cv_mean, cv_median, cv_std: summary statistics of `cv_results`.
    '''
    
    def __init__(self, model, model_name, X, y, cv_now=True):
        '''
        Store the model and data; run cross-validation immediately if `cv_now`.

        Args:
          model: estimator to evaluate.
          model_name: label used in `print_cv_summary`.
          X, y: data stored as the defaults for `cross_validate`.
          cv_now: when True (default), call `cross_validate()` right away.
        '''
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()
        
    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and store the results on the object
        (`cv_results`, `cv_mean`, `cv_median`, `cv_std`). Returns None.
        
        Args: 
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)  
        '''
        # Compare against None explicitly: `X if X else ...` evaluates the
        # truth value of a DataFrame/ndarray, which raises ValueError.
        cv_X = self.X if X is None else X
        cv_y = self.y if y is None else y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)

        
    def print_cv_summary(self):
        '''Print mean ± std of the stored CV scores (requires a prior `cross_validate`).'''
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)

In [None]:
#scoring function (AUC-ROC)
class ModelWithAOCROC(ModelWithCV):
    """Structure for scoring classification models: cross-validation plus a ROC curve.

    Construction is inherited unchanged from ModelWithCV
    (model, model_name, X, y, cv_now=True).
    """

    def plt_roc_curve(self, X=None, y=None):
        """Plot the ROC curve of the stored model.

        The model must already be fitted; cross-validation alone does not fit
        `self.model` itself.

        Args:
          X: Optional; features to score. Otherwise use X from object.
          y: Optional; true labels. Otherwise use y from object.
        """
        # Compare against None explicitly: the truth value of a
        # DataFrame/ndarray is ambiguous and raises ValueError.
        roc_X = self.X if X is None else X
        roc_y = self.y if y is None else y
        # NOTE: plot_roc_curve was removed in scikit-learn 1.2; on newer
        # versions the replacement is RocCurveDisplay.from_estimator.
        plot_roc_curve(self.model, roc_X, roc_y)

In [None]:
#instantiate Dummy Classifier

In [None]:
#score with CV

In [None]:
#score with AUC-ROC

In [None]:
## Logistic Regression Models 

In [None]:
# Logistic regression on top of the shared preprocessing step.
log_reg_pipe = Pipeline(steps=[
    ('total_preprocessing', total_preprocessing),
    ('logreg', LogisticRegression(max_iter=1000)),
])

In [None]:
# Fit on the training split; the ROC plot below needs a fitted pipeline.
log_reg_pipe.fit(X=X_train, y=y_train)

In [None]:
# Wrap the pipeline for scoring; cv_now=True runs cross-validation immediately.
logreg_scoring = ModelWithAOCROC(
    model=log_reg_pipe,
    model_name='logreg',
    X=X_train,
    y=y_train,
    cv_now=True,
)

In [None]:
# ROC curve of the fitted logistic-regression pipeline on the training split.
logreg_scoring.plt_roc_curve();

In [None]:
#RFE

In [None]:
#Grid Search 

In [None]:
#second Log_reg function 

In [None]:
## Random Forest Classifier 

In [None]:
# Random forest on top of the shared preprocessing step (seeded for reproducibility).
rfc_pipe = Pipeline(steps=[
    ('total_preprocessing', total_preprocessing),
    ('rfc', RandomForestClassifier(random_state=42)),
])

In [None]:
# Fit the random-forest pipeline on the training split.
rfc_pipe.fit(X=X_train, y=y_train)

In [None]:
# Wrap the pipeline for scoring; cv_now=True runs cross-validation immediately.
rfc_scoring = ModelWithAOCROC(
    model=rfc_pipe,
    model_name='rfc_pipe',
    X=X_train,
    y=y_train,
    cv_now=True,
)

In [None]:
#scoring with Cross_val
# The parentheses matter: without them the cell only displays the bound
# method object and never prints the summary.
rfc_scoring.print_cv_summary()

In [None]:
#scoring with AUC-ROC
# ROC curve of the fitted random-forest pipeline on the training split.
rfc_scoring.plt_roc_curve();

In [None]:
#feature_importance
# The forest is trained on the ColumnTransformer output, not on X_train's raw
# columns, so importances must be paired with the transformed feature names:
# recoded columns first, then numeric columns, then the one-hot columns.
# (`rfc` was never defined; the fitted estimator lives inside the pipeline.)
fitted_rfc = rfc_pipe.named_steps['rfc']
fitted_preprocessing = rfc_pipe.named_steps['total_preprocessing']
one_hot = (
    fitted_preprocessing
    .named_transformers_['categoric']
    .named_steps['one_hot_encoder']
)
# get_feature_names_out replaced get_feature_names in scikit-learn 1.0.
if hasattr(one_hot, 'get_feature_names_out'):
    one_hot_names = one_hot.get_feature_names_out(cat_cols)
else:
    one_hot_names = one_hot.get_feature_names(cat_cols)

feature_names = [*seperate_processing_cols, *num_cols, *one_hot_names]
assert len(feature_names) == len(fitted_rfc.feature_importances_), (
    "feature names do not line up with the model's inputs"
)

feat_import = dict(zip(feature_names, fitted_rfc.feature_importances_))
feat_import

In [None]:
#grid Search

In [None]:
## Extra-Tree Classifier

In [None]:
# ExtraTreesClassifier is imported in the top imports cell (it was previously
# used here without being imported).
# Each tree is grown on a bootstrap sample of half the rows and considers
# sqrt(n_features) candidate features per split.
etc = ExtraTreesClassifier(
    max_features='sqrt',
    max_samples=.5,
    bootstrap=True,
    random_state=42,
)
etc_pipe = Pipeline([('total_preprocessing', total_preprocessing), ('etc', etc)])

In [None]:
# Fit the extra-trees pipeline on the training split.
etc_pipe.fit(X=X_train, y=y_train)

In [None]:
# Wrap the pipeline for scoring; cv_now=True runs cross-validation immediately.
etc_scoring = ModelWithAOCROC(
    model=etc_pipe,
    model_name='etc_pipe',
    X=X_train,
    y=y_train,
    cv_now=True,
)

In [None]:
#scoring with Cross_val
# The parentheses matter: without them the cell only displays the bound
# method object and never prints the summary.
etc_scoring.print_cv_summary()

In [None]:
# ROC curve of the fitted extra-trees pipeline on the training split.
etc_scoring.plt_roc_curve();

In [None]:
## Gradient Boosting Classifier

In [None]:
# Gradient boosting on top of the shared preprocessing step (seeded for reproducibility).
gbc_pipe = Pipeline(steps=[
    ('total_preprocessing', total_preprocessing),
    ('gbc', GradientBoostingClassifier(random_state=42)),
])

In [None]:
# Fit the gradient-boosting pipeline on the training split.
gbc_pipe.fit(X=X_train, y=y_train)

In [None]:
# Wrap the pipeline for scoring; cv_now=True runs cross-validation immediately.
gbc_scoring = ModelWithAOCROC(
    model=gbc_pipe,
    model_name='gbc_pipe',
    X=X_train,
    y=y_train,
    cv_now=True,
)

In [None]:
#scoring with Cross_val
# The parentheses matter: without them the cell only displays the bound
# method object and never prints the summary.
gbc_scoring.print_cv_summary()

In [None]:
# ROC curve of the fitted gradient-boosting pipeline on the training split.
gbc_scoring.plt_roc_curve();

In [None]:
## Voting Classifier

In [None]:
# `logreg` was never defined; use a fresh LogisticRegression configured like
# the one in log_reg_pipe.
# voting='soft' averages the members' predicted probabilities, which gives the
# ensemble a predict_proba; the default hard voting has no probability output,
# so the ROC plot further down could not be drawn.
vc = VotingClassifier(
    estimators=[
        ('logreg', LogisticRegression(max_iter=1000)),
        ('knn', KNeighborsClassifier(n_neighbors=3)),
        ('ct', DecisionTreeClassifier(random_state=42)),
    ],
    voting='soft',
)

In [None]:
# Voting ensemble on top of the shared preprocessing step
# (the closing `])` was missing, which made this cell a SyntaxError).
vc_pipe = Pipeline([('total_preprocessing', total_preprocessing), ('vc', vc)])

In [None]:
# Fit the voting-ensemble pipeline on the training split.
vc_pipe.fit(X=X_train, y=y_train)

In [None]:
# Wrap the pipeline for scoring; cv_now=True runs cross-validation immediately.
vc_scoring = ModelWithAOCROC(
    model=vc_pipe,
    model_name='vc_pipe',
    X=X_train,
    y=y_train,
    cv_now=True,
)

In [None]:
#scoring with Cross_val
# The parentheses matter: without them the cell only displays the bound
# method object and never prints the summary.
vc_scoring.print_cv_summary()

In [None]:
# ROC curve of the fitted voting-ensemble pipeline on the training split.
vc_scoring.plt_roc_curve();

In [None]:
## Bagging

In [None]:
## XG-Boost

In [None]:
# Fits an XGBoost binary classifier directly, without the shared preprocessing pipeline.
# NOTE(review): `xgboost`, `X_train2` and `y_train2` are not imported or defined
# in any cell visible in this notebook, so this cell raises NameError on a
# fresh kernel. Presumably it should follow the same Pipeline pattern as the
# other models (total_preprocessing + estimator, fitted on X_train / y_train)
# - confirm the intended data before changing it.
xgc = xgboost.XGBClassifier(random_state=42, objective='binary:logistic').fit(X_train2, y_train2)

In [None]:
## Stacking Classifier

In [None]:
# StackingClassifier is imported in the top imports cell (it was previously
# used here without being imported).
# max_iter=1000 matches the logistic regression used in log_reg_pipe; the
# library default iteration cap is lower.
stc = StackingClassifier(estimators=[
    ('log_reg', LogisticRegression(max_iter=1000, random_state=42)),
    ('rfc', RandomForestClassifier(random_state=42)),
    ('gbc', GradientBoostingClassifier(random_state=42)),
])
stc_pipe = Pipeline([('total_preprocessing', total_preprocessing), ('stc', stc)])

In [None]:
## XG-Boost + logistic Regression

In [None]:
## Final Model Selection 