In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [2]:
# Load cleaned_water_pump.csv; the path is relative to the notebook's working directory.
cleaned_water_pump = pd.read_csv("./Data/cleaned_water_pump.csv")

In [3]:
# Count missing values per column to see which features will need imputation.
cleaned_water_pump.isna().sum()

amount_tsh                     0
gps_height                     0
basin                          0
region                         0
region_code                    0
lga                            0
public_meeting                 0
scheme_management              0
permit                         0
extraction_type_group          0
extraction_type_class          0
management                     0
management_group               0
payment                        0
quality_group                  0
quantity                       0
source                         0
source_class                   0
waterpoint_type                0
status_group                   0
unknown_construction_yr        0
age_at_inspection          20708
water_per_person           21380
dtype: int64

In [4]:
# Store region_code as strings so it is handled as a categorical feature
# (it is listed in cat_cols and one-hot encoded) rather than as a number.
cleaned_water_pump["region_code"]= cleaned_water_pump["region_code"].astype(str)

In [5]:
# Inspect dtypes and non-null counts (region_code should now show as object).
cleaned_water_pump.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59399 entries, 0 to 59398
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   amount_tsh               59399 non-null  float64
 1   gps_height               59399 non-null  int64  
 2   basin                    59399 non-null  object 
 3   region                   59399 non-null  object 
 4   region_code              59399 non-null  object 
 5   lga                      59399 non-null  object 
 6   public_meeting           59399 non-null  object 
 7   scheme_management        59399 non-null  object 
 8   permit                   59399 non-null  object 
 9   extraction_type_group    59399 non-null  object 
 10  extraction_type_class    59399 non-null  object 
 11  management               59399 non-null  object 
 12  management_group         59399 non-null  object 
 13  payment                  59399 non-null  object 
 14  quality_group         

In [6]:
# Separate the target column (`status_group`) from the predictor columns.
y = cleaned_water_pump["status_group"]
X = cleaned_water_pump.drop(columns=["status_group"])

In [7]:
# Group the feature columns by how they will be preprocessed.

# Continuous features: imputed and scaled.
numeric_cols = [
    "amount_tsh",
    "gps_height",
    "age_at_inspection",
    "water_per_person",
]

# Categorical features: one-hot encoded.
cat_cols = [
    "basin",
    "region",
    "region_code",
    "lga",
    "public_meeting",
    "scheme_management",
    "permit",
    "extraction_type_group",
    "extraction_type_class",
    "management",
    "management_group",
    "payment",
    "quality_group",
    "quantity",
    "source",
    "source_class",
    "waterpoint_type",
]

# Flag feature: ordinal-encoded.
bool_cols = ["unknown_construction_yr"]

In [8]:
# Train/test split: hold out a test set using scikit-learn's default test size;
# random_state is fixed so the same rows are selected on every run.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Pipeline 

In [9]:
# One preprocessing sub-pipeline per column group.

# Numeric: fill missing values with the column mean, then standardize.
subpipe_num = Pipeline(
    [
        ("num_impute", SimpleImputer(strategy="mean")),
        ("ss", StandardScaler()),
    ]
)

# Categorical: dense one-hot encoding; categories unseen during fit are ignored.
subpipe_cat = Pipeline(
    [
        ("ohe", OneHotEncoder(sparse=False, handle_unknown="ignore")),
    ]
)

# Flag column: encode its values as ordinal integers.
subpipe_ord = Pipeline(
    [
        ("ord_encode", OrdinalEncoder()),
    ]
)

In [10]:
# Route each column group through its own sub-pipeline; any column that is
# not listed in one of the groups is dropped.
CT = ColumnTransformer(
    transformers=[
        ("subpipe_num", subpipe_num, numeric_cols),
        ("subpipe_cat", subpipe_cat, cat_cols),
        ("subpipe_ord", subpipe_ord, bool_cols),
    ],
    remainder="drop",
)

In [11]:
class ModelWithCV():
    '''Bundle an estimator with its training data and cross-validation scores.

    Attributes populated by `cross_validate` (all None until it has run):
      cv_results: per-fold scores returned by `cross_val_score`.
      cv_mean, cv_median, cv_std: summary statistics of `cv_results`.
    '''
    def __init__(self, model, model_name, X, y, cv_now=True):
        '''
        Store the model and data, optionally cross-validating immediately.
        Args:
          model:
            Estimator (or pipeline) passed to `cross_val_score`.
          model_name:
            Label used in the printed summary and the plot title.
          X:
            Default feature data for cross-validation.
          y:
            Default target data for cross-validation.
          cv_now:
            Optional; when True (default) run `cross_validate()` right away
            with the default number of folds.
        '''
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()
    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation, store the results on the object and return them.
        Args:
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)
        Returns:
          The array of per-fold scores (also stored as `self.cv_results`).
        '''
        # Compare against None explicitly: the truth value of a DataFrame,
        # Series or multi-element array is ambiguous and raises ValueError.
        cv_X = self.X if X is None else X
        cv_y = self.y if y is None else y
        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)
        return self.cv_results
    def print_cv_summary(self):
        '''
        Print the mean and standard deviation of the stored CV scores.
        Requires `cross_validate` to have been run first (the statistics are
        None until then and cannot be formatted).
        '''
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)
    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given
        Axis for plotting.
        Args:
          ax:
            Matplotlib Axes to draw on.
        Returns:
          The same Axes, with a violin plot and an overlaid swarm plot of the
          per-fold scores.
        '''
        # Imported here because the notebook's import cell does not bring in
        # seaborn; without this the method fails with NameError on `sns`.
        import seaborn as sns

        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )
        return ax

## Dummy Model

In [18]:
# Baseline: shared preprocessing followed by a classifier that always
# predicts the most frequent class.
dummy_model_pipe = Pipeline(
    steps=[
        ("ct", CT),
        ("dummy_model", DummyClassifier(strategy="most_frequent", random_state=42)),
    ]
)

In [19]:
# Fit the preprocessing steps and the baseline classifier on the training split.
dummy_model_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'age_at_inspection',
                                                   'water_per_person']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                 

In [27]:
# Training-set accuracy of the baseline; with strategy="most_frequent" this is
# the share of training rows belonging to the most common class.
dummy_model_pipe.score(X_train, y_train)


0.5416058721856832

In [33]:
# Cross-validate the baseline on the training split; ModelWithCV runs its
# default 10-fold CV on construction because cv_now defaults to True.
dummy_model_results = ModelWithCV(dummy_model_pipe, "dummy", X_train, y_train)

In [34]:
# Report mean ± standard deviation of the baseline's cross-validated accuracy.
dummy_model_results.print_cv_summary()

CV Results for `dummy` model:
            0.54161 ± 0.00007 accuracy
        


## Logistic Regression Model

In [45]:
# Logistic regression on top of the shared preprocessing; max_iter is raised
# above the library default and the seed is fixed for reproducibility.
logreg_model_pipe = Pipeline(
    steps=[
        ("ct", CT),
        ("log_reg_model", LogisticRegression(max_iter=850, random_state=42)),
    ]
)


In [46]:
# Fit the preprocessing steps and the logistic regression on the training split.
logreg_model_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'age_at_inspection',
                                                   'water_per_person']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                 

In [47]:
# Training-set accuracy of the logistic regression (measured on the data it was fit on).
logreg_model_pipe.score(X_train, y_train)

0.7485240970616625

In [49]:
# Cross-validate the logistic regression on the training split. The whole
# pipeline is refit for each of the 10 default folds, so this cell is slow.
logreg_model_results = ModelWithCV(logreg_model_pipe, "logreg", X_train, y_train)

KeyboardInterrupt: 

In [39]:
# Report mean ± standard deviation of the logistic regression's cross-validated accuracy.
logreg_model_results.print_cv_summary()

CV Results for `logreg` model:
            0.74648 ± 0.00439 accuracy
        


## Random Forest Model

In [12]:
# Random forest with default hyper-parameters on top of the shared
# preprocessing; the seed is fixed for reproducibility.
rfc_model_pipe = Pipeline(
    steps=[
        ("ct", CT),
        ("rfc_model", RandomForestClassifier(random_state=42)),
    ]
)


In [13]:
# Fit the preprocessing steps and the random forest on the training split.
rfc_model_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'age_at_inspection',
                                                   'water_per_person']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                 

In [14]:
# Training-set accuracy of the random forest.
# NOTE(review): this is measured on the data the forest was fit on, so it is
# optimistic; use the cross-validated accuracy when comparing models.
rfc_model_pipe.score(X_train, y_train)

0.9423780556241442

In [55]:
# Cross-validate the random forest on the training split (default 10 folds,
# run on construction).
rfc_model_results = ModelWithCV(rfc_model_pipe, "rfc", X_train, y_train)

In [56]:
# Report mean ± standard deviation of the random forest's cross-validated accuracy.
rfc_model_results.print_cv_summary()

CV Results for `rfc` model:
            0.78219 ± 0.00479 accuracy
        


## GridSearch 

In [21]:
# Hyper-parameter grid for the random forest inside rfc_model_pipe. Keys use
# the "<step name>__<parameter>" form so they reach the "rfc_model" step.
params = {
    "rfc_model__n_estimators": [50, 100, 150],
    "rfc_model__criterion": ["gini", "entropy"],
    "rfc_model__min_samples_leaf": [5, 10, 30],
}

# Exhaustive search: 3 * 2 * 3 = 18 candidates, each scored with 5-fold CV.
gs = GridSearchCV(estimator=rfc_model_pipe, param_grid=params, cv=5)

In [23]:
# Run the grid search on the training split (fits the pipeline once per candidate per fold).
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num_impute',
                                                                                          SimpleImputer()),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         ['amount_tsh',
                                                                          'gps_height',
                                                                          'age_at_inspection',
                                                                          'water_per_person']),
                                                             

In [24]:
# Parameter combination with the highest mean cross-validated score.
gs.best_params_

{'rfc_model__criterion': 'gini',
 'rfc_model__min_samples_leaf': 5,
 'rfc_model__n_estimators': 100}

In [27]:
# Mean cross-validated score of that best parameter combination.
gs.best_score_

0.7804440231753436

In [29]:
# Full per-candidate results (fit/score times and test scores) as a dict of arrays.
# NOTE(review): the raw dict is long; wrapping it in pd.DataFrame(...) and
# showing the top rows by rank_test_score would be easier to read.
gs.cv_results_

{'mean_fit_time': array([  5.40757856,   9.88805556,  15.07582121,  21.04184361,
         12.79286265,  14.97028031,   4.72723813,   9.16346955,
         73.5260406 ,   5.17217588, 564.30286055,  20.69839969,
          5.17379637,  10.42404428,  20.61835146,   9.51312976,
         10.1409194 ,  16.09189024]),
 'std_fit_time': array([2.63697349e-01, 4.49942386e-01, 1.72049266e-01, 3.20517363e+01,
        2.92611189e+00, 5.34381674e-01, 9.56950530e-02, 2.10217088e-01,
        1.19710333e+02, 6.31770137e-02, 1.09409308e+03, 3.72875547e+00,
        1.83440139e-01, 8.58504551e-01, 2.41287714e+00, 2.61281976e+00,
        9.01194355e-01, 1.02656568e+00]),
 'mean_score_time': array([0.22175984, 0.30204206, 0.5166172 , 0.24997692, 0.43987908,
        0.42151241, 0.18393192, 0.3611681 , 0.40326872, 0.2041172 ,
        0.41114655, 0.59179134, 0.19913406, 0.3165576 , 0.56190205,
        0.37412095, 0.32683754, 0.44885173]),
 'std_score_time': array([0.02245808, 0.03110241, 0.17063114, 0.06763397, 

In [31]:
# Second preprocessing variant: same structure as the first ColumnTransformer,
# but missing numeric values are imputed with the median instead of the mean.
subpipe_num2 = Pipeline(
    [
        ("num_impute", SimpleImputer(strategy="median")),
        ("ss", StandardScaler()),
    ]
)
subpipe_cat2 = Pipeline(
    [
        ("ohe", OneHotEncoder(sparse=False, handle_unknown="ignore")),
    ]
)
subpipe_ord2 = Pipeline(
    [
        ("ord_encode", OrdinalEncoder()),
    ]
)

# Column routing for the median-imputation variant.
CT_2 = ColumnTransformer(
    transformers=[
        ("subpipe_num", subpipe_num2, numeric_cols),
        ("subpipe_cat", subpipe_cat2, cat_cols),
        ("subpipe_ord", subpipe_ord2, bool_cols),
    ],
    remainder="drop",
)

# Random forest on top of the median-imputation preprocessing, fit on the
# training split.
rfc_model_pipe2 = Pipeline(
    steps=[
        ("ct_2", CT_2),
        ("rfc_model_2", RandomForestClassifier(random_state=42)),
    ]
)
rfc_model_pipe2.fit(X_train, y_train)

Pipeline(steps=[('ct_2',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'age_at_inspection',
                                                   'water_per_person']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))])

In [33]:
# Grid search for the median-imputation pipeline (rfc_model_pipe2).
# The grid gets its own name instead of rebinding `params`: the first grid
# search cell reads the global `params`, so overwriting it here would make
# that cell fail with unknown "rfc_model_2__*" parameters if it were re-run
# after this one. Keys are prefixed with this pipeline's step name.
params_2 = {
    "rfc_model_2__n_estimators": [50, 100, 150],
    "rfc_model_2__criterion": ["gini", "entropy"],
    "rfc_model_2__min_samples_leaf": [5, 10, 30],
}

# Exhaustive search: 3 * 2 * 3 = 18 candidates, each scored with 5-fold CV.
gs2 = GridSearchCV(estimator=rfc_model_pipe2,
                   param_grid=params_2,
                   cv=5)

In [34]:
# Run the second grid search on the training split.
gs2.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct_2',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num_impute',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         ['amount_tsh',
                                                                          'gps_height',
                                                                          'age_at_inspection',
                                                                          'water_per_person']),
                                          

In [35]:
# Parameter combination with the highest mean cross-validated score for the median-imputation pipeline.
gs2.best_params_

{'rfc_model_2__criterion': 'gini',
 'rfc_model_2__min_samples_leaf': 5,
 'rfc_model_2__n_estimators': 50}

In [36]:
# Mean cross-validated score of that best combination.
gs2.best_score_

0.7792543033003989

In [40]:
# Average of the mean CV test score over every candidate in the grid
# (a grid-wide summary, not the score of the best model).
gs2.cv_results_['mean_test_score'].mean()

0.7662324269118896

In [41]:
# Third grid for the mean-imputation pipeline: searches min_samples_split
# (instead of min_samples_leaf) alongside tree count and split criterion.
params_3 = {
    "rfc_model__n_estimators": [50, 100, 150],
    "rfc_model__criterion": ["gini", "entropy"],
    "rfc_model__min_samples_split": [2, 6, 10],
}

# Exhaustive search: 3 * 2 * 3 = 18 candidates, each scored with 5-fold CV.
gs3 = GridSearchCV(estimator=rfc_model_pipe, param_grid=params_3, cv=5)

In [42]:
# Run the third grid search on the training split.
gs3.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num_impute',
                                                                                          SimpleImputer()),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         ['amount_tsh',
                                                                          'gps_height',
                                                                          'age_at_inspection',
                                                                          'water_per_person']),
                                                             

In [43]:
# Mean cross-validated score of the best candidate in the third grid.
gs3.best_score_

0.7926777509319509

In [45]:
# Average of the mean CV test score over every candidate in the third grid.
gs3.cv_results_['mean_test_score'].mean()

0.7871694819091334

In [44]:
# Best parameter combination from the third grid.
gs3.best_params_

{'rfc_model__criterion': 'entropy',
 'rfc_model__min_samples_split': 10,
 'rfc_model__n_estimators': 150}

In [46]:
# Fourth grid for the mean-imputation pipeline: larger n_estimators and
# min_samples_split values than the third grid.
params_4 = {
    "rfc_model__n_estimators": [100, 150, 200],
    "rfc_model__criterion": ["gini", "entropy"],
    "rfc_model__min_samples_split": [10, 20, 30],
}

# Exhaustive search: 3 * 2 * 3 = 18 candidates, each scored with 5-fold CV.
gs4 = GridSearchCV(estimator=rfc_model_pipe, param_grid=params_4, cv=5)

In [47]:
# Run the fourth grid search on the training split.
gs4.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num_impute',
                                                                                          SimpleImputer()),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         ['amount_tsh',
                                                                          'gps_height',
                                                                          'age_at_inspection',
                                                                          'water_per_person']),
                                                             

In [48]:
# Mean cross-validated score of the best candidate in the fourth grid.
gs4.best_score_

0.7930593446468779

In [49]:
# Average of the mean CV test score over every candidate in the fourth grid.
gs4.cv_results_["mean_test_score"].mean()

0.7901723682939514