In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Import imblearn dependencies
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imbPipeline
from collections import Counter
from numpy import where

In [2]:
cleaned_water_pump = pd.read_csv("./Data/cleaned_water_pump.csv")

In [3]:
cleaned_water_pump.isna().sum()

amount_tsh                     0
gps_height                     0
basin                          0
region                         0
region_code                    0
lga                            0
public_meeting                 0
scheme_management              0
permit                         0
extraction_type_group          0
extraction_type_class          0
management                     0
management_group               0
payment                        0
quality_group                  0
quantity                       0
source                         0
source_class                   0
waterpoint_type                0
status_group                   0
unknown_construction_yr        0
age_at_inspection          20708
water_per_person           21380
dtype: int64

In [4]:
cleaned_water_pump["region_code"]= cleaned_water_pump["region_code"].astype(str)

In [5]:
cleaned_water_pump.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59399 entries, 0 to 59398
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   amount_tsh               59399 non-null  float64
 1   gps_height               59399 non-null  int64  
 2   basin                    59399 non-null  object 
 3   region                   59399 non-null  object 
 4   region_code              59399 non-null  object 
 5   lga                      59399 non-null  object 
 6   public_meeting           59399 non-null  object 
 7   scheme_management        59399 non-null  object 
 8   permit                   59399 non-null  object 
 9   extraction_type_group    59399 non-null  object 
 10  extraction_type_class    59399 non-null  object 
 11  management               59399 non-null  object 
 12  management_group         59399 non-null  object 
 13  payment                  59399 non-null  object 
 14  quality_group         

In [6]:
#Set X and Y 
X = cleaned_water_pump.drop("status_group", axis=1)
y = cleaned_water_pump["status_group"]

In [7]:
#Break into numeric and categorical columns 
numeric_cols = ["amount_tsh", "gps_height", 
                "age_at_inspection", "water_per_person"]
cat_cols =["basin", "region", "region_code", "lga", "public_meeting", "scheme_management", "permit",
          "extraction_type_group", "extraction_type_class", "management", "management_group", "payment", "quality_group",
          "quantity", "source", "source_class", "waterpoint_type"]
bool_cols = ["unknown_construction_yr"]

In [8]:
#Train Test Split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Pipeline 

In [9]:
#Create subtypes
subpipe_num = Pipeline(steps=[("num_impute", SimpleImputer(strategy="mean")),("ss", StandardScaler())])
subpipe_cat = Pipeline(steps=[("ohe", OneHotEncoder(sparse=False, handle_unknown="ignore"))])
subpipe_ord = Pipeline(steps= [("ord_encode", OrdinalEncoder())])

In [10]:
#Column Transformers
CT = ColumnTransformer(transformers=[
                                    ("subpipe_num", subpipe_num, numeric_cols), 
                                     ("subpipe_cat", subpipe_cat, cat_cols),
                                    ("subpipe_ord", subpipe_ord, bool_cols)
                                    ], 
                       remainder="drop"
                      )

In [11]:
class ModelWithCV():
    '''Structure to save the model and more easily see its crossvalidation'''
    def __init__(self, model, model_name, X, y, cv_now=True):
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()
    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and return results.
        Args:
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)
        '''
        cv_X = X if X else self.X
        cv_y = y if y else self.y
        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)
    def print_cv_summary(self):
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)
    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given
        Axis for plotting.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )
        return ax

## Dummy Model

In [12]:
dummy_model_pipe = Pipeline(steps=[('ct', CT), ('dummy_model', DummyClassifier(strategy="most_frequent",random_state=42))])

In [13]:
dummy_model_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'age_at_inspection',
                                                   'water_per_person']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                 

In [14]:
#Accuracy Scores 
dummy_model_pipe.score(X_train, y_train)


0.5416058721856832

In [15]:
dummy_model_results = ModelWithCV(
                                  model = dummy_model_pipe,
                                  model_name = "dummy",
                                  X=X_train,
                                  y=y_train
)

In [16]:
dummy_model_results.print_cv_summary()

CV Results for `dummy` model:
            0.54161 ± 0.00007 accuracy
        


In [None]:
#Print classification report

y_preds = dummy_model_pipe(X_test)

print(classification_report(y_test, y_preds)

## Logistic Regression Model

In [None]:
logreg_model_pipe = Pipeline(steps=[('ct', CT), ('log_reg_model', LogisticRegression(max_iter = 850, random_state=42))])


In [None]:
logreg_model_pipe.fit(X_train, y_train)

In [None]:
logreg_model_pipe.score(X_train, y_train)

In [None]:
logreg_model_results = ModelWithCV(
                                  model = logreg_model_pipe,
                                  model_name = "logreg",
                                  X=X_train,
                                  y=y_train
)

In [None]:
logreg_model_results.print_cv_summary()

In [None]:
#Print classification report
y_preds = logreg_model_pipe(X_test)

print(classification_report(y_test, y_preds)

## Random Forest Model

In [None]:
rfc_model_pipe = Pipeline(steps=[('ct', CT), ('rfc_model', RandomForestClassifier(random_state=42))])


In [None]:
rfc_model_pipe.fit(X_train, y_train)

In [None]:
rfc_model_pipe.score(X_train, y_train)

In [None]:
rfc_model_results = ModelWithCV(
                                  model = rfc_model_pipe,
                                  model_name = "rfc",
                                  X=X_train,
                                  y=y_train
)

In [None]:
rfc_model_results.print_cv_summary()

In [None]:
#Print classification report
y_preds = rfc_model_pipe(X_test)

print(classification_report(y_test, y_preds)

## GridSearch 

In [None]:
#Grid search using params for our rfc model 
params = { "rfc_model__n_estimators": [50, 100, 150],
        "rfc_model__criterion": ["gini", "entropy"],
         "rfc_model__min_samples_leaf": [5, 10, 30]}

gs = GridSearchCV(estimator = rfc_model_pipe,
                 param_grid=params,
                 cv=5)

In [None]:
gs.fit(X_train, y_train)

In [None]:
gs.best_params_

In [None]:
gs.best_score_

In [None]:
gs.cv_results_

In [None]:
## Pipeline 2 with median as strategy
subpipe_num2 = Pipeline(steps=[("num_impute", SimpleImputer(strategy="median")),("ss", StandardScaler())])
subpipe_cat2 = Pipeline(steps=[("ohe", OneHotEncoder(sparse=False, handle_unknown="ignore"))])
subpipe_ord2 = Pipeline(steps= [("ord_encode", OrdinalEncoder())])

#Column Transformers with subpipe_2
CT_2 = ColumnTransformer(transformers=[
                                    ("subpipe_num", subpipe_num2, numeric_cols), 
                                     ("subpipe_cat", subpipe_cat2, cat_cols),
                                    ("subpipe_ord", subpipe_ord2, bool_cols)
                                    ], 
                       remainder="drop"
                      )


rfc_model_pipe2 = Pipeline(steps=[('ct_2', CT_2), ('rfc_model_2', RandomForestClassifier(random_state=42))])
rfc_model_pipe2.fit(X_train, y_train)

In [None]:
#Grid search using params for our rfc model 2
params = { "rfc_model_2__n_estimators": [50, 100, 150],
        "rfc_model_2__criterion": ["gini", "entropy"],
         "rfc_model_2__min_samples_leaf": [5, 10, 30]}

gs2 = GridSearchCV(estimator = rfc_model_pipe2,
                 param_grid=params,
                 cv=5)

In [None]:
gs2.fit(X_train, y_train)

In [None]:
gs2.best_params_

In [None]:
gs2.best_score_

In [None]:
gs2.cv_results_['mean_test_score'].mean()

In [None]:
params_3 = { "rfc_model__n_estimators": [50, 100, 150],
        "rfc_model__criterion": ["gini", "entropy"],
         "rfc_model__min_samples_split": [2, 6, 10]}

gs3 = GridSearchCV(estimator = rfc_model_pipe,
                 param_grid=params_3,
                 cv=5)

In [None]:
gs3.fit(X_train, y_train)

In [None]:
gs3.best_score_

In [None]:
gs3.cv_results_['mean_test_score'].mean()

In [None]:
gs3.best_params_

In [None]:
params_4 = { "rfc_model__n_estimators": [100, 150, 200],
        "rfc_model__criterion": ["gini", "entropy"],
         "rfc_model__min_samples_split": [10, 20, 30]
           }

gs4 = GridSearchCV(estimator = rfc_model_pipe,
                 param_grid=params_4,
                 cv=5)

In [None]:
gs4.fit(X_train, y_train)

In [None]:
gs4.best_score_

In [None]:
gs4.cv_results_["mean_test_score"].mean()

## Using Smote: Oversamping Minority and Undersampling Majority Classes

In [24]:
# Check the class distribution of the target
counter = Counter(y_train)
print(counter)

Counter({0: 24128, 2: 17150, 1: 3271})


In [25]:
# Create the oversampler and undersampler objects
over = SMOTE(sampling_strategy="minority", random_state=42)
under = RandomUnderSampler(sampling_strategy="not minority", random_state=42)

In [33]:
imbal_rf_pipe = imbPipeline(steps=[('ct', CT), ("over", over),
                                ("under", under), 
                             ('rfc_model', RandomForestClassifier(random_state=42))])

In [34]:
params_imb = { "rfc_model__n_estimators": [50, 100, 150],
               "rfc_model__criterion": ["gini", "entropy"],
              "rfc_model__min_samples_split": [2, 6, 10]
             }

gs_imb = GridSearchCV(estimator = imbal_rf_pipe,
                 param_grid=params_imb,
                 cv=5)

In [35]:
gs_imb.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num_impute',
                                                                                          SimpleImputer()),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         ['amount_tsh',
                                                                          'gps_height',
                                                                          'age_at_inspection',
                                                                          'water_per_person']),
                                                             

In [40]:
gs_imb.cv_results_['mean_test_score'].mean()

0.7587089997321016

In [41]:
gs_imb.best_score_

0.7626884023381948

In [None]:
gs_imb.best_estimator_

In [97]:
y_pred_smote_1 = gs_imb.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred_smote_1))

              precision    recall  f1-score   support

           0       0.83      0.78      0.81      8131
           1       0.32      0.51      0.39      1046
           2       0.80      0.78      0.79      5673

    accuracy                           0.76     14850
   macro avg       0.65      0.69      0.66     14850
weighted avg       0.78      0.76      0.77     14850



## GridSearch Random Forest With Smote only Oversampling Minority

In [42]:
## Second SMOTE using only oversampling of minority class (our target of interest functional needs repair)
imbal_rf_pipe_2 = imbPipeline(steps=[('ct', CT), ("over", over), 
                             ('rfc_model', RandomForestClassifier(random_state=42))])

params_imb = { "rfc_model__n_estimators": [50, 100, 150],
               "rfc_model__criterion": ["gini", "entropy"],
              "rfc_model__min_samples_split": [2, 6, 10]
             }

gs_imb_2 = GridSearchCV(estimator = imbal_rf_pipe_2,
                 param_grid=params_imb,
                 cv=5)

gs_imb_2.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num_impute',
                                                                                          SimpleImputer()),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         ['amount_tsh',
                                                                          'gps_height',
                                                                          'age_at_inspection',
                                                                          'water_per_person']),
                                                             

In [44]:
gs_imb_2.cv_results_['mean_test_score'].mean()

0.7653920310292351

In [56]:
gs_imb_2.best_params_

{'rfc_model__criterion': 'gini',
 'rfc_model__min_samples_split': 6,
 'rfc_model__n_estimators': 150}

In [60]:
rfc_smote_model_pipe = imbPipeline(steps=[('ct', CT), ("over", over), ('rfc_model', 
                                     RandomForestClassifier(criterion= "gini", min_samples_split=6, 
                                                            n_estimators=150, random_state=42))])
rfc_smote_model_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'age_at_inspection',
                                                   'water_per_person']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                 

In [48]:
# Save off best estimator for the SMOTE calc
random_forest_smote = gs_imb_2.best_estimator_

In [47]:
gs_imb_2.best_estimator_.score

<function sklearn.pipeline.Pipeline.score(self, X, y=None, sample_weight=None)>

In [51]:
y_pred_smote = random_forest_smote.predict(X_test)
print(classification_report(y_test, y_pred_smote))

              precision    recall  f1-score   support

           0       0.81      0.82      0.82      8131
           1       0.33      0.50      0.40      1046
           2       0.83      0.74      0.78      5673

    accuracy                           0.77     14850
   macro avg       0.66      0.69      0.67     14850
weighted avg       0.79      0.77      0.77     14850



In [85]:
rfc_importances = random_forest_smote.named_steps["rfc_model"].feature_importances_

In [95]:
for feat, importance in zip(X_train.columns, rfc_importances):
    print ('feature: {f}, importance: {i}'.format(f=feat, i=importance))

feature: amount_tsh, importance: 0.021480644630357645
feature: gps_height, importance: 0.09190549151127303
feature: basin, importance: 0.07170842218544114
feature: region, importance: 0.034854233220206574
feature: region_code, importance: 0.006290325002662925
feature: lga, importance: 0.004827584204586797
feature: public_meeting, importance: 0.005131176415000627
feature: scheme_management, importance: 0.004773467915207898
feature: permit, importance: 0.006314355107130706
feature: extraction_type_group, importance: 0.004307141031462322
feature: extraction_type_class, importance: 0.004931888498414721
feature: management, importance: 0.003419161543292744
feature: management_group, importance: 0.0035887549639737593
feature: payment, importance: 0.0023491730965694044
feature: quality_group, importance: 0.0004198644936653595
feature: quantity, importance: 0.0016139427717211342
feature: source, importance: 0.00658936770468032
feature: source_class, importance: 0.0017850262794432837
feature: w

In [None]:
#basin, gps_height,region seem to be the most important 
