Modeling Notebook

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


from sklearn.metrics import confusion_matrix, plot_confusion_matrix, precision_score 
from sklearn.metrics import recall_score, accuracy_score, f1_score, log_loss
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, plot_roc_curve
from sklearn.metrics import classification_report, auc

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import MissingIndicator, SimpleImputer


from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline





%matplotlib inline

In [2]:
water_pump = pd.read_csv('Data/cleaned_water_pump.csv')
water_pump.head()

Unnamed: 0,amount_tsh,gps_height,basin,region,region_code,lga,public_meeting,scheme_management,permit,extraction_type_group,...,payment,quality_group,quantity,source,source_class,waterpoint_type,status_group,unknown_construction_yr,age_at_inspection,water_per_person
0,6000.0,1390,Lake Nyasa,Iringa,11,Ludewa,True,VWC,False,gravity,...,pay annually,good,enough,spring,groundwater,communal standpipe,0,False,12.0,55.045872
1,0.0,1399,Lake Victoria,Mara,20,Serengeti,Unknown,Other,True,gravity,...,never pay,good,insufficient,rainwater harvesting,surface,communal standpipe,0,False,3.0,0.0
2,25.0,686,Pangani,Manyara,21,Simanjiro,True,VWC,True,gravity,...,pay per bucket,good,enough,dam,surface,communal standpipe multiple,0,False,4.0,0.1
3,0.0,263,Ruvuma / Southern Coast,Mtwara,90,Nanyumbu,True,VWC,True,submersible,...,never pay,good,dry,machine dbh,groundwater,communal standpipe multiple,2,False,27.0,0.0
4,0.0,0,Lake Victoria,Kagera,18,Karagwe,True,Unknown,True,gravity,...,never pay,good,seasonal,rainwater harvesting,surface,communal standpipe,0,True,,


In [3]:
water_pump['region_code'] = water_pump['region_code'].astype('str')

In [4]:
water_pump.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59399 entries, 0 to 59398
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   amount_tsh               59399 non-null  float64
 1   gps_height               59399 non-null  int64  
 2   basin                    59399 non-null  object 
 3   region                   59399 non-null  object 
 4   region_code              59399 non-null  object 
 5   lga                      59399 non-null  object 
 6   public_meeting           59399 non-null  object 
 7   scheme_management        59399 non-null  object 
 8   permit                   59399 non-null  object 
 9   extraction_type_group    59399 non-null  object 
 10  extraction_type_class    59399 non-null  object 
 11  management               59399 non-null  object 
 12  management_group         59399 non-null  object 
 13  payment                  59399 non-null  object 
 14  quality_group         

## Dummy

In [None]:
fig, ax = plt.subplots()

ax = dummy_model_results.plot_cv(ax)
plt.tight_layout();

dummy_model_results.print_cv_summary()

In [None]:
fig, ax = plt.subplots()

fig.suptitle("Dummy Model")

#Plot confusion matrix
plot_confusion_matrix(dummy_model, X_train, y_train, ax =ax);

In [None]:
dummy_model_results.print_cv_summary()

In [5]:
#function from lecture
class ModelWithCV():
    '''Structure to save the model and more easily see its crossvalidation'''
    
    def __init__(self, model, model_name, X, y, cv_now=True):
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()
        
    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and return results.
        
        Args: 
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)  
        '''
        
        cv_X = X if X else self.X
        cv_y = y if y else self.y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)

        
    def print_cv_summary(self):
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)

        
    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given 
        Axis for plotting.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

## Pipeline

In [6]:
X = water_pump.drop('status_group', axis=1)
y = water_pump['status_group']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
numeric_cols = ["amount_tsh", "gps_height", "age_at_inspection", "water_per_person"]

cat_cols =["basin", "region", "region_code", "lga", "public_meeting", "scheme_management", "permit",
          "extraction_type_group", "extraction_type_class", "management", "management_group", "payment", "quality_group",
          "quantity", "source", "source_class", "waterpoint_type"]
bool_cols = ["unknown_construction_yr"]

In [8]:
subpipe_num = Pipeline(steps=[("num_impute", SimpleImputer(strategy="mean")),("ss", StandardScaler())])
subpipe_cat = Pipeline(steps=[("ohe", OneHotEncoder(sparse=False, handle_unknown="ignore"))])
subpipe_ord = Pipeline(steps= [("ord_encode", OrdinalEncoder())])

In [9]:
#Column Transformers
CT = ColumnTransformer(transformers=[
                                    ("subpipe_num", subpipe_num, numeric_cols), 
                                     ("subpipe_cat", subpipe_cat, cat_cols),
                                    ("subpipe_ord", subpipe_ord, bool_cols)
                                    ], 
                       remainder="drop"
                      )

In [None]:
dummy_model_pipe = Pipeline(steps=[('ct', CT), ('dummy_model', 
                                                DummyClassifier(strategy="most_frequent",random_state=42))])

In [None]:
dummy_model_pipe.fit(X_train, y_train)

In [None]:
dummy_model_pipe.score(X_train, y_train)

In [None]:
dummy_pipe = ModelWithCV(dummy_model_pipe, model_name='Dummy', X=X_train, y=y_train)

In [None]:
dummy_pipe.print_cv_summary()

## Logistic

In [None]:
logreg_model_pipe = Pipeline(steps=[('ct', CT), 
                                    ('logreg', LogisticRegression(random_state=42, cv=10, max_iter=1000))])

In [None]:
logreg_model_pipe.fit(X_train, y_train)
logreg_model_pipe.score(X_train, y_train)

In [None]:
y_train.value_counts(normalize=True)

In [None]:
sm = SMOTE(sampling_strategy='auto',random_state=42)

In [None]:
sm2 = SMOTE(sampling_strategy=0.8, random_state=42)

In [None]:
X_train_clean = X_train.select_dtypes(['float', 'int']).dropna()
y_train_clean = y_train[X_train_clean.index]

In [None]:
...
# transform the dataset
strategy = {0:0.6, 1:0., 2:0.20}
oversample = SMOTE(sampling_strategy=strategy)
X, y = oversample.fit_resample(X, y)

In [None]:
counter = Counter(y)
for k,v in counter.items():
    per = v / len(y) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

GRIDSEARCH KNN

In [10]:
knneighbors_model_pipe = Pipeline(steps=[('ct', CT), ('knn', KNeighborsClassifier())])

In [11]:
knneighbors_model_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'age_at_inspection',
                                                   'water_per_person']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                 

In [12]:
knneighbors_model_pipe.score(X_train, y_train)

0.8241037958203327

In [13]:
pipe_grid = {
    'knn__n_neighbors': [3, 5, 7], 
    'knn__metric': ['minkowski', 'manhattan', 'euclidean'],
    'knn__weights': ['uniform', 'distance']

}


gs_pipe = GridSearchCV(estimator=knneighbors_model_pipe, param_grid=pipe_grid)

In [14]:
gs_pipe.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num_impute',
                                                                                          SimpleImputer()),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         ['amount_tsh',
                                                                          'gps_height',
                                                                          'age_at_inspection',
                                                                          'water_per_person']),
                                                                        ('subpip

In [15]:
gs_pipe.best_params_

{'knn__metric': 'manhattan', 'knn__n_neighbors': 7, 'knn__weights': 'distance'}

In [16]:
print(gs_pipe.best_params_)
print(gs_pipe.best_score_)
print(gs_pipe.best_estimator_.score(X_train, y_train))

{'knn__metric': 'manhattan', 'knn__n_neighbors': 7, 'knn__weights': 'distance'}
0.7700286057340722
0.9393925789580013


In [17]:
pd.DataFrame(gs_pipe.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__metric,param_knn__n_neighbors,param_knn__weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,5.046689,0.098149,27.07166,0.278594,minkowski,3,uniform,"{'knn__metric': 'minkowski', 'knn__n_neighbors...",0.765432,0.761504,0.762851,0.761055,0.762151,0.762598,0.001541,14
1,5.237351,0.366315,27.889166,1.514914,minkowski,3,distance,"{'knn__metric': 'minkowski', 'knn__n_neighbors...",0.758025,0.753086,0.761392,0.75881,0.756314,0.757525,0.002758,17
2,4.891255,0.339943,32.957679,0.370794,minkowski,5,uniform,"{'knn__metric': 'minkowski', 'knn__n_neighbors...",0.77385,0.764085,0.765881,0.760943,0.767314,0.766415,0.004283,6
3,5.008935,0.078231,32.707769,0.526529,minkowski,5,distance,"{'knn__metric': 'minkowski', 'knn__n_neighbors...",0.768911,0.760831,0.767901,0.760831,0.766304,0.764955,0.003469,11
4,5.039832,0.036451,37.685687,0.240116,minkowski,7,uniform,"{'knn__metric': 'minkowski', 'knn__n_neighbors...",0.773513,0.763636,0.765881,0.760606,0.767987,0.766325,0.004349,8
5,4.657758,0.53981,34.482282,0.637324,minkowski,7,distance,"{'knn__metric': 'minkowski', 'knn__n_neighbors...",0.77486,0.7633,0.773064,0.763636,0.772702,0.769512,0.00499,2
6,4.225638,0.033435,23.70681,0.511297,manhattan,3,uniform,"{'knn__metric': 'manhattan', 'knn__n_neighbors...",0.766779,0.762065,0.764085,0.763749,0.764732,0.764282,0.001528,13
7,4.318217,0.060685,23.073787,0.217216,manhattan,3,distance,"{'knn__metric': 'manhattan', 'knn__n_neighbors...",0.758025,0.753984,0.763636,0.759596,0.758895,0.758827,0.003093,16
8,5.055396,0.973614,31.423065,1.850562,manhattan,5,uniform,"{'knn__metric': 'manhattan', 'knn__n_neighbors...",0.772952,0.765881,0.768687,0.762963,0.768436,0.767784,0.003311,4
9,4.767206,0.050468,29.771654,0.188024,manhattan,5,distance,"{'knn__metric': 'manhattan', 'knn__n_neighbors...",0.768238,0.762963,0.770034,0.761728,0.766865,0.765966,0.003146,10


In [44]:
log_model_pipe = Pipeline(steps=[('ct', CT), ('log', LogisticRegression(random_state=42, max_iter=800))])


In [45]:
log_model_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'age_at_inspection',
                                                   'water_per_person']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                 

In [46]:
log_model_pipe.score(X_train, y_train)

0.7485689914476195

In [59]:
pipe_grid_log = {
    'log__penalty': ['None','l1', 'l2'], 
    'log__solver': ['newton-cg', 'sag', 'lbfgs', 'liblinear'],
    'log__max_iter': [800, 900, 1000, 1100],
    'log__multi_class': ['auto', 'multinomial'],
    'log__C': range(1,101, 3),
}


In [60]:
gs_pipe_log.fit(X_train, y_train)

Traceback (most recent call last):
  File "/Users/mysterious/opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/mysterious/opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/mysterious/opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/mysterious/opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 438, in _check_solver
    raise ValueError("Logistic Regression supports only penalties in %s,"
ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got None.

Traceback (most recent call last):
  File "/Users/my

Traceback (most recent call last):
  File "/Users/mysterious/opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/mysterious/opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/mysterious/opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/mysterious/opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 438, in _check_solver
    raise ValueError("Logistic Regression supports only penalties in %s,"
ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got None.

Traceback (most recent call last):
  File "/Users/my

KeyboardInterrupt: 

In [None]:
print(gs_pipe_log.best_params_)
print(gs_pipe_log.best_score_)
print(gs_pipe_log.best_estimator_.score(X_train, y_train))

In [None]:
dt_model_pipe = Pipeline(steps=[('ct', CT), ('dt', DecisionTreeClassifier(random_state=42, min_impurity_decrease= 0.3))])

In [None]:
dt_model_pipe.fit(X_train, y_train)

In [None]:
dt_model_pipe.score(X_train, y_train)

In [None]:
pipe_grid_dt = {
    'dt__criterion': ['gini', 'entropy', 'log_loss'], 
    'dt__class_weight': ['None', 'balanced'],
    'dt__min_impurity_decrease': [0.15, 0.2, 0.3]

}


gs_pipe_dt = GridSearchCV(estimator=dt_model_pipe, param_grid=pipe_grid_dt)

In [None]:
gs_pipe_dt.fit(X_train, y_train)

In [None]:
print(gs_pipe_dt.best_params_)
print(gs_pipe_dt.best_score_)
print(gs_pipe_dt.best_estimator_.score(X_train, y_train))

In [None]:
rfc = RandomForestClassifier(random_state=42, n_estimators=500)

rfc_model_pipe = Pipeline([('ct', CT), ('rfc', rfc)])

In [None]:
rfc_model_pipe.fit(X_train, y_train)

In [None]:
params = { 'rfc__n_estimators': [100, 250, 400, 700, 1000],
          
           'rfc__min_samples_leaf': [5 5, 10]
       
            'ct__subpipe_num__num_impute__strategy': ['mean']
        }

In [None]:
def grab_numeric(df):
    return water_pump.select_dtypes(include=['float', 'int'])

In [None]:
GrabNumeric = FunctionTransformer(grab_numeric)

In [None]:
##graph for top features
# Creating a large figure
fig = plt.figure(figsize=(15, 8))

# Iterating over the different
for i in range(0, 4):
    # Figure number starts at 1
    ax = fig.add_subplot(2, 2, i+1)
    # Add a title to make it clear what each subplot shows
    plt.title(df.columns[i])
    # Use alpha to better see crossing pints
    ax.scatter(df['target'], df.iloc[:, i], c='teal', alpha=0.1)
    # Only show the tick marks for each target
    plt.xticks(df.target.unique())

In [None]:
log_loss(y, logreg.predict_proba(X))