Modeling Notebook

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


from sklearn.metrics import confusion_matrix, plot_confusion_matrix, precision_score 
from sklearn.metrics import recall_score, accuracy_score, f1_score, log_loss
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, plot_roc_curve
from sklearn.metrics import classification_report, auc

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import MissingIndicator, SimpleImputer



%matplotlib inline

In [2]:
# Read Data/cleaned_water_pump.csv (path is relative to the notebook's working
# directory) into a DataFrame and preview its first five rows.
water_pump = pd.read_csv('Data/cleaned_water_pump.csv')
water_pump.head()

Unnamed: 0,amount_tsh,gps_height,basin,region,region_code,lga,public_meeting,scheme_management,permit,extraction_type_group,...,payment,quality_group,quantity,source,source_class,waterpoint_type,status_group,unknown_construction_yr,age_at_inspection,water_per_person
0,6000.0,1390,Lake Nyasa,Iringa,11,Ludewa,True,VWC,False,gravity,...,pay annually,good,enough,spring,groundwater,communal standpipe,0,False,12.0,55.045872
1,0.0,1399,Lake Victoria,Mara,20,Serengeti,Unknown,Other,True,gravity,...,never pay,good,insufficient,rainwater harvesting,surface,communal standpipe,0,False,3.0,0.0
2,25.0,686,Pangani,Manyara,21,Simanjiro,True,VWC,True,gravity,...,pay per bucket,good,enough,dam,surface,communal standpipe multiple,0,False,4.0,0.1
3,0.0,263,Ruvuma / Southern Coast,Mtwara,90,Nanyumbu,True,VWC,True,submersible,...,never pay,good,dry,machine dbh,groundwater,communal standpipe multiple,2,False,27.0,0.0
4,0.0,0,Lake Victoria,Kagera,18,Karagwe,True,Unknown,True,gravity,...,never pay,good,seasonal,rainwater harvesting,surface,communal standpipe,0,True,,


In [3]:
# Print the frame's column names, non-null counts and dtypes.
water_pump.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59399 entries, 0 to 59398
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   amount_tsh               59399 non-null  float64
 1   gps_height               59399 non-null  int64  
 2   basin                    59399 non-null  object 
 3   region                   59399 non-null  object 
 4   region_code              59399 non-null  int64  
 5   lga                      59399 non-null  object 
 6   public_meeting           59399 non-null  object 
 7   scheme_management        59399 non-null  object 
 8   permit                   59399 non-null  object 
 9   extraction_type_group    59399 non-null  object 
 10  extraction_type_class    59399 non-null  object 
 11  management               59399 non-null  object 
 12  management_group         59399 non-null  object 
 13  payment                  59399 non-null  object 
 14  quality_group         

In [4]:
# region_code is loaded as int64; cast it to string so it is handled as a
# categorical label (it is listed in cat_cols) rather than as a quantity.
water_pump['region_code'] = water_pump['region_code'].astype('str')

In [None]:
# Column groups for the numeric-only baseline.
#
# The lists only name columns that exist in the loaded frame: the displayed
# `water_pump.info()` reports 23 columns and neither "population" nor
# "installer" is among them, so selecting them raised a KeyError.
# "status_group" (the target) stays in numeric_cols because the next cell
# selects it together with the numeric predictors and then splits it off.
numeric_cols = ["amount_tsh", "gps_height",
                "age_at_inspection", "water_per_person", "status_group"]

cat_cols =["basin", "region", "region_code", "lga", "public_meeting", "scheme_management", "permit",
          "extraction_type_group", "extraction_type_class", "management", "management_group", "payment", "quality_group",
          "quantity", "source", "source_class", "waterpoint_type", "unknown_construction_yr"]

In [None]:
# Numeric-only modelling frame: the numeric predictors plus the target column.
numeric_df = water_pump[numeric_cols]

# Split the target off from the predictors.
y = numeric_df['status_group']
X = numeric_df.drop(columns='status_group')

# Hold out a test split; the fixed seed keeps the partition repeatable.
(X_train_numeric,
 X_test_numeric,
 y_train,
 y_test) = train_test_split(X, y, random_state=42)

## Dummy

In [None]:
# Baseline classifier: always predicts the most frequent class seen in training.
dummy_model = DummyClassifier(strategy="most_frequent").fit(X_train_numeric, y_train)

# Accuracy of the baseline on the held-out split (shown as the cell output).
dummy_model.score(X_test_numeric, y_test)

In [None]:
# Confusion matrix of the dummy baseline, evaluated on the training split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 (ConfusionMatrixDisplay.from_estimator replaces it) — confirm
# the scikit-learn version this notebook is pinned to.
plot_confusion_matrix(dummy_model, X_train_numeric, y_train);

In [None]:
class ModelWithCV():
    '''
    Bundle an estimator with its training data and cross-validation scores.

    Attributes:
      model: the estimator handed to `cross_val_score`.
      name: label used in printed summaries and plot titles.
      X, y: default data used when `cross_validate` gets no override.
      cv_results: array of per-fold scores (None until CV has run).
      cv_mean, cv_median, cv_std: summary statistics of `cv_results`.
    '''

    def __init__(self, model, model_name, X, y, cv_now=True):
        '''
        Store the model and data; optionally cross-validate immediately.

        Args:
          model: estimator to evaluate.
          model_name: display name for summaries and plots.
          X: default feature data for cross-validation.
          y: default target data for cross-validation.
          cv_now: when True (default), run `cross_validate()` right away
            with the default number of folds.
        '''
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()

    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation, store the scores and return them.

        Args:
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)

        Returns:
          The array of per-fold scores (also stored in `self.cv_results`).
        '''
        # Compare against None explicitly: truth-testing a DataFrame, Series
        # or multi-element array raises "truth value ... is ambiguous", which
        # made it impossible to pass an override for X or y.
        cv_X = self.X if X is None else X
        cv_y = self.y if y is None else y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)

        return self.cv_results

    def print_cv_summary(self):
        '''Print the mean and standard deviation of the stored CV scores.'''
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)

    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given
        Axis for plotting.

        Args:
          ax: matplotlib Axes to draw on.

        Returns:
          The same Axes, with a violin plot and a swarm plot of the scores.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        # Overlay the individual fold scores on top of the distribution.
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

In [None]:
# Cross-validate the dummy baseline on the numeric training split.
# cv_now is left at its default (True), so scoring runs during construction.
dummy_model_results = ModelWithCV(dummy_model, 'dummy', X_train_numeric, y_train)

In [None]:
# Draw the dummy model's per-fold CV scores, then print the text summary.
fig, ax = plt.subplots()
ax = dummy_model_results.plot_cv(ax)
fig.tight_layout()
dummy_model_results.print_cv_summary()

In [None]:
# Titled confusion matrix for the dummy baseline on the training split.
fig, ax = plt.subplots()
fig.suptitle("Dummy Model")

# Draw onto the axes created above; the trailing ';' hides the display repr.
plot_confusion_matrix(
    dummy_model,
    X_train_numeric,
    y_train,
    ax=ax,
);

In [None]:
# Print the dummy model's cross-validation mean and standard deviation.
dummy_model_results.print_cv_summary()

## Pipeline

In [None]:
# Full feature set: every column except the target.
y = water_pump['status_group']
X = water_pump.drop(columns='status_group')

# Same seed as the numeric-only split so the partitions are repeatable.
(X_train,
 X_test,
 y_train,
 y_test) = train_test_split(X, y, random_state=42)

In [None]:
# Feature groups for the preprocessing pipeline, which operates on X
# (the frame with the target removed).
#
# - "status_group" is the target and was dropped from X, so it must not be
#   listed as a feature.
# - "population", "construction_year" and "installer" are not among the 23
#   columns reported by the displayed `water_pump.info()`, so selecting them
#   would raise a KeyError.
numeric_cols = ["amount_tsh", "gps_height",
                "age_at_inspection", "water_per_person"]

cat_cols =["basin", "region", "region_code", "lga", "public_meeting", "scheme_management", "permit",
          "extraction_type_group", "extraction_type_class", "management", "management_group", "payment", "quality_group",
          "quantity", "source", "source_class", "waterpoint_type", "unknown_construction_yr"]

In [None]:
# Numeric branch: impute with SimpleImputer's default strategy, then standardise.
subpipe_num = Pipeline(steps=[
    ('num_impute', SimpleImputer()),
    ('ss', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
subpipe_cat = Pipeline(steps=[
    ('cat_impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])
#subpipe_ord = Pipeline(steps=[('ord_ohe', etc. etc.
    #and then include that in our columntransformer and change the columns being acted upon

In [None]:
# Assemble the full modelling pipeline. The original cell built a Pipeline
# with no steps, which cannot be fitted.

# Route only columns that are actually present in the training features:
# the target was dropped from X, and the column lists may name fields that
# are missing from the loaded file.
num_features = [col for col in numeric_cols if col in X_train.columns]
cat_features = [col for col in cat_cols if col in X_train.columns]

# Apply each preprocessing sub-pipeline to its own column group; columns in
# neither group are dropped (ColumnTransformer's default remainder).
ct = ColumnTransformer(transformers=[
    ('subpipe_num', subpipe_num, num_features),
    ('subpipe_cat', subpipe_cat, cat_features),
])

# NOTE(review): logistic regression is used here as a first baseline
# estimator — swap in the intended model if a different one was planned.
pipe = Pipeline(steps= [
    ('ct', ct),
    ('logreg', LogisticRegression(max_iter=1000, random_state=42)),
])

In [None]:
# Fit the pipeline on the training split.
pipe.fit(X_train, y_train)

In [None]:
def grab_numeric(df):
    '''
    Return only the float and integer columns of `df`.

    Args:
      df: pandas DataFrame to filter.

    Returns:
      A DataFrame containing the float/int columns of `df`.
    '''
    # Select from the argument, not the global `water_pump` frame: the
    # function previously ignored its input, so whatever data was passed in
    # (e.g. a train or test split) was replaced by the full dataset.
    return df.select_dtypes(include=['float', 'int'])

In [None]:
# Wrap grab_numeric in a FunctionTransformer so it exposes the scikit-learn
# transformer interface.
GrabNumeric = FunctionTransformer(grab_numeric)

In [None]:
## Scatter grid: target vs. each of the first four columns of `df`
# NOTE(review): `df` (with a 'target' column) is not defined in any cell
# visible here — confirm which frame this plot is meant to use.
fig = plt.figure(figsize=(15, 8))

# One panel per column, laid out on a 2x2 grid.
for col_idx in range(4):
    # Subplot positions are 1-based.
    ax = fig.add_subplot(2, 2, col_idx + 1)
    # Title each panel with the column it shows.
    ax.set_title(df.columns[col_idx])
    # Low alpha keeps overlapping points distinguishable.
    ax.scatter(df['target'], df.iloc[:, col_idx], c='teal', alpha=0.1)
    # Put x ticks only at the values the target actually takes.
    ax.set_xticks(df.target.unique())

In [None]:
# Log loss of logreg's predicted class probabilities on X against the labels y.
# NOTE(review): no variable named `logreg` is defined in the cells visible
# here — confirm it is created before this cell runs.
log_loss(y, logreg.predict_proba(X))