In [21]:
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.utils import resample

from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

import xgboost as xgb

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

import time

import joblib

In [3]:
## Folder path
## The data folder is resolved relative to the current working directory, so the notebook
## has to be started from the directory that contains `data/`
folder_path = Path.cwd().joinpath('data')

In [4]:
## Read training and validation data
## (one CSV holds both; it is split into a train and a validation set further down)
train_val_data = pd.read_csv(folder_path.joinpath('train_val_data.csv'))
train_val_data.head()

Unnamed: 0,msno,is_churn,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,city,bd,gender,registered_via,registration_init_time,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,YSs/vsH+AL1WRvlkoLDGFT9wWihtQnqQZuKhqdcybm0=,0,40.0,30.0,149.0,149.0,1.0,2017-03-22,2017-04-22,0.0,18.0,0.0,,9.0,2012-04-18,116.0,26.0,25.0,37.0,3541.0,2912.0,866766.109
1,WxkFqbfXFSQgzlLld/tgOEFA9oGpf7JmPgsaMryCZWg=,0,41.0,30.0,149.0,149.0,1.0,2017-03-05,2017-04-05,0.0,4.0,40.0,female,7.0,2013-09-06,16.0,3.0,3.0,1.0,20.0,33.0,6766.055
2,58GesZdrmqPIpKnmdzPnUpw07joB4w8ayPNVfhMPPY4=,0,40.0,30.0,149.0,149.0,1.0,2017-03-24,2017-04-24,0.0,10.0,35.0,male,9.0,2014-12-24,127.0,45.0,29.0,22.0,1305.0,1060.0,318577.925
3,4LqpGjJ/MUOT3a0WEcSUI6xEykQACYUX0pCPm0xgVsg=,1,35.0,7.0,0.0,0.0,0.0,2017-02-20,2017-09-29,0.0,1.0,0.0,,9.0,2014-12-04,45.0,5.0,7.0,11.0,528.0,560.0,137037.344
4,limE8R97wWE+cNSl8CXf0CG/wmI4m1WaZQYPxcjz184=,0,41.0,30.0,149.0,149.0,1.0,2017-03-27,2017-04-27,0.0,1.0,0.0,,7.0,2011-12-17,98.0,25.0,14.0,10.0,484.0,399.0,125289.625


In [5]:
## Read test data
## (held-out set; it is only transformed and scored in the test set evaluation section)
test_data = pd.read_csv(folder_path.joinpath('test_data.csv'))
test_data.head()

Unnamed: 0,msno,is_churn,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,city,bd,gender,registered_via,registration_init_time,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,/lUtWg8ExxX7WttR/T0UEV5ZmHSjlI+2kEbHdOLQu4o=,0,41.0,30.0,100.0,100.0,1.0,2017-03-20,2017-04-20,0.0,6.0,26.0,female,7.0,2017-01-21,76.0,14.0,12.0,3.0,224.0,109.0,51682.863
1,ZKuhr2ZHSE6jAd25Mv1DC/f7q2oZUV9NpxF//1KAC/E=,0,41.0,30.0,99.0,99.0,1.0,2017-03-31,2017-04-30,0.0,1.0,0.0,,7.0,2015-10-28,43.0,20.0,18.0,16.0,94.0,180.0,33348.512
2,Kea1xZZjKRAB3aaYIB1BSM580q2pqxxW/dYId7fEcqU=,0,39.0,30.0,149.0,149.0,1.0,2017-03-31,2017-05-05,0.0,15.0,50.0,male,9.0,2012-03-03,72.0,24.0,21.0,36.0,292.0,356.0,85705.528
3,IesOUKSq3onwcJe/MXEBAEw0U/8+qhFa4GQPAxm8EAo=,0,41.0,30.0,99.0,99.0,1.0,2017-03-24,2017-04-24,0.0,1.0,0.0,,7.0,2016-04-25,50.0,20.0,11.0,8.0,666.0,608.0,163959.09
4,HWAZShjWZK98M68I5CJq03m7Mgiq/vmN53/GkXGi4+E=,0,41.0,30.0,99.0,99.0,1.0,2017-03-31,2017-04-30,0.0,1.0,0.0,,7.0,2016-01-31,46.0,6.0,4.0,6.0,135.0,132.0,34568.674


In [6]:
## Inspect the column dtypes (the date columns and gender are read as object)
train_val_data.dtypes

msno                       object
is_churn                    int64
payment_method_id         float64
payment_plan_days         float64
plan_list_price           float64
actual_amount_paid        float64
is_auto_renew             float64
transaction_date           object
membership_expire_date     object
is_cancel                 float64
city                      float64
bd                        float64
gender                     object
registered_via            float64
registration_init_time     object
num_25                    float64
num_50                    float64
num_75                    float64
num_985                   float64
num_100                   float64
num_unq                   float64
total_secs                float64
dtype: object

In [7]:
## Percentage of NA values for each column
## (mean of the boolean NA mask = fraction of missing rows, scaled to percent)
(train_val_data.isna()).mean() * 100

msno                       0.00000
is_churn                   0.00000
payment_method_id          3.85449
payment_plan_days          3.85449
plan_list_price            3.85449
actual_amount_paid         3.85449
is_auto_renew              3.85449
transaction_date           3.85449
membership_expire_date     3.85449
is_cancel                  3.85449
city                      11.32785
bd                        11.32785
gender                    59.93507
registered_via            11.32785
registration_init_time    11.32785
num_25                    22.29340
num_50                    22.29340
num_75                    22.29340
num_985                   22.29340
num_100                   22.29340
num_unq                   22.29340
total_secs                22.29340
dtype: float64

In [8]:
## Split train_val data into train and val data
## 15% of the rows go to validation; random_state is fixed so the split is reproducible
## NOTE(review): the split is not stratified on is_churn - consider passing
## stratify=train_val_data['is_churn'] since the churn class is the minority class
train_df, val_df = train_test_split(train_val_data, test_size=0.15, random_state=10)

In [9]:
## Peek at the training split (row labels are the shuffled labels of train_val_data)
train_df.head()

Unnamed: 0,msno,is_churn,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,city,bd,gender,registered_via,registration_init_time,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
66983,5b20roNDwa3p3z0Y3aWtYMuG0dGRB0tDAIwiYHm38mw=,0,41.0,30.0,149.0,149.0,1.0,2017-03-06,2017-04-06,0.0,1.0,0.0,,7.0,2016-04-06,367.0,57.0,30.0,26.0,772.0,1082.0,196341.137
242418,O+PqZOEN+6Fq5wr+dpntGdnzaSpnUPUIj3el0IAylYY=,0,41.0,30.0,99.0,99.0,1.0,2017-03-10,2017-04-10,0.0,1.0,0.0,,7.0,2014-01-27,263.0,32.0,33.0,44.0,713.0,954.0,194550.134
455493,lNmJE7vrHFf9/Zp2iGgayiNJMt9Wi17i7EHTBenArIY=,0,39.0,30.0,149.0,149.0,1.0,2017-03-31,2017-05-15,0.0,13.0,27.0,female,9.0,2013-09-14,690.0,136.0,93.0,92.0,2047.0,1829.0,570913.226
866243,Zf62RXGDnyehoy837HpuoOQy9WdJFPiltWqBcZP+2F0=,0,41.0,30.0,99.0,99.0,1.0,2017-03-14,2017-04-14,0.0,1.0,0.0,,7.0,2016-05-15,35.0,17.0,4.0,13.0,310.0,190.0,92633.412
83612,rlyej0t+xs1y8MIjClijpXSatK8QOzBI0kok5HO7FtA=,0,40.0,30.0,149.0,149.0,1.0,2017-03-12,2017-04-11,0.0,5.0,57.0,male,9.0,2007-10-07,16.0,4.0,3.0,4.0,593.0,423.0,129521.047


## Data imputation

In [10]:
## Numeric features imputer
## Missing numeric values are replaced with the median of the column
numeric_feature_imputer = SimpleImputer(strategy='median')
numeric_columns = ['plan_list_price', 'actual_amount_paid', 'bd', 'num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq', 'total_secs']

## Categorical features imputer
## Missing values are replaced with the most frequent value of the column
## NOTE(review): these columns are only imputed, not one-hot encoded, so they reach the scaler
## and the models as plain numbers
categorical_feature_imputer = SimpleImputer(strategy='most_frequent')
categorical_columns = ['payment_method_id', 'payment_plan_days', 'is_auto_renew', 'is_cancel', 'city', 'registered_via']

## Constant imputer
## A missing gender becomes its own 'not_specified' category, which is then one-hot encoded
constant_imputer = SimpleImputer(strategy='constant', fill_value='not_specified')
constant_impute_column = ['gender']
## handle_unknown='ignore' keeps transform() from raising when validation/test data contains a
## gender value that was not present when the pipeline was fitted (the row gets all-zero indicators)
constant_imputer_pipeline = Pipeline([('imputer', constant_imputer),
                                      ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))])

## Combine all the imputation transformers with Column Transformer
## Columns that are not listed in any transformer are dropped (ColumnTransformer's default remainder)
column_transformer = ColumnTransformer(transformers=[('numeric_imputer', numeric_feature_imputer, numeric_columns),
                                                     ('categorical_imputer', categorical_feature_imputer, categorical_columns),
                                                     ('constant_imputer', constant_imputer_pipeline, constant_impute_column)])

## Final pipeline with scaling
pipeline = Pipeline([('imputation_step', column_transformer), ('scale_data', StandardScaler())])

In [12]:
## Fit the imputers and the scaler on the training split only; the validation and test sets
## are later passed through pipeline.transform without re-fitting
pipeline.fit(train_df)

Pipeline(memory=None,
         steps=[('imputation_step',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numeric_imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0),
                                                  ['plan_list_price',
                                                   'actual_amount_paid', 'bd',
                                                   'num_

In [13]:
## Pre-process train data
## x_train_tot holds the transformed features, y_train_tot the is_churn labels in the same row order
x_train_tot = pipeline.transform(train_df)
y_train_tot = train_df.is_churn.values

## Downsample data


In [14]:
## Downsample data with class 0
## Keep a random 30% of the class-0 rows (drawn without replacement) together with every class-1 row
is_class_0 = y_train_tot == 0
is_class_1 = y_train_tot == 1
class_0_rows = x_train_tot[is_class_0]
class_1_rows = x_train_tot[is_class_1]

n_class_0_kept = int(len(class_0_rows)*0.30)
x_train_0 = resample(class_0_rows, replace=False, n_samples=n_class_0_kept, random_state=10)

## Stack the kept class-0 rows first and the class-1 rows after them; the labels follow the same order
x_train = np.vstack((x_train_0, class_1_rows))
y_train = np.hstack((np.zeros(len(x_train_0), dtype=int), np.ones(len(class_1_rows), dtype=int)))

In [15]:
## Helper that labels the four cells of the array returned by the confusion_matrix method
def confusion_matrix_report(confusion_arr:np.ndarray):
    """
        Label the cells of a binary confusion matrix

        Parameters
        -----------
        confusion_arr  :   np.ndarray containing values of confusion matrix, indexed as
                           [actual class, predicted class] with class 1 as the positive class

        Returns a dictionary mapping "True Positive", "False Positive", "True Negative" and
        "False Negative" (in that order) to the corresponding entry of confusion_arr
    """
    # (label, (row, column)) pairs; the order here is the key order of the returned dictionary
    cell_positions = (("True Positive", (1, 1)),
                      ("False Positive", (0, 1)),
                      ("True Negative", (0, 0)),
                      ("False Negative", (1, 0)))
    return {label: confusion_arr[position] for label, position in cell_positions}

## ML autopilot 
This ML autopilot runs as follows:

* For each specified model, find the best set of parameters (from the specified search space) using either grid search or randomized search, scored with cross-validation
* The best-performing version of each model is stored so that it can be used for evaluation

In [16]:
class MLAutopilot:
    """
        Hyper-parameter search over several classifiers, keeping the best estimator of each

        Attributes
        -----------
        models              :  dict mapping model name -> unfitted estimator
        params              :  dict mapping model name -> parameter grid / distributions to search
        models_grid_search  :  dict mapping model name -> search object created by run_automl
        models_best_model   :  dict mapping model name -> best estimator found by the search
        models_cv_score     :  dict mapping model name -> mean CV score of the best estimator
        loaded_models       :  dict mapping model name -> estimator read by load_trained_model
    """
    def __init__(self, model_name=None, model_params=None):
        """
            Set up the models and the search space

            Parameters
            -----------
            model_name    :  optional dict mapping a model name to an unfitted estimator
            model_params  :  optional dict mapping the same model names to the parameters to search

            The built-in models and search spaces are used unless BOTH arguments are given.
        """
        if model_name is not None and model_params is not None:
            # Store under the attribute names run_automl reads (self.models / self.params)
            self.models = model_name
            self.params = model_params
        else:
            self.models = {'Logistic Regression': LogisticRegression(random_state=1, n_jobs=-1),
                           'Decision Tree Classifier': DecisionTreeClassifier(random_state=3),
                           'Random Forest Classifier': RandomForestClassifier(random_state=4, n_jobs=-1),
                           'Gradient Tree Boosting Classifer': GradientBoostingClassifier(random_state=5),
                           'XGBoost Classifier': xgb.XGBClassifier(random_state=6, n_jobs=-1)}

            # NOTE(review): 'deviance' is the loss name of older scikit-learn releases; newer
            # releases call it 'log_loss' - adjust when upgrading scikit-learn
            self.params = {'Logistic Regression': [{'penalty': ['l2'], 'C': [1.0, 0.5]},
                                                   {'penalty': ['l1'], 'C': [1.0, 0.5], 'solver': ['liblinear']}],
                           'Decision Tree Classifier': {'criterion': ['gini', 'entropy'], 'min_samples_split': [2,5,10,25,50], 'min_samples_leaf': [1, 5, 10, 25]},
                           'Random Forest Classifier': {'n_estimators': [100, 250, 500, 1000], 'criterion': ['gini', 'entropy'], 'min_samples_split': [2,5,10,25,50],
                                                        'min_samples_leaf': [1, 5, 10, 25]},
                           'Gradient Tree Boosting Classifer': {'loss': ['deviance', 'exponential'], 'learning_rate': [0.1, 0.05, 0.01, 0.001],
                                                                'n_estimators': [100, 200, 500, 1000], 'subsample': [1.0, 0.8, 0.6, 0.4], 'max_depth': [3, 5, 10]},
                           'XGBoost Classifier': {'n_estimators': [100, 500, 1000], 'max_depth': [3, 5, 10], 'learning_rate': [0.5, 0.1, 0.01, 0.001],
                                                  'subsample': [0.6, 0.8, 1.0]}}

        self.models_grid_search = {}
        self.models_best_model = {}
        self.models_cv_score = {}
        # Always defined, so predicting before load_trained_model fails with a clear KeyError
        self.loaded_models = {}


    def run_automl(self, x_train: np.ndarray, y_train: np.ndarray, scoring_metric: str = 'f1', cv: int = 5, param_search_type: str = 'grid',
                   random_state=None):
        """
            Method to run AutoML

            Parameters
            -----------
            x_train            :  np.ndarray of training features
            y_train            :  np.ndarray of training labels
            scoring_metric     :  scoring passed to the search object
            cv                 :  cross-validation setting passed to the search object
            param_search_type  :  'grid' for GridSearchCV, 'random' for RandomizedSearchCV
            random_state       :  seed forwarded to RandomizedSearchCV so that the sampled
                                  candidates are reproducible (ignored for 'grid')

            Fills models_grid_search, models_best_model and models_cv_score for every model.
            Raises TypeError if param_search_type is neither 'grid' nor 'random'.
        """
        # Validate up front instead of after the first model has been announced
        if param_search_type not in ('grid', 'random'):
            raise TypeError("Parameter Search type must be either grid or random")

        print(f"Evaluation metric: {scoring_metric}")
        print(f"Number of models to run: {len(self.models)}")
        for name, estimator in self.models.items():
            print(f"Running {name} model")
            mod_time = time.time()

            if param_search_type == 'grid':
                # Perform Grid Search
                search = GridSearchCV(estimator=estimator, param_grid=self.params[name],
                                      scoring=scoring_metric, cv=cv)
            else:
                # Perform Randomized Grid Search
                search = RandomizedSearchCV(estimator=estimator, param_distributions=self.params[name],
                                            scoring=scoring_metric, cv=cv, random_state=random_state)

            self.models_grid_search[name] = search
            search.fit(x_train, y_train)

            # Store best estimator based on Grid CV search
            self.models_best_model[name] = search.best_estimator_

            # Store CV score based on average CV score of the best model
            self.models_cv_score[name] = search.best_score_

            print(f"Model run time: {round(time.time() - mod_time, 2)} seconds")
            print(f"CV score: {round(self.models_cv_score[name], 4)}")


    def _select_model(self, model_name: str, loaded_model: bool):
        """
            Return the estimator for model_name from loaded_models (loaded_model=True) or from
            models_best_model (loaded_model=False); raises KeyError if it is not available
        """
        source = self.loaded_models if loaded_model else self.models_best_model
        if model_name not in source:
            origin = "loaded" if loaded_model else "trained"
            raise KeyError(f"No {origin} model named '{model_name}'. "
                           "Call load_trained_model or run_automl first, or check loaded_model.")
        return source[model_name]


    @staticmethod
    def _resolve_metric(metric):
        """
            Turn a metric given by name into the callable with that name in this module's namespace

            Only a plain name is looked up - the string is never evaluated as code.
        """
        if callable(metric):
            return metric
        if not metric.isidentifier():
            raise ValueError(f"metric must be the plain name of a metric function, got '{metric}'")
        metric_function = globals().get(metric)
        if not callable(metric_function):
            raise NameError(f"name '{metric}' is not defined as a metric function")
        return metric_function


    def get_prediction(self, pred_data:np.ndarray, model_name:str, loaded_model: bool = True):
        """
            Method to get prediction from trained model

            Parameters
            -----------
            pred_data     :  np.ndarray of features to predict on
            model_name    :  str containing name of the model to use
            loaded_model  :  True to use a model read by load_trained_model,
                             False to use the best model found by run_automl

            Returns the output of the model's predict method
            Raises AssertionError for wrong argument types and KeyError for an unavailable model
        """
        if not isinstance(model_name, str) or not isinstance(pred_data, np.ndarray):
            raise AssertionError("Model name must be a string and pred_data must be a numpy.ndarray")

        ## Compute and return the predictions with the best model for model_name
        return self._select_model(model_name, loaded_model).predict(pred_data)


    def get_score(self, pred_data:np.ndarray, truth_label:np.ndarray, model_name:str, metric:str, loaded_model: bool = True):
        """
            Get scores for an input dataset

            Parameters
            -----------
            pred_data     :  np.ndarray of features to predict on
            truth_label   :  np.ndarray of true labels
            model_name    :  str containing name of the model to use
            metric        :  name of a metric function available in this module's namespace
                             (e.g. 'f1_score'), or the metric callable itself
            loaded_model  :  True to use a model read by load_trained_model,
                             False to use the best model found by run_automl

            Returns metric(truth_label, predictions), where predictions come from the model's
            predict method (class labels, not probabilities)
            Raises AssertionError for wrong argument types, KeyError for an unavailable model,
            NameError for an unknown metric name and ValueError for a metric string that is
            not a plain name
        """
        metric_type_ok = isinstance(metric, str) or callable(metric)
        if not isinstance(model_name, str) or not metric_type_ok or not isinstance(pred_data, np.ndarray) or not isinstance(truth_label, np.ndarray):
            raise AssertionError("Model name must be a string, metric must be a string or a callable, "
                                 "and pred_data and truth_label must be a numpy.ndarray")

        metric_function = self._resolve_metric(metric)

        ## Step 1: Compute predictions
        predictions = self._select_model(model_name, loaded_model).predict(pred_data)

        ## Step 2: Compute metric
        return metric_function(truth_label, predictions)


    def save_model(self, model_name: str, save_loc: str=None):
        """
            Method to save ML model

            Parameters
            -----------
            model_name  :  str containing name of the model to save
            save_loc    :  str or Path containing location to save model; defaults to
                           <current working directory>/saved_models/<model_name>.pkl

            Prints a confirmation when the model has been saved
            Raises IOError (chained to the original error) if the model could not be saved
        """
        if save_loc is None:
            # Same folder name as load_trained_model reads from
            save_loc = Path.cwd().joinpath('saved_models', model_name+'.pkl')

        try:
            if isinstance(save_loc, (str, Path)):
                # joblib.dump does not create missing directories
                Path(save_loc).parent.mkdir(parents=True, exist_ok=True)
            joblib.dump(self.models_best_model[model_name], save_loc)
        except Exception as error:
            raise IOError("Could not save the model. Check model name and file path") from error
        else:
            print("Model saved successfully!")


    def load_trained_model(self, model_names_list: list, folder_path):
        """
            Method to load trained model

            Parameters
            -----------
            model_names_list  :  list of model names to load
            folder_path       :  str or Path; models are read from the 'saved_models' folder that
                                 sits next to it, i.e. <folder_path.parent>/saved_models/<name>.pkl

            Replaces loaded_models with the models that were read.
            joblib.load unpickles the files, so only load model files from a trusted source.
        """
        saved_models_dir = Path(folder_path).parent.joinpath('saved_models')
        self.loaded_models = dict()
        for model_name in model_names_list:
            saved_location = saved_models_dir.joinpath(model_name+'.pkl')
            self.loaded_models[model_name] = joblib.load(saved_location)
        print("Models loaded successfully!")

        

In [None]:
## Run ML pipeline (Takes about a day to run all the models)
## Positional arguments: scoring metric 'f1', 5-fold cross-validation, and 'random' to use
## RandomizedSearchCV instead of an exhaustive grid search
ml_autopilot = MLAutopilot()
ml_autopilot.run_automl(x_train, y_train, 'f1', 5, 'random')

Evaluation metric: f1
Number of models to run: 5
Running Logistic Regression model


  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


Model run time: 148.56 seconds
CV score: 0.5682
Running Decision Tree Classifier model
Model run time: 176.34 seconds
CV score: 0.7918
Running Random Forest Classifier model
Model run time: 10230.52 seconds
CV score: 0.7991
Running Gradient Tree Boosting Classifer model
Model run time: 29230.81 seconds
CV score: 0.7986
Running XGBoost Classifier model
Model run time: 11699.77 seconds
CV score: 0.7983


In [None]:
## Save trained models
## Models go to the 'saved_models' folder next to the data folder, one .pkl file per model name
save_dir = folder_path.parent.joinpath('saved_models')
## joblib.dump does not create missing directories, so make sure the target folder exists first
save_dir.mkdir(parents=True, exist_ok=True)
for model_name in ml_autopilot.models_best_model:
    save_location = save_dir.joinpath(model_name+'.pkl')
    ml_autopilot.save_model(model_name, save_location)
print("Models saved!")

Model saved successfully!
Model saved successfully!
Model saved successfully!
Model saved successfully!
Model saved successfully!
Models saved!


## Model evaluation

In [17]:
## Load saved models
## A fresh MLAutopilot is created and the previously saved estimators are read back from
## <folder_path.parent>/saved_models, so evaluation does not require re-running the search
model_names_list = ['Logistic Regression', 'Decision Tree Classifier', 'Random Forest Classifier', 'Gradient Tree Boosting Classifer', 'XGBoost Classifier']
ml_autopilot = MLAutopilot()
ml_autopilot.load_trained_model(model_names_list, folder_path)

Models loaded successfully!


In [18]:
## Preprocess validation data
## Uses the pipeline already fitted on the training split (transform only, no re-fit)
x_val = pipeline.transform(val_df)
y_val = val_df.is_churn.values

In [19]:
## Validate on all models
## eval_metric is the name of the metric function that get_score looks up and applies to the
## predictions of each loaded model
eval_metric = 'f1_score'
print(f"Evaluation metric: {eval_metric}")
for model in model_names_list:
    print(f"Model name: {model}")
    print(f"{eval_metric} : {round(ml_autopilot.get_score(x_val, y_val, model, eval_metric), 4)}")

Evaluation metric: f1_score
Model name: Logistic Regression
f1_score : 0.5298
Model name: Decision Tree Classifier
f1_score : 0.7149
Model name: Random Forest Classifier
f1_score : 0.746
Model name: Gradient Tree Boosting Classifer
f1_score : 0.7449
Model name: XGBoost Classifier
f1_score : 0.7449


In [22]:
## Validate on all models
## NOTE(review): get_score passes the hard class labels from .predict() to the metric, so the
## value printed here is the AUC of a single operating point, not the AUC of the predicted
## probabilities - consider scoring predict_proba output instead
eval_metric = 'roc_auc_score'
print(f"Evaluation metric: {eval_metric}")
for model in model_names_list:
    print(f"Model name: {model}")
    print(f"{eval_metric} : {round(ml_autopilot.get_score(x_val, y_val, model, eval_metric), 4)}")

Evaluation metric: roc_auc_score
Model name: Logistic Regression
roc_auc_score : 0.7007
Model name: Decision Tree Classifier
roc_auc_score : 0.8404
Model name: Random Forest Classifier
roc_auc_score : 0.8414
Model name: Gradient Tree Boosting Classifer
roc_auc_score : 0.8404
Model name: XGBoost Classifier
roc_auc_score : 0.8407


In [29]:
## Validate on all models
## get_score returns the raw confusion matrix; confusion_matrix_report labels its four cells
eval_metric = 'confusion_matrix'
print(f"Evaluation metric: {eval_metric}")
for model in model_names_list:
    print(f"Model name: {model}")
    print(f"{eval_metric} : {confusion_matrix_report(ml_autopilot.get_score(x_val, y_val, model, eval_metric))}")

Evaluation metric: confusion_matrix
Model name: Logistic Regression
confusion_matrix : {'True Positive': 4838, 'False Positive': 1810, 'True Negative': 117655, 'False Negative': 6777}
Model name: Decision Tree Classifier
confusion_matrix : {'True Positive': 8214, 'False Positive': 3150, 'True Negative': 116315, 'False Negative': 3401}
Model name: Random Forest Classifier
confusion_matrix : {'True Positive': 8129, 'False Positive': 2049, 'True Negative': 117416, 'False Negative': 3486}
Model name: Gradient Tree Boosting Classifer
confusion_matrix : {'True Positive': 8105, 'False Positive': 2041, 'True Negative': 117424, 'False Negative': 3510}
Model name: XGBoost Classifier
confusion_matrix : {'True Positive': 8114, 'False Positive': 2057, 'True Negative': 117408, 'False Negative': 3501}


In [30]:
## Validate on all models
## classification_report returns a formatted text table, so it is printed as-is (no rounding)
eval_metric = 'classification_report'
print(f"Evaluation metric: {eval_metric}")
for model in model_names_list:
    print(f"Model name: {model}")
    print(ml_autopilot.get_score(x_val, y_val, model, eval_metric))

Evaluation metric: classification_report
Model name: Logistic Regression
              precision    recall  f1-score   support

           0       0.95      0.98      0.96    119465
           1       0.73      0.42      0.53     11615

    accuracy                           0.93    131080
   macro avg       0.84      0.70      0.75    131080
weighted avg       0.93      0.93      0.93    131080

Model name: Decision Tree Classifier
              precision    recall  f1-score   support

           0       0.97      0.97      0.97    119465
           1       0.72      0.71      0.71     11615

    accuracy                           0.95    131080
   macro avg       0.85      0.84      0.84    131080
weighted avg       0.95      0.95      0.95    131080

Model name: Random Forest Classifier
              precision    recall  f1-score   support

           0       0.97      0.98      0.98    119465
           1       0.80      0.70      0.75     11615

    accuracy                       

Based on the model evaluation on the validation set, the following models are selected:
1. Random Forest model (due to its robustness compared to a single decision tree)
2. XGBoost model (due to it being faster than Gradient Boosting)

In [33]:
## Models carried forward to the test set evaluation (names must match the keys used when loading)
selected_models = ['Random Forest Classifier', 'XGBoost Classifier']

Next steps:
1. Test the selected models on test set
2. Model interpretation
3. Evaluate on a use case

## Test set evaluation

In [31]:
## Preprocess test data
## Uses the pipeline already fitted on the training split (transform only, no re-fit)
x_test = pipeline.transform(test_data)
y_test = test_data.is_churn.values

In [35]:
## Test on the selected models
## Same scoring call as on the validation set, restricted to selected_models and the test data
eval_metric = 'f1_score'
print(f"Evaluation metric: {eval_metric}")
for model in selected_models:
    print(f"Model name: {model}")
    print(round(ml_autopilot.get_score(x_test, y_test, model, eval_metric), 4))

Evaluation metric: f1_score
Model name: Random Forest Classifier
0.7513
Model name: XGBoost Classifier
0.7508


In [37]:
## Test on the selected models
## NOTE(review): get_score passes the hard class labels from .predict() to the metric, so the
## value printed here is the AUC of a single operating point, not the AUC of the predicted
## probabilities - consider scoring predict_proba output instead
eval_metric = 'roc_auc_score'
print(f"Evaluation metric: {eval_metric}")
for model in selected_models:
    print(f"Model name: {model}")
    print(round(ml_autopilot.get_score(x_test, y_test, model, eval_metric), 4))

Evaluation metric: roc_auc_score
Model name: Random Forest Classifier
0.8433
Model name: XGBoost Classifier
0.8429
