In [173]:
# --- Install, if necessary, and import required libraries --- #
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier # played a little bit, but RF performed better
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
import sklearn.metrics as metrics
import pickle


In [174]:
# --- Declare working directory --- #
path = os.getcwd() # not used 

In [175]:
# --- Load dataset --- #
df = pd.read_csv("trainingSet_randomised.csv")
del df["Unnamed: 7"] #drop NAs

In [176]:
# Exploratory
# Remember to say about class balance 50/50 filipidis and why RF was picked
# Explain why non normalisation (not useful in RF)
# Explain why I chose RF, good and bad (not just better performance...)
# Overfit and say 92.5% was optimal (max benchmark) when using y_train instead of y_test

In [177]:
# --- Calculate additional variables / features --- #
df["point_usage"] = df["sum_collect_points"]/df["sum_redeem_points"]
df["point_usage_frequency"] = df["sum_collect"]/df["sum_redeem"]
df["average_collected_points_per_year"] = df["sum_collect_points"]/df["years_in_the_program"]
df["average_redeemed_points_per_year"] = df["sum_redeem_points"]/df["years_in_the_program"]
df["average_collections_per_year"] = df["sum_collect"]/df["years_in_the_program"]
df["average_redemptions_per_year"] = df["sum_redeem"]/df["years_in_the_program"]
features = [x for x in df.columns if x != "state"]

In [178]:
# --- Model Hyper Parameters Setting --- #
lr = LinearRegression(fit_intercept=True, normalize=False) # explain why this was bad (could have goine polynomial...)
rf = RandomForestClassifier()
    # parameters established after Grid Search Cross Validation - seen below

In [179]:
# --- Encode categorical variables - If selected model requires it --- #
#none one-hot encoding applied

In [180]:
# --- Model's k-fold cross validation evaluation procedure--- #

# param_grid = { 
#     'n_estimators': [200, 300, 400],
#     'min_samples_split': [5, 10],
#     'max_depth': [None, 5, 10]
# }

# clf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=9) # accuracy used as metric (9 chosen because of splits 500 each)
# print(clf.best_params_)

rf = RandomForestClassifier(
    n_estimators=200, criterion="gini", max_depth=None, max_features="sqrt", min_samples_split=10, random_state=70)
    # parameters established after Grid Search Cross Validation - seen above

In [181]:
# --- Split in training/validating and testing --- #
y = df["state"].values
del df["state"]
X = df.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=70)

In [182]:
# --- Model building --- #
rf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=70, verbose=0, warm_start=False)

In [183]:
# --- Generate prediction --- #
y_pred = rf.predict(X_test)

In [184]:
# --- Print metrics and confusion matrix --- #
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score is {}%.".format(100*accuracy))

cm = confusion_matrix(y_test, y_pred)
#plot confusion matrix

Accuracy score is 86.4%.


In [185]:
# --- Compute and plot Area Under Curve (AUC) --- #
fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=None)
auc(fpr, tpr)
# plot AUC

0.8640764270054888

In [186]:
# --- Variable Importance Plot --- #
print(rf.feature_importances_)
print(features)

[0.08650068 0.0023354  0.0737892  0.00398402 0.02610294 0.48227153
 0.06528522 0.08081451 0.06649115 0.02380293 0.06338339 0.02523904]
['sum_collect', 'sum_redeem', 'sum_collect_points', 'sum_redeem_points', 'years_in_the_program', 'months_since_last_transaction', 'point_usage', 'point_usage_frequency', 'average_collected_points_per_year', 'average_redeemed_points_per_year', 'average_collections_per_year', 'average_redemptions_per_year']


In [187]:
# --- Save final Model to a file --- #
filename = 'propensity_model.pkl'
pickle.dump(rf, open(filename, 'wb'))