# Enrolments Data

## Exploration
### Statistical exploration

In [None]:
import os

import numpy as np
import pandas as pd

#### Enrolments

In [None]:
#Load the sales-funnel snapshot export into a DataFrame
ENROLMENTS_CSV = "data/courses snap_sales_funnel 2017-06-27T1636.csv"
enrolments = pd.read_csv(ENROLMENTS_CSV)

In [None]:
#List the distinct raw values found in the PhoneValid column
enrolments.PhoneValid.unique()

In [None]:
#Recode PhoneValid: 'Yes' -> 1, 'No' -> 0 (values not in the mapping become NaN)
phone_valid_codes = {'Yes': 1, 'No': 0}
enrolments['PhoneValid'] = enrolments['PhoneValid'].map(phone_valid_codes)

In [None]:
#(rows, columns) of the loaded frame
enrolments.shape

In [None]:
#Column names, dtypes and non-null counts
enrolments.info()

In [None]:
#Check out summary statistics for all the numeric columns
enrolments.describe()

In [None]:
#Remove columns that are not used in the analysis.
#One non-inplace drop with errors='ignore' keeps the cell re-runnable:
#a second execution no longer raises KeyError for already-removed columns.
unused_columns = ['Status', 'CampaignSourceCategory', 'CampaignSource']
enrolments = enrolments.drop(unused_columns, axis=1, errors='ignore')

In [None]:
#Add cancelled column to track whether an enrolment cancelled:
#True when a CanceledDate is recorded, False when it is missing.
#(The previous isnull() test was inverted and flagged the enrolments
#WITHOUT a cancellation date as cancelled.)
#Run this before CanceledDate's missing values are filled in, otherwise
#every row would be flagged as cancelled.
enrolments['Cancelled'] = enrolments.CanceledDate.notnull()
enrolments.columns

In [None]:
#Limit to only post mid-2014 data (Startdate strictly greater than 20140630).
#.copy() makes the filtered frame independent of the original, so the
#column assignments in later cells do not act on a slice
#(SettingWithCopyWarning / silently lost writes).
enrolments = enrolments[enrolments.Startdate > 20140630].copy()
enrolments.Metro.value_counts()

In [None]:
#Count missing values per column
enrolments.isnull().sum()

In [None]:
#Fill missing values column by column in a single DataFrame.fillna call:
#  - categorical/text columns get the placeholder "Unknown"
#  - CanceledDate gets 0
#  - the two duration columns get their own column mean
#The previous `enrolments.<col>.fillna(..., inplace=True)` form mutates an
#intermediate Series, which pandas does not guarantee to write back to the frame.
fill_values = {
    'ExpectedPayment': "Unknown",
    'PardotCategory': "Unknown",
    'OppOwnerMetro': "Unknown",
    'OppOwnerEmail': "Unknown",
    'CanceledDate': 0,
    'DaysEnroltoStartdate': enrolments.DaysEnroltoStartdate.mean(),
    'SpeedtoLead': enrolments.SpeedtoLead.mean(),
}
enrolments = enrolments.fillna(fill_values)

In [None]:
#Re-count missing values per column to check the result of the fills
enrolments.isnull().sum()

In [None]:
#Peek at the first two rows
enrolments.head(2)

#### What about APAC only?

In [None]:
#Flag enrolments whose Metro is one of the APAC cities
apac_metros = ['sydney','melbourne','hong-kong','singapore','brisbane']
enrolments['APAC'] = enrolments['Metro'].isin(apac_metros)
enrolments['APAC'].value_counts()

In [None]:
#Keep only the rows whose Metro is an APAC city
apac = ('sydney','melbourne','brisbane','hong-kong','singapore')
in_apac = enrolments['Metro'].isin(apac)
apacenrols = enrolments.loc[in_apac]
apacenrols.info()

#### Only common application types

In [None]:
#Per application type: True where it has fewer than 100 enrolments
enrolments['ApplicationType'].value_counts().lt(100)

In [None]:
#Application types with more than 147 enrolments
apptype_counts = enrolments['ApplicationType'].value_counts()
commonapptypes = apptype_counts[apptype_counts > 147].index
commonapptypes

In [None]:
#Restrict the data to enrolments with a common application type
is_common_type = enrolments['ApplicationType'].isin(commonapptypes)
wcat = enrolments.loc[is_common_type]
wcat.shape

In [None]:
#Enrolment count per application type within the restricted subset
wcat.ApplicationType.value_counts()

In [None]:
#Column dtypes and non-null counts of the restricted subset
wcat.info()

#### Pandas profile

In [None]:
!pip install pandas-profiling

In [None]:
import pandas_profiling
#Build a profiling report over the whole enrolments frame; as the last
#expression of the cell it is rendered as the cell output
pandas_profiling.ProfileReport(enrolments)

### Visualisation

In [None]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
import urllib
import plotly
import plotly.plotly as py
import plotly.graph_objs as go 
import folium
import plotly.figure_factory as ff

plotly.tools.set_credentials_file(username='Msquirchuk', api_key='kTj4gydNrsbGuMDjMcNn')

%matplotlib inline
sns.set(color_codes=True)

In [None]:
#Histogram of DaysInvoicetoEnrol for the common-application-type subset
wcat['DaysInvoicetoEnrol'].hist()

In [None]:
#Histogram of DaysEnroltoStartdate for the APAC subset
apacenrols['DaysEnroltoStartdate'].hist()

In [None]:
#Joint distribution of the two lead-time columns across all enrolments
sns.jointplot(x='DaysEnroltoStartdate', y="DaysInvoicetoEnrol", data=enrolments)

In [None]:
#Pairwise correlations between the numeric and boolean columns only.
#Selecting the dtypes explicitly keeps this working on pandas versions where
#DataFrame.corr() no longer silently skips text columns.
corr = enrolments.select_dtypes(include=[np.number, 'bool']).corr()

#Correlation of every other column with Cancelled, strongest positive first.
#sort_values returns a new frame, so the result is assigned back.
corr_cancelled = pd.DataFrame(corr['Cancelled'].drop('Cancelled'))
corr_cancelled = corr_cancelled.sort_values(by = 'Cancelled', ascending = False)

plt.figure(figsize = (20,20))
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)
#Title via pyplot directly: the `sns.plt` alias is not available in current seaborn
plt.title('Heatmap of Correlation Matrix')

In [None]:
#Bar chart of the Cancelled column per Metro, on a wide figure so the metro labels fit
plt.figure(figsize = (15,6))
sns.barplot(x='Metro', y = 'Cancelled', data = enrolments)

### Tree Subset

In [None]:
#Columns fed to the decision tree, plus the Cancelled target.
#.copy() detaches the subset from enrolments so the later in-place edits
#(filling values, removing the target column) cannot affect or warn about
#the parent frame.
tree_columns = ['Startdate', 'Enddate','Price','LclPrice','EnrolDate',
                'DaysEnroltoStartdate','DaysInvoicetoEnrol','DaysLeadtoEnroled',
                'Cancelled','APAC']
treesubset = enrolments[tree_columns].copy()
treesubset.head()

In [None]:
#Replace any remaining missing DaysEnroltoStartdate values with 0.
#DataFrame.fillna with a dict returns a new frame; the chained
#`treesubset[col].fillna(..., inplace=True)` form is not guaranteed to write back.
treesubset = treesubset.fillna({'DaysEnroltoStartdate': 0})

In [None]:
#Only the last expression of a cell is rendered automatically, so the summary
#statistics are displayed explicitly; the null counts follow as the cell output.
display(treesubset.describe())
treesubset.isnull().sum()

In [None]:
#Class balance of the Cancelled target
treesubset.Cancelled.value_counts()

In [None]:
#Pairwise scatter plots of the tree columns, coloured by the APAC flag
sns.pairplot(treesubset, hue='APAC')

In [None]:
from sklearn import tree
from sklearn.cross_validation import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt

%matplotlib inline

cancelled = treesubset['Cancelled']
del treesubset['Cancelled']
treesubset

In [None]:
# Names of the feature columns, in the order the tree sees them
features = treesubset.columns.tolist()

# Hold out a test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(treesubset, cancelled, random_state=1)

# Fit a depth-2 decision tree classifier on the training split
ctree = tree.DecisionTreeClassifier(max_depth=2, random_state=1)
ctree.fit(X_train, y_train)

features

In [None]:
from io import StringIO
from sklearn.tree import export_graphviz

# DOT description of the fitted tree, written to an in-memory buffer
out = StringIO()
tree.export_graphviz(ctree, out_file = out)

# Same tree written to disk, with nodes labelled by feature name
export_graphviz(ctree, feature_names=features, out_file='decisiontree.dot')

#### Previously exported decision tree
<img src="decisiontree-enrolments.png" alt="Previously exported decision tree for enrolments">

### Linear or Logistic Regression?

In [None]:
import pandas as pd

#One-hot encode ApplicationType into ApplicationType_* indicator columns
enrolmentswd = pd.get_dummies(enrolments, columns=['ApplicationType'], prefix=['ApplicationType'])

#Feature matrix (X) is the three lead-time columns; target (y) is Cancelled
feature_cols = ['DaysEnroltoStartdate','DaysInvoicetoEnrol','DaysLeadtoEnroled']
X, y = enrolmentswd[feature_cols], enrolmentswd['Cancelled']

In [None]:
#Fill remaining gaps with 0 through DataFrame.fillna (returns a new frame);
#the chained `frame[col].fillna(..., inplace=True)` form is not guaranteed
#to write back to the frame.
enrolmentswd = enrolmentswd.fillna({'DaysEnroltoStartdate': 0, 'ExpectedPayment': 0})
#X was sliced from enrolmentswd before this fill, so rebuild it to make sure
#the model sees the filled values
X = enrolmentswd[feature_cols]
enrolmentswd.isnull().sum()

In [None]:
#split the data
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
#fit the logreg model and print the coefficients
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
zip(feature_cols, logreg.coef_[0])
print(logreg.fit(X_train, y_train))

In [None]:
#test accuracy
y_pred_class = logreg.predict(X_test)
from sklearn import metrics
print(metrics.accuracy_score(y_test,y_pred_class))

In [None]:
from sklearn.dummy import DummyClassifier

#Baseline: always predict the most frequent class seen in training
dumb = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
y_dumb_class = dumb.predict(X_test)
baseline_accuracy = metrics.accuracy_score(y_test, y_dumb_class)
print (baseline_accuracy)

In [None]:
#10-fold cross-validated accuracy of a fresh logistic regression on the full X / y
scores = cross_val_score(estimator=LogisticRegression(), X=X, y=y, cv=10, scoring='accuracy')
print(scores)

In [None]:
#Mean accuracy across the cross-validation folds
print(scores.mean())

In [None]:
#print the confusion matrix
from sklearn import metrics
prds = logreg.predict(X)
print(metrics.confusion_matrix(y_test, y_pred_class))

In [None]:
#generate the ROC curve
import matplotlib.pyplot as plt
# Generate the prediction values for each of the test observations using predict_proba() function rather than just predict
preds = logreg.predict_proba(X_test)[:,1]
# Store the false positive rate(fpr), true positive rate (tpr) in vectors for use in the graph
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
# Store the Area Under the Curve (AUC) so we can annotate our graph with this metric
roc_auc = metrics.auc(fpr,tpr)
# Plot the ROC Curve
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Our aim when modelling is to maximise the area under the curve, the closer to one the better the model.

#### Regression 

In [None]:
# RMSE of the hard class predictions against the test labels.
# Cancelled is a boolean column, so both sides are cast to int first:
# NumPy rejects elementwise subtraction between two boolean arrays, which the
# squared-error computation would otherwise hit.
print('RMSE (No reg.) =', np.sqrt(metrics.mean_squared_error(y_test.astype(int), y_pred_class.astype(int))))

In [None]:
from sklearn.linear_model import RidgeCV

# Candidate regularisation strengths: 10^-5 ... 10^4
alpha_range = np.power(10., np.arange(-5, 5))

# Ridge regression with the alpha chosen by built-in cross-validation
rregcv = RidgeCV(alphas=alpha_range, normalize=True, scoring='neg_mean_squared_error')
rregcv.fit(X_train, y_train)
print('Optimal Alpha Value: ', rregcv.alpha_)

# RMSE of the ridge predictions on the held-out split
preds = rregcv.predict(X_test)
ridge_rmse = np.sqrt(metrics.mean_squared_error(y_test, preds))
print ('RMSE (Ridge CV reg.) =', ridge_rmse)

In [None]:
from sklearn.linear_model import LassoCV

# Candidate regularisation strengths: 10^-4 ... 10^3
alpha_range = 10.**np.arange(-4, 4)

# Lasso regression with the alpha chosen by built-in cross-validation
lascv = LassoCV(normalize=True, alphas=alpha_range)
lascv.fit(X_train, y_train)
print('Optimal Alpha Value: ',lascv.alpha_)

# A bare `lascv.coef_` in the middle of a cell displays nothing, so print it
print('Coefficients: ', lascv.coef_)

# RMSE of the lasso predictions on the held-out split
preds = lascv.predict(X_test)
print('RMSE (Lasso CV reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds)))

In [None]:
from sklearn.linear_model import ElasticNetCV

# Candidate regularisation strengths: 10^-3 ... 10^2
alpha_range = 10.**np.arange(-3, 3)

# Elastic-net regression with the alpha chosen by built-in cross-validation
enetCV = ElasticNetCV(normalize=True, alphas=alpha_range)
enetCV.fit(X_train, y_train)
print('Optimal Alpha Value: ',enetCV.alpha_)

# A bare `enetCV.coef_` in the middle of a cell displays nothing, so print it
print('Coefficients: ', enetCV.coef_)

# RMSE of the elastic-net predictions on the held-out split
preds = enetCV.predict(X_test)
print('RMSE (ENET CV reg.)', np.sqrt(metrics.mean_squared_error(y_test, preds)))

### KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier trained on the same split
KNN_model = KNeighborsClassifier(n_neighbors=5)
KNN_model.fit(X_train, y_train)

# Accuracy of the KNN predictions on the held-out split
y_pred_class = KNN_model.predict(X_test)
knn_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(knn_accuracy)