# **Experiment Notebook**



In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

<hr>

## A. Project


In [2]:
student_name = 'Max Chew'

In [3]:
student_id = '1355 2169'

In [4]:
experiment_id = '3'

<hr>

## B. Experiment Description


In [5]:
experiment_hypothesis = 'Churn is influenced by demographic factors such as Gender and Payment Method.'

In [6]:
experiment_expectations = 'This experiment uses a combination of demographic factors such as gender and behavioural features to assess there ability to predict churn. The use of KNN can capture local patterns within these features and may be able to more accurately predict churn behaviour. '

<hr>

## C. Data Understanding


### C.0 Import Packages

In [7]:
# Pandas for data handling
import pandas as pd

# Scikit Learn for ML training
import sklearn

# Altair for plotting
import altair as alt

# numpy for math
import numpy as np

<hr>

### C.1   Load Datasets

In [8]:
# Load training set
# Do not change this code

X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')

In [9]:
# Load validation set
# Do not change this code

X_val = pd.read_csv('X_val.csv')
y_val = pd.read_csv('y_val.csv')

In [10]:
# Load testing set
# Do not change this code

X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

<hr>

<hr>

## D. Feature Selection


In [11]:
feature_selection_executive_summary = 'The features selected demonstrate areas related to with engagement of customers. These features may be related to churn.'

> Rationale: Features irrelevant to the analysis for this specific hypothesis will be dropped. 

In [12]:
# select relevant features

features_list = ['Gender', 'PaymentMethod', 'DeviceRegistered', 'MonthlyCharges', 'SubscriptionType', 'CohortSimplifed']

# reduce datasets to selected features

X_train = X_train[features_list]
X_val = X_val[features_list]
X_test = X_test[features_list]

# get shape of datasets
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(18712, 6)
(6238, 6)
(6237, 6)


> Results: Results: Features selected. The training, validation and testing set each have 5 features and 18,712, 6,238 and 6,237 records respectively. 

<hr>

## E. Data Preparation

In [13]:
data_preparation_executive_summary = 'In order to prepare the data for quantitative analysis, one-hot encoding must be performed on categorical features. In this subset of features used for this experiment that is only the Subscription Type feature. '

> Rationale: Categorical data needs to be one-hot encoded in order to perform logistic regression.  

In [14]:
# Categorical features encoded using one-hot encoding

from sklearn.preprocessing import OneHotEncoder

# Create OneHotEncoder
KNN_encoder = OneHotEncoder(drop='first', sparse_output=False)

# save unique values from variable for verification in val and test set 
subscription_values = list(X_train['SubscriptionType'].unique())

# save unique values from variable for verification in val and test set 
gender_values = list(X_train['Gender'].unique())

# save unique values from variable for verification in val and test set 
payment_values = list(X_train['PaymentMethod'].unique())

# save unique values from variable for verification in val and test set 
cohort_values = list(X_train['CohortSimplifed'].unique())

# save unique values from variable for verification in val and test set 
devices_values = list(X_train['DeviceRegistered'].unique())

In [15]:
# Fit and transform the 'SubscriptionType' column
encoded_cols = KNN_encoder.fit_transform(X_train[['SubscriptionType','DeviceRegistered', 'Gender', 'PaymentMethod', 'CohortSimplifed']])

# Convert the result to a df and set names then rejoin
encoded_df = pd.DataFrame(encoded_cols, columns=KNN_encoder.get_feature_names_out(['SubscriptionType','DeviceRegistered', 'Gender', 'PaymentMethod', 'CohortSimplifed']))
X_train = X_train.drop(columns=['SubscriptionType','DeviceRegistered', 'Gender', 'PaymentMethod', 'CohortSimplifed'])
X_train = pd.concat([X_train, encoded_df], axis=1)
X_train

Unnamed: 0,MonthlyCharges,SubscriptionType_Premium,SubscriptionType_Standard,DeviceRegistered_Mobile,DeviceRegistered_TV,DeviceRegistered_Tablet,Gender_Male,PaymentMethod_Credit card,PaymentMethod_Electronic check,PaymentMethod_Mailed check,CohortSimplifed_0-1,CohortSimplifed_0-2,CohortSimplifed_0-3,CohortSimplifed_0-4,CohortSimplifed_1-0,CohortSimplifed_1-1,CohortSimplifed_1-2,CohortSimplifed_1-3,CohortSimplifed_1-4
0,0.016420,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.500649,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.169333,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.843668,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.380670,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18707,-0.078773,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
18708,0.963047,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
18709,-1.008796,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
18710,1.440237,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
# remove rows from test and val sets which do not contain same values as test for categorical variable
X_val = X_val[X_val['SubscriptionType'].isin(subscription_values)]
X_test = X_test[X_test['SubscriptionType'].isin(subscription_values)]

X_val = X_val[X_val['Gender'].isin(gender_values)]
X_test = X_test[X_test['Gender'].isin(gender_values)]

X_val = X_val[X_val['PaymentMethod'].isin(payment_values)]
X_test = X_test[X_test['PaymentMethod'].isin(payment_values)]

X_val = X_val[X_val['CohortSimplifed'].isin(cohort_values)]
X_test = X_test[X_test['CohortSimplifed'].isin(cohort_values)]

X_val = X_val[X_val['DeviceRegistered'].isin(devices_values)]
X_test = X_test[X_test['DeviceRegistered'].isin(devices_values)]

In [17]:
# use one hot encoder to encode categorical variable for val 

encoded_cols = KNN_encoder.transform(X_val[['SubscriptionType','DeviceRegistered', 'Gender', 'PaymentMethod', 'CohortSimplifed']])
encoded_df = pd.DataFrame(encoded_cols, columns=KNN_encoder.get_feature_names_out(['SubscriptionType','DeviceRegistered', 'Gender', 'PaymentMethod', 'CohortSimplifed']))
X_val = X_val.drop(columns=['SubscriptionType','DeviceRegistered', 'Gender', 'PaymentMethod', 'CohortSimplifed'])
X_val = pd.concat([X_val, encoded_df], axis=1)
X_val

Unnamed: 0,MonthlyCharges,SubscriptionType_Premium,SubscriptionType_Standard,DeviceRegistered_Mobile,DeviceRegistered_TV,DeviceRegistered_Tablet,Gender_Male,PaymentMethod_Credit card,PaymentMethod_Electronic check,PaymentMethod_Mailed check,CohortSimplifed_0-1,CohortSimplifed_0-2,CohortSimplifed_0-3,CohortSimplifed_0-4,CohortSimplifed_1-0,CohortSimplifed_1-1,CohortSimplifed_1-2,CohortSimplifed_1-3,CohortSimplifed_1-4
0,1.625393,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.480921,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.679451,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.383981,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.633403,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6233,0.474775,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6234,0.800447,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6235,0.801074,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6236,-0.293580,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# use one hot encoder to encode categorical variable for test 

encoded_cols = KNN_encoder.transform(X_test[['SubscriptionType','DeviceRegistered', 'Gender', 'PaymentMethod', 'CohortSimplifed']])
encoded_df = pd.DataFrame(encoded_cols, columns=KNN_encoder.get_feature_names_out(['SubscriptionType','DeviceRegistered', 'Gender', 'PaymentMethod', 'CohortSimplifed']))
X_test = X_test.drop(columns=['SubscriptionType','DeviceRegistered', 'Gender', 'PaymentMethod', 'CohortSimplifed'])
X_test = pd.concat([X_test, encoded_df], axis=1)
X_test

Unnamed: 0,MonthlyCharges,SubscriptionType_Premium,SubscriptionType_Standard,DeviceRegistered_Mobile,DeviceRegistered_TV,DeviceRegistered_Tablet,Gender_Male,PaymentMethod_Credit card,PaymentMethod_Electronic check,PaymentMethod_Mailed check,CohortSimplifed_0-1,CohortSimplifed_0-2,CohortSimplifed_0-3,CohortSimplifed_0-4,CohortSimplifed_1-0,CohortSimplifed_1-1,CohortSimplifed_1-2,CohortSimplifed_1-3,CohortSimplifed_1-4
0,-0.783305,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-0.281495,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.027375,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.790799,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.722307,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6232,0.234552,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6233,1.068298,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6234,1.506704,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6235,0.548539,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


> Results: Successfully one-hot encoded variables 

<hr>

## F. Feature Engineering

In [19]:
data_preparation_executive_summary_2 = 'This is not necessary, all new features and transformations were performed in pre-processing in experiment 0. Data was scaled before export in Experiment 0.'

<hr>

## G. Train Machine Learning Model

In [20]:
train_model_executive_summary = 'The purpose of this section is to train a K-Nearest Neighbours Model on the processed data. Following this the model will be tuned using the validation set, before finally being assessed using the test set. '

### G.1 Import Algorithm

> Rationale: Importing KNN from sklearn to model the data processed 

In [21]:
# import KNN from sklearn

from sklearn.neighbors import KNeighborsClassifier

# hypopt for hyperparameter tuning
from hypopt import GridSearch

<hr>

### G.2 Set Hyperparameters

> Rationale: values for c and penalty which are to be tested 

In [22]:
# set y values to 1d array to prevent warning

y_train = np.ravel(y_train)
y_val = np.ravel(y_val)
y_test = np.ravel(y_test)


# set hyperparameters 

param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree'],
    'leaf_size': [10, 20, 30, 40],
    'metric': ['euclidean', 'manhattan']
}

<hr>

### G.3 Fit Model

In [23]:
# initiate knn model 

knn = KNeighborsClassifier()

gridsearchknn = GridSearch(knn)

gridsearchknn.fit(X_train, y_train, param_grid, X_val, y_val, scoring_params='f1_weighted')

Comparing 192 parameter setting(s) using 4 CPU thread(s) ( 48 job(s) per thread ).


<hr>

### G.4 Model Technical Performance

In [24]:
# import metrics for performance
from sklearn.metrics import roc_auc_score, recall_score, f1_score, confusion_matrix, classification_report

# Use best params from grid search on test set 


gridsearchknn.score(X_test, y_test)

0.7381754048420716

In [25]:
# get y preds from model 

training_preds = gridsearchknn.predict(X_test)

# evaluate model performance 

print(roc_auc_score(y_test, training_preds))

print(recall_score(y_test, training_preds))

print(f1_score(y_test, training_preds, average='weighted'))
print(confusion_matrix(y_test, training_preds))
print(classification_report(y_test, training_preds))

0.5076676129490734
0.13798977853492334
0.7168867477055109
[[4442  621]
 [1012  162]]
              precision    recall  f1-score   support

           0       0.81      0.88      0.84      5063
           1       0.21      0.14      0.17      1174

    accuracy                           0.74      6237
   macro avg       0.51      0.51      0.51      6237
weighted avg       0.70      0.74      0.72      6237



> Results: model successfully run 

<hr>

### G.5 Business Impact from Current Model Performance

> Results: Model underperformed when compared to the baseline 

<hr>

## H. Experiment Outcomes

In [27]:
final_experiment_outcome = 'Hypothesis Confirmed'

> Key Learnings: poor performance could be due to high number of categorical features

> Recommendations for Next Experiment: choose more appropriate modeling technique for data selected.

<hr>

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8c3586fc-cd83-4e7b-a04a-11476af0d44a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>