# **Experiment Notebook**



In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

<hr>

## A. Project


In [None]:
student_name = 'Max Chew'

In [None]:
student_id = '1355 2169'

In [None]:
experiment_id = '2'

<hr>

## B. Experiment Description


In [None]:
experiment_hypothesis = 'Customers with high engagement are less likely to churn. '

In [None]:
experiment_expectations = 'The use of SVM to analyse features related to customer engagement, will show an negative correlation with churn. '

<hr>

## C. Data Understanding


### C.0 Import Packages

In [2]:
# Pandas for data handling
import pandas as pd

# Scikit Learn for ML training
import sklearn

# Altair for plotting
import altair as alt

# numpy for math
import numpy as np

<hr>

### C.1   Load Datasets

In [3]:
# Load training set
# Do not change this code

X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')

In [4]:
# Load validation set
# Do not change this code

X_val = pd.read_csv('X_val.csv')
y_val = pd.read_csv('y_val.csv')

In [5]:
# Load testing set
# Do not change this code

X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

<hr>

<hr>

## D. Feature Selection


In [None]:
feature_selection_executive_summary = 'The features selected demonstrate areas related to with engagement of customers. These features may be related to churn.'

> Rationale: Features irrelevant to the analysis for this specific hypothesis will be dropped. 

In [6]:
# select relevant features

features_list = ['ViewingHoursPerWeek', 'ContentDownloadsPerMonth', 'GenrePreference', 'CohortSimplifed', 'SubscriptionType']

# reduce datasets to selected features

X_train = X_train[features_list]
X_val = X_val[features_list]
X_test = X_test[features_list]

# get shape of datasets
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(18712, 5)
(6238, 5)
(6237, 5)


> Results: Results: Features selected. The training, validation and testing set each have 5 features and 18,712, 6,238 and 6,237 records respectively. 

<hr>

## E. Data Preparation

In [1]:
data_preparation_executive_summary = 'In order to prepare the data for quantitative analysis, one-hot encoding must be performed on categorical features. In this subset of features used for this experiment that is only the Subscription Type feature. '

> Rationale: Categorical data needs to be one-hot encoded in order to perform SVM.

In [7]:
# Categorical features encoded using one-hot encoding

from sklearn.preprocessing import OneHotEncoder

# Create OneHotEncoder
SVM_encoder = OneHotEncoder(drop='first', sparse_output=False)

# save unique values from variable for verification in val and test set 
subscription_values = list(X_train['SubscriptionType'].unique())

# save unique values from variable for verification in val and test set 
genre_values = list(X_train['GenrePreference'].unique())

# save unique values from variable for verification in val and test set 
cohort_values = list(X_train['CohortSimplifed'].unique())

In [8]:
# Fit and transform the categorical column
encoded_cols = SVM_encoder.fit_transform(X_train[['SubscriptionType', 'GenrePreference', 'CohortSimplifed']])

# Convert the result to a df and set names then rejoin
encoded_df = pd.DataFrame(encoded_cols, columns=SVM_encoder.get_feature_names_out(['SubscriptionType', 'GenrePreference', 'CohortSimplifed']))
X_train = X_train.drop(columns=['SubscriptionType', 'GenrePreference', 'CohortSimplifed'])
X_train = pd.concat([X_train, encoded_df], axis=1)
X_train

Unnamed: 0,ViewingHoursPerWeek,ContentDownloadsPerMonth,SubscriptionType_Premium,SubscriptionType_Standard,GenrePreference_Comedy,GenrePreference_Drama,GenrePreference_Fantasy,GenrePreference_Sci-Fi,CohortSimplifed_0-1,CohortSimplifed_0-2,CohortSimplifed_0-3,CohortSimplifed_0-4,CohortSimplifed_1-0,CohortSimplifed_1-1,CohortSimplifed_1-2,CohortSimplifed_1-3,CohortSimplifed_1-4
0,-1.090333,-0.535212,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.211828,0.779096,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.759197,0.571574,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.215220,-1.641998,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.848636,1.055793,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18707,-1.652907,-0.120167,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
18708,-1.728577,1.332489,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
18709,1.718039,-0.258516,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
18710,-0.872902,-0.327690,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
# remove rows from test and val sets which do not contain same values as test for categorical variable
X_val = X_val[X_val['SubscriptionType'].isin(subscription_values)]
X_test = X_test[X_test['SubscriptionType'].isin(subscription_values)]

X_val = X_val[X_val['GenrePreference'].isin(genre_values)]
X_test = X_test[X_test['GenrePreference'].isin(genre_values)]

X_val = X_val[X_val['CohortSimplifed'].isin(cohort_values)]
X_test = X_test[X_test['CohortSimplifed'].isin(cohort_values)]

In [10]:
# use one hot encoder to encode categorical variable for val 

encoded_cols = SVM_encoder.transform(X_val[['SubscriptionType', 'GenrePreference', 'CohortSimplifed']])
encoded_df = pd.DataFrame(encoded_cols, columns=SVM_encoder.get_feature_names_out(['SubscriptionType', 'GenrePreference', 'CohortSimplifed']))
X_val = X_val.drop(columns=['SubscriptionType', 'GenrePreference', 'CohortSimplifed'])
X_val = pd.concat([X_val, encoded_df], axis=1)
X_val

Unnamed: 0,ViewingHoursPerWeek,ContentDownloadsPerMonth,SubscriptionType_Premium,SubscriptionType_Standard,GenrePreference_Comedy,GenrePreference_Drama,GenrePreference_Fantasy,GenrePreference_Sci-Fi,CohortSimplifed_0-1,CohortSimplifed_0-2,CohortSimplifed_0-3,CohortSimplifed_0-4,CohortSimplifed_1-0,CohortSimplifed_1-1,CohortSimplifed_1-2,CohortSimplifed_1-3,CohortSimplifed_1-4
0,-0.072945,-0.466038,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.280787,-1.157779,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.474801,0.848270,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.615391,-1.641998,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.575214,-0.396864,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6233,-0.824987,1.194141,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6234,-1.284138,1.540011,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6235,-1.316458,-0.120167,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6236,-1.309128,-1.572824,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# use one hot encoder to encode categorical variable for test 

encoded_cols = SVM_encoder.transform(X_test[['SubscriptionType', 'GenrePreference', 'CohortSimplifed']])
encoded_df = pd.DataFrame(encoded_cols, columns=SVM_encoder.get_feature_names_out(['SubscriptionType', 'GenrePreference', 'CohortSimplifed']))
X_test = X_test.drop(columns=['SubscriptionType', 'GenrePreference', 'CohortSimplifed'])
X_test = pd.concat([X_test, encoded_df], axis=1)
X_test

Unnamed: 0,ViewingHoursPerWeek,ContentDownloadsPerMonth,SubscriptionType_Premium,SubscriptionType_Standard,GenrePreference_Comedy,GenrePreference_Drama,GenrePreference_Fantasy,GenrePreference_Sci-Fi,CohortSimplifed_0-1,CohortSimplifed_0-2,CohortSimplifed_0-3,CohortSimplifed_0-4,CohortSimplifed_1-0,CohortSimplifed_1-1,CohortSimplifed_1-2,CohortSimplifed_1-3,CohortSimplifed_1-4
0,0.922606,-0.950257,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.721057,0.364051,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.554434,1.332489,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.973363,0.709922,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.147370,0.087355,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6232,0.129871,-0.950257,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6233,0.339434,0.640748,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6234,-0.437432,-0.396864,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6235,-0.294453,-1.226953,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


> Results: Successfully one-hot encoded variables 

<hr>

## F. Feature Engineering

In [None]:
data_preparation_executive_summary_2 = 'This is not necessary, all new features and transformations were performed in pre-processing in experiment 0. Data was scaled before export in Experiment 0.'

<hr>

## G. Train Machine Learning Model

In [None]:
train_model_executive_summary = 'The purpose of this section is to train a Support Vector Machine Model on the processed data. Following this the model will be tuned using the validation set, before finally being assessed using the test set. '

### G.1 Import Algorithm

> Rationale: Importing SVM from sklearn to model the data processed 

In [12]:
# import SVM from sklearn

from sklearn.svm import SVC

<hr>

### G.2 Set Hyperparameters

> Rationale: values for c and penalty which are to be tested 

In [13]:
# import metrics for performance
from sklearn.metrics import roc_auc_score, recall_score, f1_score, confusion_matrix, classification_report


# Set Hyperparameters
best_params = None
best_score = 0

c_value = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
kernal_value1 = ['linear', 'rbf', 'sigmoid']
gamme_value = ['scale', 'auto', 0.001, 0.01, 0.1, 1]


degree_value = [2,3,4,5]


<hr>

### G.3 Fit Model

In [14]:
y_train = np.ravel(y_train)
y_val = np.ravel(y_val)
y_test = np.ravel(y_test)

model0 = SVC(C=1, kernel='rbf', gamma='scale',class_weight='balanced')
model0.fit(X_train, y_train)

# Predict on validation set
y_pred_val = model0.predict(X_val)

# Calculate F1 score
score0 = f1_score(y_val, y_pred_val, average='weighted')
score0

0.6330788108112972

In [15]:
model1 = SVC(C=0.1, kernel='rbf', gamma='scale',class_weight='balanced')
model1.fit(X_train, y_train)

# Predict on validation set
y_pred_val = model1.predict(X_val)

# Calculate F1 score
score1 = f1_score(y_val, y_pred_val, average='weighted')
score1

0.6273401312996262

In [16]:
model2 = SVC(C=10, kernel='rbf', gamma='scale',class_weight='balanced')
model2.fit(X_train, y_train)

# Predict on validation set
y_pred_val = model2.predict(X_val)

# Calculate F1 score
score2 = f1_score(y_val, y_pred_val, average='weighted')
score2

0.6219139099072856

In [17]:
model3 = SVC(C=1, kernel='sigmoid', gamma='scale',class_weight='balanced')
model3.fit(X_train, y_train)

# Predict on validation set
y_pred_val = model3.predict(X_val)

# Calculate F1 score
score3 = f1_score(y_val, y_pred_val, average='weighted')
score3

0.5860189232444102

In [18]:
model4 = SVC(C=1, kernel='linear', gamma='scale',class_weight='balanced')
model4.fit(X_train, y_train)

# Predict on validation set
y_pred_val = model4.predict(X_val)

# Calculate F1 score
score4 = f1_score(y_val, y_pred_val, average='weighted')
score4

0.6362098916684595

In [19]:
model5 = SVC(C=1, kernel='poly', degree=2, gamma='scale',class_weight='balanced')
model5.fit(X_train, y_train)

# Predict on validation set
y_pred_val = model5.predict(X_val)

# Calculate F1 score
score5 = f1_score(y_val, y_pred_val, average='weighted')
score5

0.6169696372954118

In [20]:
model6 = SVC(C=1, kernel='poly', degree=4, gamma='scale',class_weight='balanced')
model6.fit(X_train, y_train)

# Predict on validation set
y_pred_val = model6.predict(X_val)

# Calculate F1 score
score6 = f1_score(y_val, y_pred_val, average='weighted')
score6

0.6310859489380303

In [21]:
model7 = SVC(C=1, kernel='poly', degree=6, gamma='scale',class_weight='balanced')
model7.fit(X_train, y_train)

# Predict on validation set
y_pred_val = model7.predict(X_val)

# Calculate F1 score
score7 = f1_score(y_val, y_pred_val, average='weighted')
score7

0.6262831536837615

In [22]:
model8 = SVC(C=1, kernel='poly', degree=6, gamma='auto',class_weight='balanced')
model8.fit(X_train, y_train)

# Predict on validation set
y_pred_val = model8.predict(X_val)

# Calculate F1 score
score8 = f1_score(y_val, y_pred_val, average='weighted')
score8

0.7494639410390758

In [23]:
model9 = SVC(C=1, kernel='poly', degree=6, gamma=0.001,class_weight='balanced')
model9.fit(X_train, y_train)

# Predict on validation set
y_pred_val = model9.predict(X_val)

# Calculate F1 score
score9 = f1_score(y_val, y_pred_val, average='weighted')
score9

0.7408803603852331

<hr>

### G.4 Model Technical Performance

In [24]:
# apply best params from hyperparameter tuning on testing set 

y_test_pred = model8.predict(X_test)

# evaluate model performance 

print(roc_auc_score(y_test, y_test_pred))

print(recall_score(y_test, y_test_pred))

print(f1_score(y_test, y_test_pred, average='weighted'))
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

0.5097895982511328
0.03577512776831346
0.7357710117872126
[[4981   82]
 [1132   42]]
              precision    recall  f1-score   support

           0       0.81      0.98      0.89      5063
           1       0.34      0.04      0.06      1174

    accuracy                           0.81      6237
   macro avg       0.58      0.51      0.48      6237
weighted avg       0.73      0.81      0.74      6237



> Results: results from the SVM were only marginally better than the results from the baseline and experiment1 models. 

<hr>

### G.5 Business Impact from Current Model Performance

> Results: There is insufficient evidence to accept that these features have an influence on predicting churn. 

<hr>

## H. Experiment Outcomes

In [33]:
final_experiment_outcome = 'Hypothesis Rejected'

> Key Learnings: There is insufficient evidence to overturn the null hypothesis. As as result there is no clear correlation or predictive ability when using support vector machines. 

> Recommendations for Next Experiment: An attempt to use a grid search technique was too time intensive but perhaps could yield better results.

<hr>

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8c3586fc-cd83-4e7b-a04a-11476af0d44a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>