# **Experiment Notebook**



In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

<hr>

## A. Project


In [None]:
student_name = 'Yi Xiao'

In [None]:
student_id = '14356721'

In [None]:
experiment_id = '2'

<hr>

## B. Experiment Description


In [None]:
experiment_hypothesis = 'The hypothesis is that a KNN classifier can accurately predict churn using the provided training data and will score higher than the baseline model.'

In [None]:
experiment_expectations = 'We expect the KNN classifier to achieve reasonable classification accuracy.'

<hr>

## C. Data Understanding


### C.0 Import Packages

In [4]:
# Pandas for data handling
import pandas as pd

# Scikit Learn for ML training
import sklearn

# Altair for plotting
import altair as alt

# <fill_this>
#import 

<hr>

### C.1   Load Datasets

In [5]:
# Load training set
# Do not change this code

X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')

In [6]:
# Load validation set
# Do not change this code

X_val = pd.read_csv('X_val.csv')
y_val = pd.read_csv('y_val.csv')

In [7]:
# Load testing set
# Do not change this code

X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

<hr>

<hr>

## D. Feature Selection


In [None]:
feature_selection_executive_summary = 'Use the same list of features from experiment 0.'

In [None]:
features_list = ['AccountAge', 'MonthlyCharges', 'TotalCharges', 'ViewingHoursPerWeek',
       'AverageViewingDuration', 'ContentDownloadsPerMonth']

<hr>

## E. Data Preparation

In [None]:
data_preparation_executive_summary = 'Our goal is to ensure that the data is clean, properly scaled, and ready for KNN training, as KNN performance is highly dependent on feature scaling due to the algorithm\'s reliance on distances between data points.'

> Rationale: The KNN algorithm requires normalised data to ensure that all features contribute equally to the distance calculations.

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

> Results: he data has been scaled to a range suitable for KNN.

<hr>

## F. Feature Engineering

In [None]:
data_preparation_executive_summary_2 = 'No specific feature engineering needed.'

> Rationale: No specific feature engineering needed.

> Results: No specific feature engineering needed.

<hr>

## G. Train Machine Learning Model

In [None]:
train_model_executive_summary = 'We will train a K-Nearest Neighbors classifier using the scaled dataset, experimenting with different values for the number of neighbors (k).'

### G.1 Import Algorithm

> Rationale: KNN is chosen due to its simplicity and effectiveness for classification problems where the decision boundaries may not be linear.

In [2]:
from sklearn.neighbors import KNeighborsClassifier

<hr>

### G.2 Set Hyperparameters

> Rationale: The number of neighbors (k) significantly affects the model's performance. A small k may lead to overfitting, while a large k might underfit.

In [9]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}

<hr>

### G.3 Fit Model

In [10]:
knn = KNeighborsClassifier()

grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train_scaled, y_train.values.ravel())

<hr>

### G.4 Model Technical Performance

In [19]:
val_accuracies = [model.score(X_val_scaled, y_val) for model in knn_models]

In [13]:
from sklearn.metrics import accuracy_score, f1_score

best_knn = grid_search.best_estimator_

train_preds_best = best_knn.predict(X_train_scaled)
val_preds_best = best_knn.predict(X_val_scaled)
test_preds_best = best_knn.predict(X_test_scaled)

train_accuracy_best = accuracy_score(y_train, train_preds_best)
val_accuracy_best = accuracy_score(y_val, val_preds_best)
test_accuracy_best = accuracy_score(y_test, test_preds_best)

train_f1_best = f1_score(y_train, train_preds_best, average='weighted')
val_f1_best = f1_score(y_val, val_preds_best, average='weighted')
test_f1_best = f1_score(y_test, test_preds_best, average='weighted')

print(f"Train Set - Accuracy: {train_accuracy_best:.4f}, F1-Score: {train_f1_best:.4f}")
print(f"Validation Set - Accuracy: {val_accuracy_best:.4f}, F1-Score: {val_f1_best:.4f}")
print(f"Test Set - Accuracy: {test_accuracy_best:.4f}, F1-Score: {test_f1_best:.4f}")

best_k = grid_search.best_params_['n_neighbors']
print(f"Best KNN Model found with k = {best_k}")

Train Set - Accuracy: 0.8342, F1-Score: 0.7952
Validation Set - Accuracy: 0.8168, F1-Score: 0.7733
Test Set - Accuracy: 0.8151, F1-Score: 0.7673
Best KNN Model found with k = 11


> Results: These results show that the model with k=7 has the best performance in terms of both accuracy and F1-score.

<hr>

### G.5 Business Impact from Current Model Performance

In [15]:
avg_subscription_fee = X_test['MonthlyCharges'].mean()
discount_per_month = (X_test['MonthlyCharges'].mean())/2  
discount_duration_months = 3
retention_offer_cost_per_customer = discount_per_month * discount_duration_months  
lost_revenue_per_churn = X_test['MonthlyCharges'].mean()

predicted_churners = sum(test_preds_best == 1)  
actual_churners = sum((y_val == 1).values) 
false_negatives = actual_churners - predicted_churners  

total_intervention_cost = predicted_churners * retention_offer_cost_per_customer
total_lost_revenue_from_false_negatives = false_negatives * lost_revenue_per_churn

print(f"Total predicted churners (Model 3): {predicted_churners}")
print(f"Total cost of retention interventions: ${total_intervention_cost}")
print(f"Total lost revenue from undetected churners (false negatives): ${total_lost_revenue_from_false_negatives}")

Total predicted churners (Model 3): 315
Total cost of retention interventions: $47250
Total lost revenue from undetected churners (false negatives): $[11411.01329786]


> Results: Model 3 predicted 489 customers as likely to churn, resulting in an intervention cost of $73,350 based on offering retention discounts.

<hr>

## H. Experiment Outcomes

In [1]:
final_experiment_outcome = 'Hypothesis Partially Confirmed'

> Key Learnings: KNN predicted a significant number of churners. While baseline model has better overall accuracy, the higher F1-Score shows that KNN might handle imbalanced data better.

> Recommendations for Next Experiment: To improve performance, we can implement advanced classification algorithms such as Gradient Boosting, which may better capture complex patterns in the data.

<hr>

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=803b396d-170a-478e-bdcc-2487a2a4ebcf' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>