### Telecom Churn Prediction: KNN

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Load Dataset

In [37]:
# import machine learning libraries
telecom = pd.read_csv("telecom_churn.csv")
telecom.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


### ML Workflow

- ML Approach
- Data Transformation
- Feature Engineering
- Model Training
- Model Building with KNN
- Model Evaluation


## Data Transformation

In [38]:
# define function --> transform data
def data_transformation(init_dataset, target):
  churn = init_dataset[target]
  init_dataset = init_dataset.drop(columns=[target], axis=1)
  transformed_data = pd.concat([init_dataset, churn], axis=1)
  return transformed_data

In [39]:
# transform data
telecom = data_transformation(telecom, "Churn")
telecom.head()

Unnamed: 0,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins,Churn
0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0,0
1,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7,0
2,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2,0
3,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6,0
4,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1,0


## Feature Engineering

In [40]:
telecom.head()

Unnamed: 0,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins,Churn
0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0,0
1,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7,0
2,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2,0
3,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6,0
4,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1,0


Determine feature 1: Customer Lifetime Value(CLV)

+ Average Revenue per User(ARPU):  This can be estimated by taking the average of MonthlyCharge over the AccountWeeks.
+ Average Customer Lifetime(ACL): This can be estimated from the AccountWeeks

formula: CLV=Average Revenue per User (ARPU)×Average Customer Lifetime

In [41]:
# feature 1: average customer lifetyme
telecom["ARPU"] = telecom["MonthlyCharge"]
telecom["CustomerLifeTime"] = telecom["AccountWeeks"]
telecom["CLV"] = telecom["ARPU"] * telecom["CustomerLifeTime"]

Determine feature 2: net promoter score

+ Definition: NPS measures customer satisfaction and loyalty based on the likelihood of recommending a service to others.
+ Calculation: This might require survey data, but if not available, proxy features can be used.
Implementation:
+ Proxy for NPS: Use CustServCalls as a proxy, where a higher number of calls might indicate lower satisfaction.
Derived Feature: You could create an NPS score by assuming a mapping between the number of customer service calls and a promoter/detractor status. 

For example:
+ Promoters: CustServCalls = 0 --> Score 9
+ Passives: CustServCalls = 1 --> Score 7
+ Detractors: CustServCalls > 1 --> Score 3

Alternatively, if NPS scores are directly available, you can include them as-is.

In [42]:
# feature 2: net promoter score
telecom["NPS"] = telecom["CustServCalls"].apply(lambda x: 9 if x==0 else 7 if x==1 else 3)

In [43]:
telecom = data_transformation(telecom, "Churn")
telecom.head()

Unnamed: 0,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins,ARPU,CustomerLifeTime,CLV,NPS,Churn
0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0,89.0,128,11392.0,7,0
1,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7,82.0,107,8774.0,7,0
2,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2,52.0,137,7124.0,9,0
3,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6,57.0,84,4788.0,3,0
4,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1,41.0,75,3075.0,3,0


## Model Training

In [55]:
# train the model on X and Y variable
def model_training(telecom_data, testsize): 
  X = telecom_data.drop(columns=["Churn"], axis=1)
  Y = telecom_data["Churn"]

  xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=testsize, random_state=1234)
  training = xtrain, ytrain
  testing = xtest, ytest
  return training, testing

In [56]:
training, testing = model_training(telecom, 0.25)
xtrain_tel, ytrain_tel = training
xtest_tel, ytest_tel = testing

for train, test, i in zip(training, testing, list(range(0,1))): 
  print(f"Training: X {train.shape} - Y {train.shape}")
  print(f"Testing: X {test.shape} - Y {test.shape}")

Training: X (2499, 14) - Y (2499, 14)
Testing: X (834, 14) - Y (834, 14)


In [None]:
# model building
# def knn_model_building(init_data, training, testing):