# Telco Customer Churn Prediction

In [None]:
# Importing libs
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score, precision_score,recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (
    GradientBoostingClassifier,
    AdaBoostClassifier,
    StackingClassifier,
    VotingClassifier
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier

## Get the data

In [None]:
# Load every Excel workbook in the data directory into a DataFrame named
# "<file stem>_df" (e.g. Telco_customer_churn.xlsx -> Telco_customer_churn_df).
data_path = "../artifacts/data"

# Sort for a deterministic load order and keep only Excel workbooks, so that
# read_excel is never handed a stray file (hidden files, "~$" lock files, ...).
files = sorted(
    file for file in os.listdir(data_path)
    if file.lower().endswith((".xlsx", ".xlsm", ".xls")) and not file.startswith("~$")
)
for file in files:
    file_name = os.path.splitext(file)[0]
    # Later cells refer to the frames through these generated global names.
    globals()[f'{file_name}_df'] = pd.read_excel(os.path.join(data_path, file))

# All loaded frames, each listed exactly once.
# NOTE(review): the services frame used to appear twice here; if a sixth table
# was intended in its place, add it once its workbook is confirmed to exist.
df_s = [
    Telco_customer_churn_df,
    Telco_customer_churn_demographics_df,
    Telco_customer_churn_location_df,
    Telco_customer_churn_population_df,
    Telco_customer_churn_services_df,
    CustomerChurn_df,
]

## Data Analysis and EDA/Visualization

In [None]:
# Preview the first rows of the main churn table.
Telco_customer_churn_df.head()

In [None]:
# Print column dtypes and non-null counts of the main churn table.
Telco_customer_churn_df.info()

In [None]:
# List the column labels of the main churn table.
Telco_customer_churn_df.columns

In [None]:
# Collect the low-cardinality columns of the main churn table (between 2 and 9
# distinct non-null values) and treat them as categorical.
categorical_data = []
for column in Telco_customer_churn_df:
    # Series.value_counts replaces the deprecated top-level pd.value_counts;
    # both exclude NaN by default.
    value_counts = Telco_customer_churn_df[column].value_counts()
    value_counts_len = len(value_counts.index.to_list())

    # Skip constant columns (1 value) and entirely empty columns (0 values):
    # neither carries any category information.
    if 1 < value_counts_len < 10:
        categorical_data.append(column)
        print(f"Unique values for column '{column}' : {value_counts.index.to_list()} having len {value_counts_len}")

In [None]:
# Telco_customer_churn_df_copy = Telco_customer_churn_df.copy().fillna(''),
# Telco_customer_churn_demographics_df_copy = Telco_customer_churn_demographics_df.copy().fillna(''),
# Telco_customer_churn_location_df_copy = Telco_customer_churn_location_df.copy().fillna(''),
# Telco_customer_churn_population_df_copy = Telco_customer_churn_population_df.copy().fillna(''),
# Telco_customer_churn_services_df_copy = Telco_customer_churn_services_df.copy().fillna(''),
# Telco_customer_churn_services_df_copy = Telco_customer_churn_services_df.copy().fillna(''),

# df_s = [
#     Telco_customer_churn_df_copy,
#     Telco_customer_churn_demographics_df_copy,
#     Telco_customer_churn_location_df_copy,
#     Telco_customer_churn_population_df_copy,
#     Telco_customer_churn_services_df_copy,
#     Telco_customer_churn_services_df_copy
# ]
# os.makedirs("./df_info", exist_ok=True)
# count = 0
# for df in enumerate(df_s):
#     file_path = f'./df_info/{count}.txt'
#     count+=1
#     info_str = df.info(buf=None)
#     with open(file_path, "w") as f:
#         f.write(info_str)

In [None]:
# Display the low-cardinality columns collected from the main churn table.
# TODO: the five companion frames (demographics, location, etc.) still have to
# be dealt with in the same way.
categorical_data

#### Categorical Data

In [None]:
# Count plot of every categorical column, split by churn outcome.
n_cols = 2
# Size the grid from the number of features (ceiling division) instead of a
# fixed 9x2 layout, so extra features cannot index past the last axis.
n_rows = max(1, -(-len(categorical_data) // n_cols))
# Keep the original proportions: the 9-row layout was 40 inches tall.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 40 / 9 * n_rows), squeeze=False)
axes = axes.flatten()
for ax, feature in zip(axes, categorical_data):
    sns.countplot(x=feature, data=Telco_customer_churn_df, palette='Set2', ax=ax, hue='Churn Value')

# Hide trailing axes that did not receive a plot.
for ax in axes[len(categorical_data):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

#### Continuous Data

In [None]:
# Histograms of three numeric columns of the main churn table, one stacked
# panel per column, coloured by churn outcome.
continous_data = ['Monthly Charges', 'Churn Score', 'CLTV']
fig, axes = plt.subplots(3, 1, figsize=(10, 15))
axes = axes.ravel()
for panel, column_name in zip(axes, continous_data):
    sns.histplot(
        data=Telco_customer_churn_df,
        x=column_name,
        hue='Churn Value',
        palette='Paired',
        ax=panel,
    )
plt.tight_layout()
plt.show()

In [None]:
# Histograms of three numeric columns of the services table, one stacked panel
# per column.
continous_data = ['Tenure in Months', 'Total Revenue', 'Total Charges']
fig, axes = plt.subplots(3, 1, figsize=(10, 15))
axes = axes.ravel()
for panel, column_name in zip(axes, continous_data):
    sns.histplot(data=Telco_customer_churn_services_df, x=column_name, ax=panel)
plt.tight_layout()
plt.show()

#### Number of Dependents

In [None]:
# Distribution of the number of dependents in the demographics table.
plt.figure(figsize=(8, 10))
# Pass the frame by keyword so the call does not depend on seaborn's
# positional-argument order.
sns.violinplot(data=Telco_customer_churn_demographics_df, y='Number of Dependents', color='m')
# The violin is vertical: the y-axis carries the number of dependents and the
# width of the shape is the estimated density, so label the axes accordingly.
plt.xlabel("Density")
plt.ylabel("Number of Dependents")
plt.title("Distribution of Number of Dependents")
plt.show()

In [None]:
# Geographic scatter of the customers on an OpenStreetMap base layer; the
# point colour encodes the churn score and the hover title shows the churn value.
map_options = dict(
    lat='Latitude',
    lon='Longitude',
    color='Churn Score',
    hover_name='Churn Value',
    zoom=5,
    height=800,
    width=800,
)
fig = px.scatter_mapbox(Telco_customer_churn_df, **map_options)
fig.update_layout(mapbox_style="open-street-map")
fig.show()

In [None]:
# Reason for churn -> the most important breakdown to analyze.
# Font scaling only affects text created while it is active, so it has to be in
# place before the figure is built. The context manager also restores the
# previous settings afterwards instead of leaking a 5x font scale into every
# later plot.
with sns.plotting_context("notebook", font_scale=5):
    plt.figure(figsize=(25, 40))
    sns.countplot(y='Churn Reason', data=Telco_customer_churn_df)
    plt.title("Churn Reasons")
    plt.show()

## Feature Engineering

In [None]:
# Build the modelling frame as a new object so the raw table stays untouched.
# NOTE(review): the dropped columns look like identifiers, location text and
# churn-derived fields - confirm that is the intent.
columns_to_drop = [
    'CustomerID',
    'Lat Long',
    'Churn Reason',
    'Country',
    'State',
    'City',
    'Zip Code',
    'Churn Label',
    'Count',
    'Churn Score',
]
df = Telco_customer_churn_df.drop(columns=columns_to_drop)

In [None]:
# NOTE: the encoder setup below is disabled, so this cell does NOT define
# `one_hot_encoder` or `categorical_features`.
# one_hot_encoder = OneHotEncoder()
# categorical_features = [
#     'Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines',
#     'Internet Service','Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 
#     'Streaming TV', 'Streaming Movies', 'Contract','Paperless Billing', 'Payment Method', 'Churn Value'
# ]
# List the columns of the modelling frame.
df.columns



In [None]:
# Print column dtypes and non-null counts of the modelling frame.
df.info()

In [None]:
# One-hot encode the categorical columns into a separate dense frame.
dummy_cat_features = [
    'Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines',
    'Internet Service','Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 
    'Streaming TV', 'Streaming Movies', 'Contract','Paperless Billing', 'Payment Method'
]
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; the notebook already relies on 1.2+ (`estimator=` on BaggingClassifier).
ohe = OneHotEncoder(sparse_output=False)
encoded_mat = ohe.fit_transform(df[dummy_cat_features])
# Reuse df's index so the encoded rows stay aligned with their source rows.
df_new = pd.DataFrame(
    encoded_mat,
    columns=ohe.get_feature_names_out(dummy_cat_features),
    index=df.index,
)
df_new.info()
# import pickle
# with open("../artifacts/transformer/preprocessor.pkl", "rb") as file:
#         preprocessor = pickle.load(file)

# transformed_data = preprocessor.transform(df)
# df_n = pd.DataFrame(transformed_data, columns=df.columns)
# df_n

### Correlation Heatmap (Matrix)

In [None]:
# Check dtypes and non-null counts before converting and cleaning columns.
df.info()

In [None]:
# Convert 'Total Charges' to numbers; errors='coerce' turns any value that
# cannot be parsed into NaN instead of raising.
# NOTE(review): presumably the column holds blank strings for some rows - confirm.
df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')

In [None]:
# Drop, in place, every row that has a missing value in any column.
df.dropna(axis=0, inplace=True)

In [None]:
# Re-check dtypes and non-null counts after the cleaning steps.
df.info()

In [None]:
# Pairwise correlation of the numeric columns. `df` still contains string
# columns at this point, and since pandas 2.0 corr() raises on them unless
# numeric_only=True is passed explicitly.
corr_matrix = df.corr(numeric_only=True)
# Correlation of each numeric feature with the target, strongest first.
corr_matrix["Churn Value"].sort_values(ascending=False).to_frame()

In [None]:
# Single-column heatmap of each feature's correlation with 'Churn Value',
# ordered from the strongest positive correlation downwards.
churn_corr = corr_matrix["Churn Value"].sort_values(ascending=False).to_frame()
plt.figure(figsize=(5, 5))
sns.set(font_scale=1)
sns.heatmap(
    churn_corr,
    annot=True,
    linewidths=0.4,
    linecolor='black',
)

Chi-Square Test for Feature Selection

In [None]:
# Chi-squared score of every categorical feature against the churn target.
# The feature list is defined here because no earlier cell defines it (the
# only other definition is commented out). The target itself is left out:
# scoring 'Churn Value' against itself says nothing.
categorical_features = [
    'Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines',
    'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support',
    'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method'
]
# chi2 needs non-negative numeric input, so integer-code each column first;
# astype(str) keeps the encoder happy with mixed-type columns.
chi_cat_features = pd.DataFrame(
    {
        name: LabelEncoder().fit_transform(df[name].astype(str))
        for name in categorical_features
    },
    index=df.index,
)
chi_target = df.loc[:, 'Churn Value']
# k='all' keeps every feature; only the scores are of interest here.
best_features_chi = SelectKBest(score_func=chi2, k='all')
features_fit_chi = best_features_chi.fit(chi_cat_features, chi_target)
features_score_chi = pd.DataFrame(
    data=features_fit_chi.scores_,
    index=list(chi_cat_features.columns),
    columns=['Chi Squared Score'],
)
features_score_chi

In [None]:
# Heatmap of the chi-squared scores, highest-scoring feature at the top.
chi_ranking = features_score_chi.sort_values(by='Chi Squared Score', ascending=False)
plt.figure(figsize=(10, 10))
sns.heatmap(chi_ranking, annot=True, fmt='.2f')
# Title typo fixed ("Ctaegorical" -> "Categorical").
plt.title("Chi Square Test for Categorical Feature Selection")
plt.show()

In [None]:
# Replace the categorical columns of the modelling frame with indicator
# columns; drop_first=True omits the first level of every feature.
dummy_cat_features = [
    'Gender',
    'Senior Citizen',
    'Partner',
    'Dependents',
    'Phone Service',
    'Multiple Lines',
    'Internet Service',
    'Online Security',
    'Online Backup',
    'Device Protection',
    'Tech Support',
    'Streaming TV',
    'Streaming Movies',
    'Contract',
    'Paperless Billing',
    'Payment Method',
]
df = pd.get_dummies(data=df, columns=dummy_cat_features, drop_first=True)
df.head()

In [None]:
# Inspect the modelling frame after the indicator columns were added.
df.info()

### Data Preparation

In [None]:
# Separate the target from the features, then hold out 20% of the rows for
# testing with a fixed seed so the split is reproducible.
target_column = 'Churn Value'
Y = df[target_column]
X = df.drop(columns=[target_column])

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Print the feature/target shapes of the training split, then the test split.
for split_X, split_Y in ((X_train, Y_train), (X_test, Y_test)):
    print(split_X.shape, split_Y.shape)

## Model Training

### Model Preparation

In [None]:
# Candidate classifiers keyed by display name; each one is constructed with
# library defaults unless arguments are given.
models = {
    'Logistic Regression' : LogisticRegression(),
    'Gaussian Naive Bayes' : GaussianNB(),
    'K Nearest Neighbors' : KNeighborsClassifier(),
    # probability=True enables predict_proba on the SVC.
    'Support Vector Machine' : SVC(probability=True),
    'Decision Tree Classifier' : DecisionTreeClassifier(),
    'Random Forest Classifier' : RandomForestClassifier(),
    'Bagging Classifier' : BaggingClassifier(
        # `estimator` replaces `base_estimator` (renamed in scikit-learn 1.2,
        # removed in 1.4) and matches the standalone Bagging cell of this notebook.
        estimator=RandomForestClassifier(),
        n_estimators=10
    ),
    'Gradient Boosting Classifier' : GradientBoostingClassifier(),
    'AdaBoost' : AdaBoostClassifier(
        # Same rename as above: `base_estimator` -> `estimator`.
        estimator=DecisionTreeClassifier(),
        n_estimators=50
    ),
    'Stacking Classifier' : StackingClassifier(
        estimators=[
            ('log_reg', LogisticRegression()),
            ('random_forest', RandomForestClassifier()),
            ('grad_boost', GradientBoostingClassifier())
        ]
    ),
    'Voting Classifier' : VotingClassifier(
        estimators=[
            ('log_reg', LogisticRegression()),
            ('random_forest', RandomForestClassifier()),
            ('grad_boost', GradientBoostingClassifier())
        ]
    ),
    'XgBoost' : XGBClassifier(),
    'LightGBM' : LGBMClassifier(),
    'Catboost' : CatBoostClassifier()
    }
# Parallel lists of test-set metrics; the individual model cells append one
# value to each list per fitted model.
model_accs = []
model_precs = []
model_recalls = []
model_f1s = []


# Models used for Training
'''
    1) RandomForestClassifier(n_estimators=100, class_weight={0:1,1:3})
    2) XGBClassifier()
    3) LGBMClassifier(learning_rate=0.09,max_depth=-5,scale_pos_weight =3,
                    random_state=42, objective = 'binary')
    4) Bagging Classifier
'''

### Training

In [None]:
import warnings
# Suppress every warning from here on (no category or module filter is given).
# NOTE(review): this also hides deprecation and convergence warnings from the
# model fits.
warnings.filterwarnings("ignore")

In [None]:
# Fit every candidate on the training split, then print its test accuracy and
# the per-class classification report.
for model_name, model in models.items():
    fitted = model.fit(X_train, Y_train)
    Y_pred = fitted.predict(X_test)
    test_accuracy = accuracy_score(Y_test, Y_pred)
    report = classification_report(Y_test, Y_pred)

    print(f"Model : {model_name}")
    print(f"Accuracy : {test_accuracy}")
    print(report)
    print()

In [None]:
# Hyper-parameter search grids, keyed by a short model name.
# NOTE(review): several keys differ from the keys of `models`
# (e.g. 'Random Forest' vs 'Random Forest Classifier') - align before use.
param_grids = {
    'Logistic Regression': {
        'C': [0.1, 1, 10],
        'max_iter': [1000, 10000],
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 5, 10],
    },
    'Gradient Boosting': {
        'learning_rate': [0.1, 0.01],
        'n_estimators': [100, 200, 300],
    },
    'Support Vector Machine': {
        'C': [0.1, 1, 10],
        'gamma': [0.1, 0.01],
    },
    # No grid is defined for the stacking ensemble.
    'Stacking': {},
    'Bagging': {
        'n_estimators': [10, 20, 30],
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 150],
    },
    # No grid is defined for the voting ensemble.
    'Voting': {},
}

#### Random Forest Classifier

In [None]:
# Random forest with churners (class 1) weighted three times as heavily as
# non-churners.
random_forest_clf = RandomForestClassifier(n_estimators=100, class_weight={0:1,1:3})
random_forest_clf.fit(X_train, Y_train)
Y_pred = random_forest_clf.predict(X_test)

# `models` starts out as the dict of candidate estimators, which has no
# .append(). From here on it is the list of model names that the summary table
# pairs with the metric lists, so switch it over on first use.
if not isinstance(models, list):
    models = []
models.append("Random Forest Classifier")
model_accs.append(accuracy_score(Y_test, Y_pred))
model_precs.append(precision_score(Y_test, Y_pred))
model_recalls.append(recall_score(Y_test, Y_pred))
model_f1s.append(f1_score(Y_test, Y_pred))

In [None]:
# Headline test-set metrics for the current predictions, followed by the
# per-class classification report.
test_accuracy = accuracy_score(Y_test, Y_pred)
test_precision = precision_score(Y_test, Y_pred)
test_recall = recall_score(Y_test, Y_pred)
test_f1 = f1_score(Y_test, Y_pred)

print(f"Accuracy score : {test_accuracy}")
print(f"Precision score : {test_precision}")
print(f"Recall score : {test_recall}")
print(f"F1 score : {test_f1}")
print(' ')
print(classification_report(Y_test, Y_pred))

In [None]:
# Confusion matrix of the current predictions with readable class labels.
class_names = ['No Churn', 'Churn']
cm = confusion_matrix(Y_test, Y_pred)
cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
cm_disp.plot()
# Turn the grid lines off so they do not cut through the matrix cells.
plt.grid(False)
plt.show()

#### XGBoost

In [None]:
# XGBoost with default settings.
xgboost = XGBClassifier()
xgboost.fit(X_train, Y_train)
Y_pred = xgboost.predict(X_test)

# `models` starts out as the dict of candidate estimators, which has no
# .append(). From here on it is the list of model names that the summary table
# pairs with the metric lists, so switch it over on first use.
if not isinstance(models, list):
    models = []
models.append("XGBoost")
model_accs.append(accuracy_score(Y_test, Y_pred))
model_precs.append(precision_score(Y_test, Y_pred))
model_recalls.append(recall_score(Y_test, Y_pred))
model_f1s.append(f1_score(Y_test, Y_pred))

In [None]:
# Headline test-set metrics for the current predictions, followed by the
# per-class classification report.
test_accuracy = accuracy_score(Y_test, Y_pred)
test_precision = precision_score(Y_test, Y_pred)
test_recall = recall_score(Y_test, Y_pred)
test_f1 = f1_score(Y_test, Y_pred)

print(f"Accuracy score : {test_accuracy}")
print(f"Precision score : {test_precision}")
print(f"Recall score : {test_recall}")
print(f"F1 score : {test_f1}")
print(' ')
print(classification_report(Y_test, Y_pred))

In [None]:
# Confusion matrix of the current predictions with readable class labels.

cm = confusion_matrix(Y_test, Y_pred)
cm_disp = ConfusionMatrixDisplay(cm, display_labels=['No Churn', 'Churn'])
cm_disp.plot()
# plt.axis(False) switched the whole axis off, including the class tick labels
# and axis titles; only the grid should go, as in the other confusion-matrix cells.
plt.grid(False)
plt.show()

#### Light GBM

In [None]:
# LightGBM with the positive (churn) class weighted 3x via scale_pos_weight.
# NOTE(review): max_depth=-5 looks unusual; LightGBM documents values <= 0 as
# "no limit", so it presumably behaves like the default -1 - confirm the intent.
lgbm = LGBMClassifier(learning_rate=0.09,max_depth=-5,scale_pos_weight =3, random_state=42, objective = 'binary')
lgbm.fit(X_train, Y_train)
Y_pred = lgbm.predict(X_test)

# `models` starts out as the dict of candidate estimators, which has no
# .append(). From here on it is the list of model names that the summary table
# pairs with the metric lists, so switch it over on first use.
if not isinstance(models, list):
    models = []
models.append("Light GBM")
model_accs.append(accuracy_score(Y_test, Y_pred))
model_precs.append(precision_score(Y_test, Y_pred))
model_recalls.append(recall_score(Y_test, Y_pred))
model_f1s.append(f1_score(Y_test, Y_pred))

In [None]:
# Headline test-set metrics for the current predictions, followed by the
# per-class classification report.
test_accuracy = accuracy_score(Y_test, Y_pred)
test_precision = precision_score(Y_test, Y_pred)
test_recall = recall_score(Y_test, Y_pred)
test_f1 = f1_score(Y_test, Y_pred)

print(f"Accuracy score : {test_accuracy}")
print(f"Precision score : {test_precision}")
print(f"Recall score : {test_recall}")
print(f"F1 score : {test_f1}")
print(' ')
print(classification_report(Y_test, Y_pred))

In [None]:
# Confusion matrix of the current predictions with readable class labels.
class_names = ['No Churn', 'Churn']
cm = confusion_matrix(Y_test, Y_pred)
cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
cm_disp.plot()
# Turn the grid lines off so they do not cut through the matrix cells.
plt.grid(False)
plt.show()

#### Bagging Classifier

In [None]:
# Bagging ensemble of 100 decision trees with a fixed seed.
bagging_clf = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=123)
bagging_clf.fit(X_train, Y_train)
Y_pred = bagging_clf.predict(X_test)

# `models` starts out as the dict of candidate estimators, which has no
# .append(). From here on it is the list of model names that the summary table
# pairs with the metric lists, so switch it over on first use.
if not isinstance(models, list):
    models = []
models.append("Bagging Classifier")
model_accs.append(accuracy_score(Y_test, Y_pred))
model_precs.append(precision_score(Y_test, Y_pred))
model_recalls.append(recall_score(Y_test, Y_pred))
model_f1s.append(f1_score(Y_test, Y_pred))

In [None]:
# Headline test-set metrics for the current predictions, followed by the
# per-class classification report.
test_accuracy = accuracy_score(Y_test, Y_pred)
test_precision = precision_score(Y_test, Y_pred)
test_recall = recall_score(Y_test, Y_pred)
test_f1 = f1_score(Y_test, Y_pred)

print(f"Accuracy score : {test_accuracy}")
print(f"Precision score : {test_precision}")
print(f"Recall score : {test_recall}")
print(f"F1 score : {test_f1}")
print(' ')
print(classification_report(Y_test, Y_pred))

In [None]:
# Confusion matrix of the current predictions with readable class labels.
class_names = ['No Churn', 'Churn']
cm = confusion_matrix(Y_test, Y_pred)
cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
cm_disp.plot()
# Turn the grid lines off so they do not cut through the matrix cells.
plt.grid(False)
plt.show()

### Evaluating Model Performances

In [None]:
# Summary table: one row per individually trained model, built from the
# parallel name/metric lists filled in by the model cells.
model_dict = {
    'Model': models,
    'Precision': model_precs,
    # Column label typo fixed ('Acuuracy' -> 'Accuracy').
    'Accuracy': model_accs,
    'Recall': model_recalls,
    'F1': model_f1s
}

models_df = pd.DataFrame(model_dict)

In [None]:
# Display the model comparison table.
models_df