# Telco Customer Churn Prediction

## Configuration and Setup

In [None]:
# Importing libs
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (
    GradientBoostingClassifier,
    AdaBoostClassifier,
    StackingClassifier,
    VotingClassifier
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
    f1_score,
    ConfusionMatrixDisplay
)
from sklearn_evaluation import plot
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import Gradio

## Get the data

In [None]:
# Load every workbook in the data directory into a module-level DataFrame
# named "<file stem>_df"; later cells refer to those generated names directly.
data_path = "../artifacts/data"

# os.listdir() also returns hidden files and Excel lock files ("~$..."),
# which pd.read_excel cannot parse, so they are filtered out. Sorting makes
# the load order deterministic.
files = sorted(
    file for file in os.listdir(data_path)
    if not file.startswith((".", "~$"))
)
for file in files:
    file_name = os.path.splitext(file)[0]
    globals()[f'{file_name}_df'] = pd.read_excel(os.path.join(data_path, file))

# Convenience list of the loaded frames.
# NOTE(review): the list used to contain Telco_customer_churn_services_df
# twice; the duplicate was presumably meant to be a different workbook -
# confirm which one and add it here.
df_s = [
    Telco_customer_churn_df,
    Telco_customer_churn_demographics_df,
    Telco_customer_churn_location_df,
    Telco_customer_churn_population_df,
    Telco_customer_churn_services_df,
    CustomerChurn_df,
]

## Data Analysis and EDA/Visualization

In [None]:
# Preview the first rows of the main churn table.
Telco_customer_churn_df.head()

In [None]:
# Print column names, dtypes and non-null counts of the main churn table.
Telco_customer_churn_df.info()

In [None]:
# List the column labels of the main churn table.
Telco_customer_churn_df.columns

In [None]:
# Investigating all categorical values in the dataset.
# A column counts as categorical when it has between 2 and 9 distinct
# non-null values; constant columns and columns without any non-null value
# are left out.
categorical_data = []
for column in Telco_customer_churn_df:
    # Series.value_counts() replaces the deprecated top-level
    # pd.value_counts(); its index holds the distinct non-null values
    # ordered by descending frequency.
    unique_values = Telco_customer_churn_df[column].value_counts().index.to_list()
    value_counts_len = len(unique_values)

    if 1 < value_counts_len < 10:
        categorical_data.append(column)
        print(f"Unique values for column '{column}' : {unique_values} having len {value_counts_len}")

In [None]:
# Display the columns of the main churn table that were flagged as categorical.
# TODO: the other loaded tables (demographics, location, etc.) have not been
# examined yet and are still to be dealt with.
categorical_data

#### Categorical Data

In [None]:
# Custom plot: one count plot per categorical column, split by churn outcome.
# The grid is sized from the number of columns to draw, so a longer list can
# no longer run past the last axis and a shorter one leaves no empty frames.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(categorical_data) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows,
    n_grid_cols,
    figsize=(15, 40 / 9 * n_grid_rows),  # same height per row as the former 9-row, 40-inch figure
    squeeze=False,  # always a 2-D array, even for a single row
)
axes = axes.flatten()
for ax, feature in zip(axes, categorical_data):
    sns.countplot(x=feature, data=Telco_customer_churn_df, palette='Set2', ax=ax, hue='Churn Value')

# Hide the grid cells that did not receive a plot.
for ax in axes[len(categorical_data):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

#### Continuous Data

In [None]:
# Histograms of three numeric columns of the main churn table, one per row,
# with the bars coloured by 'Churn Value'.
continous_data = ['Monthly Charges', 'Churn Score', 'CLTV']
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 15))
axes = axes.flatten()
for axis, column in zip(axes, continous_data):
    sns.histplot(
        data=Telco_customer_churn_df,
        x=column,
        hue='Churn Value',
        palette='Paired',
        ax=axis,
    )
plt.tight_layout()
plt.show()

In [None]:
# Histograms of three numeric columns of the services table, one per row.
continous_data = ['Tenure in Months', 'Total Revenue', 'Total Charges']
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 15))
axes = axes.flatten()
for axis, column in zip(axes, continous_data):
    sns.histplot(data=Telco_customer_churn_services_df, x=column, ax=axis)
plt.tight_layout()
plt.show()

#### Number of Dependents

In [None]:
# Violin plot of the 'Number of Dependents' column of the demographics table.
plt.figure(figsize=(8,10))
# The frame is passed as data= explicitly: older seaborn releases interpret
# the first positional argument as x rather than data.
sns.violinplot(data=Telco_customer_churn_demographics_df, y='Number of Dependents', color='m')
# The variable is drawn on the y axis; the violin's width shows its estimated
# density, so the axes are labelled accordingly (not as a count).
plt.xlabel("Estimated density")
plt.ylabel("Number of Dependents")
plt.title("Distribution of Number of Dependents")
plt.show()

In [None]:
# Geographic scatter of the rows of the main churn table on an
# OpenStreetMap base layer, coloured by 'Churn Score'.
map_options = dict(
    lat='Latitude',
    lon='Longitude',
    color='Churn Score',
    hover_name='Churn Value',
    zoom=5,
    height=800,
    width=800,
)
fig = px.scatter_mapbox(Telco_customer_churn_df, **map_options).update_layout(
    mapbox_style="open-street-map"
)
fig.show()

In [None]:
# Reason for churn -> most important to analyze.
# The enlarged font scale is applied through a context manager so that it is
# in effect while this figure is drawn and does not leak into later plots.
# (Calling sns.set(font_scale=5) after plotting only affected subsequent
# figures.)
with sns.plotting_context("notebook", font_scale=5):
    plt.figure(figsize=(25,40))
    sns.countplot(y='Churn Reason', data=Telco_customer_churn_df)
    plt.title("Churn Reasons")
    plt.show()

## Feature Engineering

In [None]:
# Build the modelling frame: a new DataFrame without the listed columns.
# drop() without inplace returns a new object, so the loaded table itself is
# left untouched.
columns_to_remove = [
    'CustomerID', 'Lat Long', 'Churn Reason', 'Country', 'State', 'City',
    'Zip Code', 'Churn Label', 'Count', 'Churn Score',
]
df = Telco_customer_churn_df.drop(columns=columns_to_remove)

In [None]:
# One-hot encode the categorical columns and swap them into df.
dummy_cat_features = [
    'Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines',
    'Internet Service','Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 
    'Streaming TV', 'Streaming Movies', 'Contract','Paperless Billing', 'Payment Method'
]
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the sparse=False keyword was renamed to sparse_output in newer
# scikit-learn releases, whereas this form works on both.
ohe = OneHotEncoder()
encoded_mat = ohe.fit_transform(df[dummy_cat_features]).toarray()
df_new = pd.DataFrame(
    encoded_mat,
    columns=ohe.get_feature_names_out(dummy_cat_features),
    index=df.index,  # keep row labels aligned with df for the concat below
)
df_new.info()

# Replace the raw string columns with their encoded counterparts; without
# this step the encoded features are never used and the estimators receive
# string columns they cannot fit on.
# Re-create df (previous cell) before re-running this cell, because the raw
# columns are gone afterwards.
df = pd.concat([df.drop(columns=dummy_cat_features), df_new], axis=1)

### Correlation Heatmap (Matrix)

In [None]:
# Inspect dtypes and non-null counts of the modelling frame before cleaning.
df.info()

In [None]:
# Convert 'Total Charges' to a numeric dtype; with errors='coerce', entries
# that cannot be parsed as numbers become NaN instead of raising.
df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')

In [None]:
# Remove, in place, every row that has a missing value in any column
# (this includes rows where the numeric coercion produced NaN).
df.dropna(axis=0, inplace=True)

In [None]:
# Re-inspect dtypes and non-null counts after the conversion and row removal.
df.info()

In [None]:
# Pairwise correlations of the numeric columns. Non-numeric columns are
# excluded explicitly: DataFrame.corr() raises on string columns in pandas 2.x,
# while older releases dropped them silently, so select_dtypes() gives the
# same result on both.
corr_matrix = df.select_dtypes(include="number").corr()
# Correlation of every numeric column with the target, strongest positive first.
corr_matrix["Churn Value"].sort_values(ascending=False).to_frame()

In [None]:
# Single-column heatmap of each column's correlation with 'Churn Value',
# sorted from highest to lowest.
plt.figure(figsize=(5, 5))
sns.set(font_scale=1)
churn_correlations = corr_matrix["Churn Value"].sort_values(ascending=False).to_frame()
sns.heatmap(
    churn_correlations,
    annot=True,
    linewidths=0.4,
    linecolor='black',
)

### Data Preparation

In [None]:
# Features are every column except the target; drop() returns a new frame,
# so X is independent of df.
X = df.drop(columns=['Churn Value'])
Y = df['Churn Value']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
# Sanity check: feature and target shapes of the training split ...
print(X_train.shape, Y_train.shape)
# ... and of the held-out test split.
print(X_test.shape, Y_test.shape)

## Model Training

### Model Preparation

In [None]:
# Candidate classifiers, keyed by the display name that is also used to look
# up each model's hyperparameter grid in param_grid.
models = {
    'Logistic Regression' : LogisticRegression(),
    'Gaussian Naive Bayes' : GaussianNB(),
    'K Nearest Neighbors' : KNeighborsClassifier(),
    'Support Vector Machine' : SVC(probability=True),
    'Decision Tree Classifier' : DecisionTreeClassifier(),
    'Random Forest Classifier' : RandomForestClassifier(),
    # The wrapped estimator is passed positionally: its keyword was renamed
    # from base_estimator to estimator in newer scikit-learn releases, and
    # the positional form works with both names.
    'Bagging Classifier' : BaggingClassifier(
        RandomForestClassifier(),
        n_estimators=10
    ),
    'Gradient Boosting Classifier' : GradientBoostingClassifier(),
    'AdaBoost' : AdaBoostClassifier(
        DecisionTreeClassifier(),
        n_estimators=50
    ),
    'Stacking Classifier' : StackingClassifier(
        estimators=[
            ('log_reg', LogisticRegression()),
            ('random_forest', RandomForestClassifier()),
            ('grad_boost', GradientBoostingClassifier())
        ]
    ),
    'Voting Classifier' : VotingClassifier(
        estimators=[
            ('log_reg', LogisticRegression()),
            ('random_forest', RandomForestClassifier()),
            ('grad_boost', GradientBoostingClassifier())
        ]
    ),
    'XgBoost' : XGBClassifier(),
    'LightGBM' : LGBMClassifier(
        scale_pos_weight =3,
        random_state=42,
        objective = 'binary'
    ),
    # Spelled 'CatBoost' to match the key in param_grid; with the former
    # 'Catboost' spelling the lookup failed and the model was never trained.
    'CatBoost' : CatBoostClassifier()
    }

# Per-model test-set scores, appended by the training loop in the order in
# which the models are evaluated.
model_accs = []
model_precs = []
model_recalls = []
model_f1s = []


# Models used for Training

In [None]:
# Hyperparameter grids searched by GridSearchCV, keyed by the same display
# names as the models dict. An empty grid means the model is fitted with the
# settings it was constructed with.
param_grid = {
    'Logistic Regression' : {
        # 'elasticnet' was removed from this list: it is only supported by
        # solver='saga' together with an l1_ratio, so with the default solver
        # every such candidate failed to fit and wasted half of the search.
        'penalty' : ['l2'],
        'C' : [0.1, 1, 10],
        'max_iter' : [100, 1000, 10000]
    },
    'Gaussian Naive Bayes' : {}, # Nothing tuned
    'K Nearest Neighbors' : {
        'n_neighbors' : [1, 5, 10],
        'weights' : ['uniform', 'distance']
    },
    'Support Vector Machine' : {
        'C' : [0.1, 1],
        'kernel' : ['rbf'],
        'gamma' : [0.1, 0.01]
    },
    'Decision Tree Classifier' : {
        'criterion' : ['gini', 'log_loss'],
        'max_depth' : [None, 10, 100]
    },
    'Random Forest Classifier' : {
        'criterion' : ['gini', 'log_loss'],
        'max_depth' : [None, 10, 100],
        'n_estimators' : [100, 200, 300]
    },
    'Bagging Classifier' : {
        'n_estimators' : [10, 20, 30, 100]
    },
    'Gradient Boosting Classifier' : {
        'learning_rate' : [0.1, 1],
        'n_estimators' : [100, 200, 300]
    },
    'AdaBoost' : {
        'n_estimators' : [50, 100, 200, 300]
    },
    'Stacking Classifier' : {}, # Nothing tuned
    'Voting Classifier' : {}, # Nothing tuned
    'XgBoost' : {},
    'LightGBM' : {
        'learning_rate' : [0.1, 0.01],
        # LightGBM treats every max_depth <= 0 as "no limit", so the former
        # values -5/-10/-20 were three copies of the same setting. -1 keeps
        # the unlimited option; 5 and 10 add real depth limits.
        'max_depth' : [-1, 5, 10]
    },
    'CatBoost' : {
        'depth' : [6, 8, 10],
        'learning_rate' : [0.01, 0.05, 0.1],
        'iterations' : [30, 50, 100]
    }

}

### Training and Evaluation

In [None]:
import warnings
# Suppress every warning category from here on, so that the output of the
# training cells stays readable.
# NOTE(review): this also hides warnings about failed or non-converged fits.
warnings.filterwarnings("ignore")

In [None]:
# Tune each model with a 5-fold grid search on the training split, then score
# the best estimator on the held-out test split.

# Empty the score lists in place so that re-running this cell does not append
# a second set of results to the first one.
for score_list in (model_accs, model_precs, model_recalls, model_f1s):
    score_list.clear()

for model_name, model in models.items():
    if model_name not in param_grid:
        # Models without a grid entry are not trained; say so instead of
        # skipping them silently.
        print(f"Skipping '{model_name}': no entry in param_grid")
        continue

    grid_search = GridSearchCV(model, param_grid[model_name], cv=5)
    grid_search.fit(X_train, Y_train)
    best_model = grid_search.best_estimator_
    Y_pred = best_model.predict(X_test)

    print(f"----- {model_name} -----")
    print(classification_report(Y_test, Y_pred))

    # Appending the lists with scores. The accuracy entry is the computed
    # value; previously the accuracy_score function object was appended.
    model_accs.append(accuracy_score(Y_test, Y_pred))
    model_precs.append(precision_score(Y_test, Y_pred))
    model_recalls.append(recall_score(Y_test, Y_pred))
    model_f1s.append(f1_score(Y_test, Y_pred))

    # confusion matrix could also be considered as done in the final project...

In [None]:
# Collect the performance metrics into one table, one row per trained model.

# Names of the models that were trained: the training loop only evaluates
# models that have an entry in param_grid, in the order of the models dict.
# (Passing the models dict itself as a column cannot be combined with the
# plain score lists.)
evaluated_models = [name for name in models if name in param_grid]

model_dict = {
    'Model': evaluated_models,
    'Precision': model_precs,
    'Accuracy': model_accs,
    'Recall': model_recalls,
    'F1': model_f1s
}

models_df = pd.DataFrame(model_dict)

In [None]:
# Display the table of per-model metrics.
models_df

In [None]:
# The package is imported here under its real, lower-case name so that the
# `gradio` name used below is bound.
import gradio


def outputdeploy(input):  # parameter name kept for existing callers; it shadows the builtin
    """Return the deployed model's predictions for *input*.

    *input* is passed unchanged to ``model_var.predict``; the predictions are
    returned so that the interface has something to display (previously the
    result was computed and discarded).
    """
    # NOTE(review): model_var is not defined in the visible part of this file;
    # it is presumably the selected fitted estimator - confirm before deploying.
    output = model_var.predict(input)
    return output


# Interface needs input and output components; the builtin input() function
# was passed before, and no output was given.
# NOTE(review): "dataframe" / "text" are assumed component choices - the input
# must carry the same columns the model was trained on; adjust as needed.
demo = gradio.Interface(fn=outputdeploy, inputs="dataframe", outputs="text")
demo.launch()