In [16]:
import pandas as pd
import os
import reportlab
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

In [2]:
# Load the raw Telco churn workbook; the path is relative to the notebook's working directory.
df = pd.read_excel(io="../artifacts/data/Telco_customer_churn.xlsx")

In [3]:
# Categorical columns that are later expanded into indicator columns
# by pd.get_dummies.
dummy_cat_features = [
    'Gender',
    'Senior Citizen',
    'Partner',
    'Dependents',
    'Phone Service',
    'Multiple Lines',
    'Internet Service',
    'Online Security',
    'Online Backup',
    'Device Protection',
    'Tech Support',
    'Streaming TV',
    'Streaming Movies',
    'Contract',
    'Paperless Billing',
    'Payment Method',
]

# Columns that are label-encoded: every categorical column above plus the
# 'Churn Value' column, which is label-encoded but never one-hot expanded.
categorical_features = [*dummy_cat_features, 'Churn Value']

# Columns removed from the frame before any encoding takes place.
drop_features = [
    'CustomerID',
    'Lat Long',
    'Churn Reason',
    'Country',
    'State',
    'City',
    'Zip Code',
    'Churn Label',
    'Count',
]

In [5]:
# Clean the frame and integer-encode its categorical columns.
#
# `df` is rebound in this cell, so the cell must tolerate being executed more
# than once: on a second run the columns in `drop_features` are already gone,
# and a strict drop would raise KeyError. errors='ignore' makes the drop a
# no-op in that case; the remaining steps give the same result when repeated.
df = df.drop(columns=drop_features, errors='ignore')

# 'Total Charges' is converted to numeric; values that cannot be parsed
# become NaN (errors='coerce') and are removed by the dropna below.
df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')

# Drop every row that contains at least one null value.
df = df.dropna(axis=0)

# Replace each categorical column with integer codes. The same encoder
# instance is refitted per column, so after the loop `label_enc` only holds
# the classes of the last column in `categorical_features`.
label_enc = LabelEncoder()
for col in categorical_features:
    df[col] = label_enc.fit_transform(df[col])

In [6]:
# Expand the (already label-encoded) categorical columns into indicator
# columns; drop_first=True omits the first level of each column.
data_new_f = pd.get_dummies(
    data=df,
    columns=dummy_cat_features,
    drop_first=True,
)

In [8]:
# Preview the first five rows of the encoded frame.
data_new_f.head(n=5)

Unnamed: 0,Latitude,Longitude,Tenure Months,Monthly Charges,Total Charges,Churn Value,Churn Score,CLTV,Gender_1,Senior Citizen_1,...,Streaming TV_1,Streaming TV_2,Streaming Movies_1,Streaming Movies_2,Contract_1,Contract_2,Paperless Billing_1,Payment Method_1,Payment Method_2,Payment Method_3
0,33.964131,-118.272783,2,53.85,108.15,1,86,3239,True,False,...,False,False,False,False,False,False,True,False,False,True
1,34.059281,-118.30742,2,70.7,151.65,1,67,2701,False,False,...,False,False,False,False,False,False,True,False,True,False
2,34.048013,-118.293953,8,99.65,820.5,1,86,5372,False,False,...,False,True,False,True,False,False,True,False,True,False
3,34.062125,-118.315709,28,104.8,3046.05,1,84,5003,False,False,...,False,True,False,True,False,False,True,False,True,False
4,34.039224,-118.266293,49,103.7,5036.3,1,89,5340,True,False,...,False,True,False,True,False,False,True,False,False,False


In [10]:
# Separate the target ('Churn Value') from the predictors, then hold out 20%
# of the rows for validation with a fixed seed so the split is repeatable.
# NOTE(review): 'Churn Score' and 'CLTV' are not in `drop_features`, so they
# stay in X as predictors -- confirm they are legitimately available at
# prediction time and are not derived from the churn outcome.
Y = data_new_f['Churn Value']
X = data_new_f.drop(columns=['Churn Value'])
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Estimators to tune, keyed by a display name.
models = {
    'Logistic Regression': LogisticRegression(),
}

# Hyper-parameter grids, keyed by the same names as `models`.
#
# Only 'l2' is searched for the penalty: LogisticRegression() uses the default
# 'lbfgs' solver, which rejects penalty='elasticnet' with a ValueError. Inside
# GridSearchCV those candidates fail on every fold, are scored NaN and can
# never be selected, so listing them only wastes fits and emits warnings.
# To really search elastic-net, add a separate grid with solver='saga' and an
# 'l1_ratio' list.
params = {
    'Logistic Regression': {
        'penalty': ['l2'],
        'C': [0.1, 1, 10],
        'max_iter': [100, 1000, 10000],
    },
}

In [17]:
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import letter
from sklearn.model_selection import GridSearchCV
from reportlab.pdfgen import canvas

In [27]:
# Tune every configured model with 5-fold grid search, evaluate the best
# estimator on the validation split and keep the metrics for later use.
#
# NOTE(review): this directory is created relative to the current working
# directory, while the input data is read from "../artifacts/..." -- confirm
# which location is intended for the performance artifacts.
os.makedirs(os.path.join("artifacts", "performance"), exist_ok=True)

# Per-model results, keyed by the names used in `models`:
#   classification_reports -> dict form of sklearn's classification_report
#   accuracies             -> accuracy on the validation split
classification_reports = {}
accuracies = {}

for model_name, model in models.items():
    # Models without a hyper-parameter grid are skipped.
    if model_name not in params:
        continue

    hyper_parameters = params[model_name]
    grid_search = GridSearchCV(model, hyper_parameters, cv=5)
    grid_search.fit(X_train, Y_train)

    # best_estimator_ is the candidate GridSearchCV selected from the grid.
    best_model = grid_search.best_estimator_
    Y_pred = best_model.predict(X_val)

    # Store the metrics instead of discarding them after the loop iteration.
    accuracy = accuracy_score(Y_val, Y_pred)
    accuracies[model_name] = accuracy

    class_report = classification_report(Y_val, Y_pred, output_dict=True)
    classification_reports[model_name] = class_report

    # Transpose so that each class / average is a row and each metric a column.
    class_report_df = pd.DataFrame(class_report).transpose()
    print(class_report_df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

              precision    recall  f1-score      support
0              0.937255  0.944664  0.940945  1012.000000
1              0.855297  0.837975  0.846547   395.000000
accuracy       0.914712  0.914712  0.914712     0.914712
macro avg      0.896276  0.891319  0.893746  1407.000000
weighted avg   0.914246  0.914712  0.914444  1407.000000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
import warnings

# Re-run the grid search and print the plain-text classification report
# without the warnings emitted during fitting.
#
# Warnings are silenced only inside the `with` block: catch_warnings()
# restores the previous filters on exit, so later cells still surface
# warnings instead of having them disabled for the whole kernel session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for model_name, model in models.items():
        # Models without a hyper-parameter grid are skipped.
        if model_name not in params:
            continue

        hyper_parameters = params[model_name]
        grid_search = GridSearchCV(model, hyper_parameters, cv=5)
        grid_search.fit(X_train, Y_train)

        # Evaluate the selected candidate on the held-out validation split.
        best_model = grid_search.best_estimator_
        Y_pred = best_model.predict(X_val)
        accuracy = accuracy_score(Y_val, Y_pred)

        # String form of the report (no output_dict), printed as-is.
        class_report = classification_report(Y_val, Y_pred)
        print(class_report)

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      1012
           1       0.86      0.84      0.85       395

    accuracy                           0.91      1407
   macro avg       0.90      0.89      0.89      1407
weighted avg       0.91      0.91      0.91      1407

