In [1]:
import warnings
warnings.simplefilter('ignore')

import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix

### Import the csv Data as Pnadas dataframe

In [2]:
df = pd.read_csv("data/EDA.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,...,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003,Moved
4,4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,...,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340,Competitor had better devices


### Prepare the X and Y variables

In [3]:
x = df.drop(columns=['Churn Value','Churn Reason','State','Count','CustomerID','Churn Label', 'Unnamed: 0', 'Country', 'Lat Long', 'Zip Code', 'Latitude', 'Longitude'], axis=1)
y = df['Churn Value']
x.head()

Unnamed: 0,City,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,...,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Score,CLTV
0,Los Angeles,Male,No,No,No,2,Yes,No,DSL,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,86,3239
1,Los Angeles,Female,No,No,Yes,2,Yes,No,Fiber optic,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,67,2701
2,Los Angeles,Female,No,No,Yes,8,Yes,Yes,Fiber optic,No,...,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,86,5372
3,Los Angeles,Female,No,Yes,Yes,28,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,84,5003
4,Los Angeles,Male,No,No,Yes,49,Yes,Yes,Fiber optic,No,...,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,89,5340


In [4]:
# from imblearn.combine import SMOTEENN
# sme = SMOTEENN(random_state=42)
# x_res, y_res = sme.fit_resample(x, y)

### Create group of the Numerical Features and Categorical Feature 

In [5]:
num_features = x.select_dtypes(exclude="O").columns
cat_features = x.select_dtypes(include="O").columns

In [6]:
num_features

Index(['Tenure Months', 'Monthly Charges', 'Total Charges', 'Churn Score',
       'CLTV'],
      dtype='object')

In [7]:
cat_features

Index(['City', 'Gender', 'Senior Citizen', 'Partner', 'Dependents',
       'Phone Service', 'Multiple Lines', 'Internet Service',
       'Online Security', 'Online Backup', 'Device Protection', 'Tech Support',
       'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing',
       'Payment Method'],
      dtype='object')

In [8]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [9]:
for i in cat_features:
    print(f"Categories in '{i} variable : ", end=" ")
    print(df[i].unique())
    print("======================================================")

Categories in 'City variable :  ['Los Angeles' 'Beverly Hills' 'Huntington Park' ... 'Standish' 'Tulelake'
 'Olympic Valley']
Categories in 'Gender variable :  ['Male' 'Female']
Categories in 'Senior Citizen variable :  ['No' 'Yes']
Categories in 'Partner variable :  ['No' 'Yes']
Categories in 'Dependents variable :  ['No' 'Yes']
Categories in 'Phone Service variable :  ['Yes' 'No']
Categories in 'Multiple Lines variable :  ['No' 'Yes' 'No phone service']
Categories in 'Internet Service variable :  ['DSL' 'Fiber optic' 'No']
Categories in 'Online Security variable :  ['Yes' 'No' 'No internet service']
Categories in 'Online Backup variable :  ['Yes' 'No' 'No internet service']
Categories in 'Device Protection variable :  ['No' 'Yes' 'No internet service']
Categories in 'Tech Support variable :  ['No' 'Yes' 'No internet service']
Categories in 'Streaming TV variable :  ['No' 'Yes' 'No internet service']
Categories in 'Streaming Movies variable :  ['No' 'Yes' 'No internet service']
Catego

In [10]:
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(sparse=False, drop='first', dtype=np.int16)

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", cat_transformer, cat_features),
        ("StandardScaler", num_transformer, num_features)
    ]
)

In [11]:
x = preprocessor.fit_transform(x)

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((5634, 1160), (1409, 1160), (5634,), (1409,))

In [13]:
from imblearn.combine import SMOTEENN
sme = SMOTEENN(random_state=42)
x_res, y_res = sme.fit_resample(x_train, y_train)

### Create the Evaluate Model

In [14]:
def evaluate_model(true, predicted):
    print("accuracy_score")
    accuracy_score1 = accuracy_score(true, predicted)
    print("classification_report")
    classification_report1 = classification_report(true, predicted)
    print("confusion_matrix")
    confusion_matrix1 = confusion_matrix(true, predicted)
    return accuracy_score1, classification_report1, confusion_matrix1

In [15]:
models = {
    "LogisticRegression":LogisticRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "KNeighborsClassifier":KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
    "Random Forest Classifier":RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(),
    "CatBoostClassifier":CatBoostClassifier(verbose=False),
    "SVC": SVC(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "GaussianNB": GaussianNB(),

    
}

In [16]:
model_list = []
acuracy_score_list = []
class_report = []
matrics = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_res, y_res)

    y_pred = model.predict(x_test)

    model_accuracy, model_classification_report, model_confusion_matrix = evaluate_model(y_test, y_pred.round())
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Performance for Testing set")
    print("Model accuarcy : {}".format(model_accuracy))
    print("="*50)
    print("classification_report :")
    print(model_classification_report)
    print("="*50)
    print("Confusion Matrix")
    print(model_confusion_matrix)

    acuracy_score_list.append(model_accuracy)
    class_report.append(model_classification_report)
    matrics.append(model_confusion_matrix)
    print("*"*35)
    print()

accuracy_score
classification_report
confusion_matrix
LogisticRegression
Model Performance for Testing set
Model accuarcy : 0.8637331440738112
classification_report :
              precision    recall  f1-score   support

           0       0.99      0.82      0.90      1009
           1       0.68      0.97      0.80       400

    accuracy                           0.86      1409
   macro avg       0.83      0.90      0.85      1409
weighted avg       0.90      0.86      0.87      1409

Confusion Matrix
[[828 181]
 [ 11 389]]
***********************************

accuracy_score
classification_report
confusion_matrix
Lasso
Model Performance for Testing set
Model accuarcy : 0.28388928317955997
classification_report :
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1009
           1       0.28      1.00      0.44       400

    accuracy                           0.28      1409
   macro avg       0.14      0.50      0.22      1409
we

In [17]:
print("+++++++++++++++++++++++++++++++++++++")

+++++++++++++++++++++++++++++++++++++
