## Imports

In [None]:
# Basic
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv

# MlFlow
import mlflow
import mlflow.sklearn

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from skopt import BayesSearchCV

# Model Abstraction
import model_functions as ModelFunctions
from model_functions import PARAM_GRID_RF, PARAM_GRID_GB, PARAM_GRID_KNN

# Warnings
import warnings
warnings.filterwarnings("ignore")

# Path
_ = load_dotenv()
FULL_PATH = os.environ.get('FULL_PATH')
PATH = os.path.join(FULL_PATH, 'datasets')

## Datasets

In [44]:
path_raw = os.path.join(PATH, 'telecom_customer_churn.csv')
path_model = os.path.join(PATH, 'churn_model_dataset.csv')
path_model_filtered = os.path.join(PATH, 'churn_model_filtered.csv')

df_raw_churn = pd.read_csv(path_raw)
df_churn_model = pd.read_csv(path_model)
df_churn_model_filtered = pd.read_csv(path_model_filtered)

### Dataset Original

In [45]:
df_raw_churn.head()

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,...,Credit Card,65.6,593.3,0.0,0,381.51,974.81,Stayed,,
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,...,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,Stayed,,
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,...,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,...,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,...,Credit Card,83.9,267.4,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability


In [46]:
df_raw_churn.columns.tolist()

['Customer ID',
 'Gender',
 'Age',
 'Married',
 'Number of Dependents',
 'City',
 'Zip Code',
 'Latitude',
 'Longitude',
 'Number of Referrals',
 'Tenure in Months',
 'Offer',
 'Phone Service',
 'Avg Monthly Long Distance Charges',
 'Multiple Lines',
 'Internet Service',
 'Internet Type',
 'Avg Monthly GB Download',
 'Online Security',
 'Online Backup',
 'Device Protection Plan',
 'Premium Tech Support',
 'Streaming TV',
 'Streaming Movies',
 'Streaming Music',
 'Unlimited Data',
 'Contract',
 'Paperless Billing',
 'Payment Method',
 'Monthly Charge',
 'Total Charges',
 'Total Refunds',
 'Total Extra Data Charges',
 'Total Long Distance Charges',
 'Total Revenue',
 'Customer Status',
 'Churn Category',
 'Churn Reason']

In [47]:
df_raw_churn.shape

(7043, 38)

### Dataset Tratado

In [48]:
df_churn_model.head()

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,...,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status
0,0002-ORFBO,1,2,1,0,2,9,0,1,42.39,...,3,1,2,65.6,593.3,0.0,0,381.51,974.81,1
1,0003-MKNFE,0,3,0,0,0,9,0,1,10.69,...,1,0,2,-4.0,542.4,38.33,10,96.21,610.28,1
2,0004-TLHLJ,0,3,0,0,0,4,5,1,33.65,...,1,1,1,73.9,280.85,0.0,0,134.6,415.45,0
3,0011-IGKFF,0,5,1,0,1,13,4,1,27.82,...,1,1,1,98.0,1237.85,0.0,0,361.66,1599.51,0
4,0013-EXCHZ,1,4,1,0,3,3,0,1,7.38,...,1,1,2,83.9,267.4,0.0,0,22.14,289.54,0


In [49]:
df_churn_model.columns.tolist()

['Customer ID',
 'Gender',
 'Age',
 'Married',
 'Number of Dependents',
 'Number of Referrals',
 'Tenure in Months',
 'Offer',
 'Phone Service',
 'Avg Monthly Long Distance Charges',
 'Multiple Lines',
 'Internet Service',
 'Internet Type',
 'Avg Monthly GB Download',
 'Online Security',
 'Online Backup',
 'Device Protection Plan',
 'Premium Tech Support',
 'Streaming TV',
 'Streaming Movies',
 'Streaming Music',
 'Unlimited Data',
 'Contract',
 'Paperless Billing',
 'Payment Method',
 'Monthly Charge',
 'Total Charges',
 'Total Refunds',
 'Total Extra Data Charges',
 'Total Long Distance Charges',
 'Total Revenue',
 'Customer Status']

In [50]:
df_churn_model.shape

(7043, 32)

### Dataset Filtrado

In [51]:
df_churn_model_filtered.head()

Unnamed: 0,Customer ID,Age,Married,Number of Dependents,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Internet Service,...,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status
0,0002-ORFBO,2,1,0,2,9,0,1,42.39,1,...,3,1,2,65.6,593.3,0.0,0,381.51,974.81,1
1,0003-MKNFE,3,0,0,0,9,0,1,10.69,1,...,1,0,2,-4.0,542.4,38.33,10,96.21,610.28,1
2,0004-TLHLJ,3,0,0,0,4,5,1,33.65,1,...,1,1,1,73.9,280.85,0.0,0,134.6,415.45,0
3,0011-IGKFF,5,1,0,1,13,4,1,27.82,1,...,1,1,1,98.0,1237.85,0.0,0,361.66,1599.51,0
4,0013-EXCHZ,4,1,0,3,3,0,1,7.38,1,...,1,1,2,83.9,267.4,0.0,0,22.14,289.54,0


In [52]:
df_churn_model_filtered.columns.tolist()

['Customer ID',
 'Age',
 'Married',
 'Number of Dependents',
 'Number of Referrals',
 'Tenure in Months',
 'Offer',
 'Phone Service',
 'Avg Monthly Long Distance Charges',
 'Internet Service',
 'Internet Type',
 'Avg Monthly GB Download',
 'Online Security',
 'Premium Tech Support',
 'Unlimited Data',
 'Contract',
 'Paperless Billing',
 'Payment Method',
 'Monthly Charge',
 'Total Charges',
 'Total Refunds',
 'Total Extra Data Charges',
 'Total Long Distance Charges',
 'Total Revenue',
 'Customer Status']

In [53]:
df_churn_model_filtered.shape

(7043, 25)

## Modelos


### Dataset Tratado

In [54]:
df_churn_model = df_churn_model.drop(columns=['Customer ID'])

ModelFunctions.mlflow_up('Modelos churn_model_dataset')

X_train, X_test, y_train, y_test = ModelFunctions.model_train_test(df_churn_model, 'Customer Status')

In [55]:
ModelFunctions.rand_search_cv(RandomForestClassifier(), PARAM_GRID_RF, X_train, X_test, y_train, y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


2024/11/20 19:35:46 INFO mlflow.tracking._tracking_service.client: 游끢 View run RandomSearchCV_RandomForestClassifier at: http://127.0.0.1:5000/#/experiments/784384534159745577/runs/35e662e76b4f4d2da541ad234ef9d440.
2024/11/20 19:35:46 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5000/#/experiments/784384534159745577.


Melhores par칙metros: {'n_estimators': 100, 'max_depth': 10, 'criterion': 'gini'}
Precis칚o (acur치cia): 0.8357785139611926


In [56]:
ModelFunctions.grid_search_cv(RandomForestClassifier(), PARAM_GRID_RF, X_train, X_test, y_train, y_test)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


2024/11/20 19:36:35 INFO mlflow.tracking._tracking_service.client: 游끢 View run GridSearchCVRandomForestClassifier at: http://127.0.0.1:5000/#/experiments/784384534159745577/runs/11a6923cb5914ea08683d60758aca50c.
2024/11/20 19:36:35 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5000/#/experiments/784384534159745577.


Melhores par칙metros: {'criterion': 'gini', 'max_depth': None, 'n_estimators': 100}
Precis칚o (acur치cia): 0.8324656885944155


In [57]:
ModelFunctions.bayesian_search_cv(RandomForestClassifier(), PARAM_GRID_RF, X_train, X_test, y_train, y_test)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

2024/11/20 19:50:57 INFO mlflow.tracking._tracking_service.client: 游끢 View run BayesSearchCVRandomForestClassifier at: http://127.0.0.1:5000/#/experiments/784384534159745577/runs/79fe51a84a00434aaf9efb44eaac490e.
2024/11/20 19:50:57 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5000/#/experiments/784384534159745577.


Melhores par칙metros: OrderedDict({'criterion': 'entropy', 'max_depth': None, 'n_estimators': 50})
Precis칚o (acur치cia): 0.8277330809275911


### Dataset Filtrado


In [58]:
df_churn_model_filtered = df_churn_model_filtered.drop(columns=['Customer ID'])

ModelFunctions.mlflow_up('Modelos churn_filtered_dataset')

X_train, X_test, y_train, y_test = ModelFunctions.model_train_test(df_churn_model_filtered, 'Customer Status')

In [59]:
ModelFunctions.rand_search_cv(RandomForestClassifier(), PARAM_GRID_RF, X_train, X_test, y_train, y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


2024/11/20 19:51:19 INFO mlflow.tracking._tracking_service.client: 游끢 View run RandomSearchCV_RandomForestClassifier at: http://127.0.0.1:5000/#/experiments/358795740332304726/runs/aaa83579297447e29c38c6da82cb4390.
2024/11/20 19:51:19 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5000/#/experiments/358795740332304726.


Melhores par칙metros: {'n_estimators': 50, 'max_depth': 10, 'criterion': 'entropy'}
Precis칚o (acur치cia): 0.8343587316611453


In [60]:
ModelFunctions.grid_search_cv(RandomForestClassifier(), PARAM_GRID_RF, X_train, X_test, y_train, y_test)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


2024/11/20 19:52:02 INFO mlflow.tracking._tracking_service.client: 游끢 View run GridSearchCVRandomForestClassifier at: http://127.0.0.1:5000/#/experiments/358795740332304726/runs/d5ef696b142749fa8c91b2a9011d355e.
2024/11/20 19:52:02 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5000/#/experiments/358795740332304726.


Melhores par칙metros: {'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 100}
Precis칚o (acur치cia): 0.83719829626124


In [61]:
ModelFunctions.bayesian_search_cv(RandomForestClassifier(), PARAM_GRID_RF, X_train, X_test, y_train, y_test)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

2024/11/20 20:05:54 INFO mlflow.tracking._tracking_service.client: 游끢 View run BayesSearchCVRandomForestClassifier at: http://127.0.0.1:5000/#/experiments/358795740332304726/runs/dd5a48eaa3144f2b8dc86e6f19537b2d.
2024/11/20 20:05:54 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5000/#/experiments/358795740332304726.


Melhores par칙metros: OrderedDict({'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 100})
Precis칚o (acur치cia): 0.8381448177946048
