# Predicting Customer Churn in Telecom Operators


In [None]:
# Python Language Version
from platform import python_version
print('Python Language Version Used in This Jupyter Notebook:', python_version())

In [None]:
# Imports
#Libraries for saving the Model
import joblib
import pickle

#Data manipulation libraries
import numpy as np
import pandas as pd

#Data visualization libraries
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.preprocessing import StandardScaler

#Libraries for Machine Learning
import sklearn
from sklearn.model_selection import train_test_split # Split the dataset
from sklearn.model_selection import GridSearchCV # Hyper Parameter Optimization
from sklearn.model_selection import cross_val_score #Model Evaluation
from sklearn.model_selection import RandomizedSearchCV # Hyper Parameter Optimization

#Libraries with algorithms for Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

#Libraries calculate the model metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix
from sklearn.metrics import accuracy_score

%matplotlib inline 
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Versions of packages used in this notebook jupyter
%reload_ext watermark
%watermark -a "Matheus Francelino Barbosa" --iversions

## Loading the Dataset

In [None]:
# Load the training data
dados_treino = pd.read_csv('dados/projeto4_telecom_treino.csv')

In [None]:
dados_treino.shape

In [None]:
# Load the test data
dados_teste = pd.read_csv('dados/projeto4_telecom_teste.csv')

In [None]:
dados_teste.shape

In [None]:
dados_treino.columns

In [None]:
dados_treino.sample(10)

In [None]:
#Checking Data Types
dados_treino.dtypes

In [None]:
# Checking only categorical variables
dados_treino.dtypes[dados_treino.dtypes == 'object']

In [None]:
# List of categorical columns
cats = ['state',
        'area_code', 
        'international_plan',
        'voice_mail_plan']

In [None]:
#Verifying only numerical variables
dados_treino.dtypes[dados_treino.dtypes != 'object']

In [None]:
# List of numerical columns
nums = ['account_length', 
        'number_vmail_messages', 
        'total_day_minutes', 
        'total_day_calls', 
        'total_day_charge', 
        'total_eve_minutes',
        'total_eve_calls',
        'total_eve_charge',
        'total_night_minutes',
        'total_night_calls',
        'total_night_charge',
        'total_intl_minutes',
        'total_intl_calls',
        'total_intl_charge',
        'number_customer_service_calls']

In [None]:
dados_treino['churn'].value_counts()

In [None]:
target = dados_treino['churn']

In [None]:
target.value_counts()

## Exploring the numerical data

In [None]:
dados_treino.describe()

In [None]:
# Plot 
dados_treino.hist(figsize = (15,15), bins = 10) 
plt.show()

Apparently the variables follow a normal distribution, except for the variable _number_vmail_messages_.

In [None]:
#Rename the variable target
dados_treino.rename({'churn':'Target'}, axis = 'columns', inplace = True)

In [None]:
dados_treino.columns

In [None]:
# Function for label encoding
# Maps 'no' to 0 and 'yes' to 1
def encoding_func(x):
    """Encode a yes/no label as an integer.

    Parameters
    ----------
    x : str or int
        'yes' or 'no' (letter case and surrounding whitespace are ignored),
        or a value that is already encoded as 0 / 1.

    Returns
    -------
    int
        0 for 'no', 1 for 'yes'. An already-encoded 0 / 1 is returned
        unchanged, so applying the function twice to a column is harmless.

    Raises
    ------
    ValueError
        If ``x`` is anything else (including a missing value). Mapping every
        value other than 'no' to 1 would silently label such rows as positive.
    """
    if isinstance(x, str):
        label = x.strip().lower()
        if label == 'no':
            return 0
        if label == 'yes':
            return 1
    elif x in (0, 1):
        # Already encoded (e.g. the cell was run a second time)
        return int(x)
    raise ValueError('Unexpected label for yes/no encoding: {!r}'.format(x))

In [None]:
# Apply the function
dados_treino['Target'] = dados_treino['Target'].map(encoding_func)

In [None]:
dados_treino.sample(5)

In [None]:
dados_treino['Target'].value_counts()

In [None]:
target = dados_treino['Target']

In [None]:
# List of numerical columns
nums = ['account_length', 
        'number_vmail_messages', 
        'total_day_minutes', 
        'total_day_calls', 
        'total_day_charge', 
        'total_eve_minutes',
        'total_eve_calls',
        'total_eve_charge',
        'total_night_minutes',
        'total_night_calls',
        'total_night_charge',
        'total_intl_minutes',
        'total_intl_calls',
        'total_intl_charge',
        'number_customer_service_calls',
        'Target']

In [None]:
# Correlation between numerical variables
# The frame still contains text columns at this point (state, area_code and the
# yes/no plan columns), and pandas >= 2.0 raises on them instead of skipping them,
# so the numeric columns are selected explicitly.
dados_treino.select_dtypes(include = 'number').corr()

In [None]:
corr_df = dados_treino[nums].corr()

In [None]:
# Correlation (visual)
plt.figure(figsize = (14, 12))
sns.heatmap(corr_df, cmap = 'Blues', annot = True, fmt = '.2f') #cmap = 'Reds'

### Checking the relationship between attributes

In [None]:
# Set the background style
sns.set_style('darkgrid')

# Facetgrid: day minutes against day charge, colored by the Target label
# (`height` is the current name of the argument formerly called `size`)
sns.FacetGrid(dados_treino, hue = 'Target', height = 5).map(plt.scatter, 'total_day_minutes', 'total_day_charge').add_legend()

In [None]:
# Set the background style
sns.set_style('darkgrid')

# Facetgrid: evening minutes against evening charge, colored by the Target label
# (`height` is the current name of the argument formerly called `size`)
sns.FacetGrid(dados_treino, hue = 'Target', height = 5).map(plt.scatter, 'total_eve_minutes', 'total_eve_charge').add_legend()

In [None]:
# Set the background style
sns.set_style('darkgrid')

# Facetgrid: night minutes against night charge, colored by the Target label
# (`height` is the current name of the argument formerly called `size`)
sns.FacetGrid(dados_treino, hue = 'Target', height = 5).map(plt.scatter, 'total_night_minutes', 'total_night_charge').add_legend()

In [None]:
# Set the background style
sns.set_style('darkgrid')

# Facetgrid: international minutes against international charge, colored by the Target label
# (`height` is the current name of the argument formerly called `size`)
sns.FacetGrid(dados_treino, hue = 'Target', height = 5).map(plt.scatter, 'total_intl_minutes', 'total_intl_charge').add_legend()

***In order to avoid strong correlation between the attributes, we can remove the variables total_day_minutes, total_eve_minutes, total_night_minutes and total_intl_minutes***

## Exploring the categorical data

In [None]:
dados_treino.describe(include = ['object'])

In [None]:
# One count plot per categorical column listed in `cats`, laid out on a 2 x 2 grid
plt.figure(figsize = (14, 12))
for posicao, coluna in enumerate(cats, start = 1):
    plt.subplot(2, 2, posicao)
    sns.countplot(x = dados_treino[coluna], color = 'green', orient = 'v')
# The spacing only needs to be adjusted once, after every subplot has been drawn
plt.tight_layout()

## Applying transformations on categorical variables

In [None]:
# Function for label encoding for international_plan -> 0 = no and 1 = yes
# Apply function
dados_treino['international_plan'] = dados_treino['international_plan'].map(encoding_func)

In [None]:
# Label encoding for voice_mail_plan through encoding_func ('no' becomes 0)
# Apply the function
dados_treino['voice_mail_plan'] = dados_treino['voice_mail_plan'].map(encoding_func)

In [None]:
dados_treino.sample(5)

In [None]:
dados_treino.columns

In [None]:
# Checking only categorical variables
dados_treino.dtypes[dados_treino.dtypes == 'object']

In [None]:
# Checking only the non-categorical variables
dados_treino.dtypes[dados_treino.dtypes != 'object']

In [None]:
dados_treino['state'].value_counts()

In [None]:
# Applying one-hot encoding to the area_code variable

In [None]:
# Applying One-Hot Encoding
for cat in ['area_code']:
    onehots = pd.get_dummies(dados_treino[cat], prefix = cat)
    dados_treino = dados_treino.join(onehots)

In [None]:
dados_treino.columns


In [None]:
dados_treino.sample(5)

## Cleaning the Data

First we will remove the `Unnamed: 0` column and the `state` column, together with the original `area_code` column, which is now represented by its one-hot encoded columns.

In [None]:
dados_treino = dados_treino.drop(columns = ['Unnamed: 0', 
                        'state', 
                        'area_code'])

In [None]:
# Removing the total_day_minutes, total_eve_minutes, total_night_minutes and total_intl_minutes columns to avoid correlation
dados_treino = dados_treino.drop(columns = ['total_day_minutes', 
                                            'total_eve_minutes', 
                                            'total_night_minutes',
                                            'total_intl_minutes'])

In [None]:
dados_treino.sample(5)

In [None]:
#Rename the variable area_code
dados_treino.rename({'area_code_area_code_408':'area_code_408','area_code_area_code_415':'area_code_415','area_code_area_code_510':'area_code_510'}, axis = 'columns', inplace = True)

In [None]:
dados_treino.sample(5)

### Checking for null and duplicate values

In [None]:
# Null values: rows that contain at least one missing value
# any(axis = 1) yields exactly one boolean per row, which is what row selection
# expects (the cell-level 2-D array is not a row mask)
dados_treino[dados_treino.isnull().any(axis = 1)]

In [None]:
#Duplicate values
dados_treino[dados_treino.duplicated(keep = False)]

## Checking Outliers

In [None]:
dados_treino.describe()

In [None]:
dados_treino.shape

In [None]:
dados_treino.columns

In [None]:
var_num = ['account_length',
       'number_vmail_messages', 'total_day_calls',
       'total_day_charge', 'total_eve_calls',
       'total_eve_charge', 'total_night_calls',
       'total_night_charge', 'total_intl_calls',
       'total_intl_charge', 'number_customer_service_calls']

In [None]:
var_num

In [None]:
# Plot 
dados_treino.hist(figsize = (15,15), bins = 10) 
plt.show()

In [None]:
# One vertical box plot per numerical feature, all drawn on the same figure
plt.figure(figsize = (20, 40))

features = var_num
n_colunas = len(features) // 2  # the grid has 5 rows and this many columns
for i, nome in enumerate(features):
    plt.subplot(5, n_colunas, i + 1)
    sns.boxplot(y = dados_treino[nome], color = 'magenta', orient = 'v')

In [None]:
dados_treino.columns

In [None]:
# Boxplot of account_length
plt.figure(figsize = (10, 4))
# The series is passed by keyword so the plot stays horizontal regardless of how
# the installed seaborn version interprets the first positional argument
sns.boxplot(x = dados_treino.account_length)

In [None]:
#Frequency Counting per Value
dados_treino.account_length.sort_values(ascending = False).head(10)


In [None]:
# Keep only those records where the value is less than or equal to 220
dados_treino = dados_treino[dados_treino.account_length <= 220]
dados_treino.shape

In [None]:
# Boxplot of total_day_calls
plt.figure(figsize = (10, 4))
# The series is passed by keyword so the plot stays horizontal regardless of how
# the installed seaborn version interprets the first positional argument
sns.boxplot(x = dados_treino.total_day_calls)

In [None]:
# Frequency count per value
dados_treino.total_day_calls.sort_values(ascending = True).head(10)

In [None]:
# Keep only those records where the value is greater than or equal to 40
dados_treino = dados_treino[dados_treino.total_day_calls >= 40]
dados_treino.shape

In [None]:
# Frequency count per value
dados_treino.total_day_calls.sort_values(ascending = False).head(10)

In [None]:
# Keep only those records where the value is less than or equal to 157
dados_treino = dados_treino[dados_treino.total_day_calls <= 157]
dados_treino.shape

In [None]:
# Boxplot of total_intl_calls
plt.figure(figsize = (10, 4))
# The series is passed by keyword so the plot stays horizontal regardless of how
# the installed seaborn version interprets the first positional argument
sns.boxplot(x = dados_treino.total_intl_calls)

In [None]:
# Frequency count per value
dados_treino.total_intl_calls.sort_values(ascending = False).head(10)

In [None]:
# Keep only those records where the value is less than or equal to 15
dados_treino = dados_treino[dados_treino.total_intl_calls <= 15]
dados_treino.shape

In [None]:
# Boxplot of number_customer_service_calls
plt.figure(figsize = (10, 4))
# The series is passed by keyword so the plot stays horizontal regardless of how
# the installed seaborn version interprets the first positional argument
sns.boxplot(x = dados_treino.number_customer_service_calls)

In [None]:
# Frequency count per value
dados_treino.number_customer_service_calls.sort_values(ascending = False).head(10)

In [None]:
# Keep only those records where the value is less than or equal to 6
dados_treino = dados_treino[dados_treino.number_customer_service_calls <= 6]
dados_treino.shape

In [None]:
dados_treino.sample(10)

In [None]:
dados_treino['account_length'].describe()

In [None]:
dados_treino.columns

In [None]:
num = ['account_length', 'international_plan', 'voice_mail_plan',
       'number_vmail_messages', 'total_day_calls', 'total_day_charge',
       'total_eve_calls', 'total_eve_charge', 'total_night_calls',
       'total_night_charge', 'total_intl_calls', 'total_intl_charge',
       'number_customer_service_calls', 'area_code_408',
       'area_code_415', 'area_code_510', 'Target']

In [None]:
corr_df2 = dados_treino[num].corr()

In [None]:
# Correlation (visual)
plt.figure(figsize = (14, 12))
sns.heatmap(corr_df2, cmap = 'Blues', annot = True, fmt = '.2f') #cmap = 'Reds'

In [None]:
dados_treino.corr()

In [None]:
dados_treino.describe()

In [None]:
# Split dataset - Output variable
y = dados_treino.Target

In [None]:
y

In [None]:
# Creates a separate object for the input variables
X = dados_treino.drop('Target', axis = 1)

In [None]:
X

In [None]:
print(X.shape, y.shape)

In [None]:
X_treino = X

In [None]:
y_treino = y

## Balancing the classes in the training dataset

In [None]:
y.value_counts()

In [None]:
# Installs the package
!pip install -q imblearn

In [None]:
# Load the SMOTE function
import imblearn
from imblearn.over_sampling import SMOTE

In [None]:
# Let's apply the oversampling technique and increase the number of examples of the minority class
# A fixed random_state makes the synthetic samples, and every metric computed from
# them afterwards, reproducible from one run of the notebook to the next
over_sampler = SMOTE(k_neighbors = 2, random_state = 42)

In [None]:
# Apply oversampling (must be done with training data only)
X_res, y_res = over_sampler.fit_resample(X_treino, y_treino)

In [None]:
y_res.value_counts()

In [None]:
X_treino = X_res
y_treino = y_res

## Standardization of the training dataset

In [None]:
X_treino.head()

In [None]:
# We calculate mean and standard deviation of the training data
treino_mean = X_treino.mean()
treino_std = X_treino.std()
print(treino_mean)
print(treino_std)

In [None]:
# Standardization
X_treino = (X_treino - treino_mean) / treino_std

In [None]:
X_treino.head()

In [None]:
# Describe
X_treino.describe()

# Preparing the test data

In [None]:
dados_treino.columns

In [None]:
dados_teste.shape

In [None]:
#Rename the variable target
dados_teste.rename({'churn':'Target'}, axis = 'columns', inplace = True)

In [None]:
dados_teste.columns

In [None]:
# Applies the function
dados_teste['Target'] = dados_teste['Target'].map(encoding_func)

In [None]:
dados_teste.sample(5)

In [None]:
# Function for label encoding for international_plan -> 0 = no and 1 = yes
# Apply function
dados_teste['international_plan'] = dados_teste['international_plan'].map(encoding_func)

In [None]:
# Function for label encoding for voice_mail_plan -> 0 = no and 1 = yes
# Apply the function
dados_teste['voice_mail_plan'] = dados_teste['voice_mail_plan'].map(encoding_func)

In [None]:
dados_teste.sample(5)

In [None]:
# Applying One-Hot Encoding
for cat in ['area_code']:
    onehots = pd.get_dummies(dados_teste[cat], prefix = cat)
    dados_teste = dados_teste.join(onehots)

In [None]:
dados_teste.columns

In [None]:
dados_teste = dados_teste.drop(columns = ['Unnamed: 0', 
                        'state', 
                        'area_code'])

In [None]:
# Removing the total_day_minutes, total_eve_minutes, total_night_minutes and total_intl_minutes columns to avoid correlation
dados_teste = dados_teste.drop(columns = ['total_day_minutes', 
                                            'total_eve_minutes', 
                                            'total_night_minutes',
                                            'total_intl_minutes'])

In [None]:
#Rename the variable area_code
dados_teste.rename({'area_code_area_code_408':'area_code_408','area_code_area_code_415':'area_code_415','area_code_area_code_510':'area_code_510'}, axis = 'columns', inplace = True)

In [None]:
dados_teste.columns

In [None]:
y_teste = dados_teste.Target

In [None]:
# Creates a separate object for the input variables
X_teste = dados_teste.drop('Target', axis = 1)

In [None]:
# We use training mean and variance to standardize the test data set
X_teste = (X_teste - treino_mean) / treino_std

# Logistic Regression Model

In [None]:
# Set hyperparameter list
tuned_params_v1 = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 
                   'penalty': ['l1', 'l2']}

In [None]:
# We will create the model with GridSearch 
# Several models will be created with different combinations of hyperparameters
# The grid contains both 'l1' and 'l2' penalties; the liblinear solver supports
# both, whereas the default solver rejects 'l1', which would make every 'l1'
# candidate fail to fit and drop out of the search unnoticed
modelo_v1 = GridSearchCV(LogisticRegression(solver = 'liblinear'), 
                         tuned_params_v1, 
                         scoring = 'roc_auc', 
                         n_jobs = -1)

In [None]:
# Model training
modelo_v1.fit(X_treino, y_treino)

In [None]:
# We select the best model
modelo_v1.best_estimator_

In [None]:
# Predictions with test data
y_pred_v1 = modelo_v1.predict(X_teste)

In [None]:
# Show the top 10 predictions
y_pred_v1[:10]

In [None]:
# We get the predictions in probability format for each class
y_pred_proba_v1 = modelo_v1.predict_proba(X_teste)

In [None]:
# Show the top 10 predictions
y_pred_proba_v1[:10]

In [None]:
# We get the predictions in probability format by filtering for the positive class
# We need this to calculate the ROC curve
y_pred_proba_v1 = modelo_v1.predict_proba(X_teste)[:,1]

In [None]:
# Show the top 10 predictions
y_pred_proba_v1[:10]

In [None]:
# As an example, let's check one of the data points (change the value of i if you wish)
i = 16 
print('For data point {}, actual class = {}, predicted class = {}, predicted probability = {}'.
      format(i, y_teste.iloc[i], y_pred_v1[i], y_pred_proba_v1[i]))

In [None]:
# Confusion matrix
confusion_matrix(y_teste, y_pred_v1)

In [None]:
# Extracting each value from the CM
tn, fp, fn, tp = confusion_matrix(y_teste, y_pred_v1).ravel()

In [None]:
print(tn, fp, fn, tp)

In [None]:
# ROC AUC score computed from the hard class predictions returned by predict(),
# not from predicted probabilities; the probability-based AUC is obtained
# separately from the predict_proba() output via roc_curve() and auc()
roc_auc_v1 = roc_auc_score(y_teste, y_pred_v1)
print(roc_auc_v1)

In [None]:
# Calculate the ROC curve with data and predictions under test
fpr_v1, tpr_v1, thresholds = roc_curve(y_teste, y_pred_proba_v1)

In [None]:
# AUC in test
auc_v1 = auc(fpr_v1, tpr_v1)
print(auc_v1)

In [None]:
# Test Accuracy
acuracia_v1 = accuracy_score(y_teste, y_pred_v1)
print(acuracia_v1)

### Feature Importance

In [None]:
# Keep only the winning estimator, because the final version should not carry GridSearchCV
# GridSearchCV refits the best candidate on the whole training set by default, so
# the best hyperparameters are taken from the search instead of being hard-coded
# getattr keeps the cell re-runnable once modelo_v1 is already the bare estimator
modelo_v1 = getattr(modelo_v1, 'best_estimator_', modelo_v1)
modelo_v1

In [None]:
# We get the coefficients by largest using np.argsort
indices = np.argsort(-abs(modelo_v1.coef_[0,:]))

In [None]:
print("Most important variables for the model result_v1:")
print(50*'-')
for feature in X.columns[indices]:
    print(feature)

In [None]:
# Save the model to disk
with open('modelos/modelo_regressao.pkl', 'wb') as pickle_file:
    # Write through the handle that is already open instead of opening the same path a second time
    joblib.dump(modelo_v1, pickle_file)

## Model V2

In [None]:
dados_treino.columns

In [None]:
# Removing the one-hot encoded area_code columns from the training data
# (the source frame must be dados_treino; dropping from dados_teste would replace
# the training data with the test data)
dados_treino = dados_treino.drop(columns = ['area_code_408', 
                                            'area_code_510', 
                                            'area_code_415'])

In [None]:
dados_treino.columns

In [None]:
dados_treino.sample(5)

In [None]:
# Describe
X_treino.describe()

In [None]:
# Describe
y_treino.describe()

In [None]:
# Removing the one-hot encoded area_code columns from the training features
X_treino = X_treino.drop(columns = ['area_code_408', 
                                            'area_code_510', 
                                            'area_code_415'])

In [None]:
# Describe
X_treino.describe()

### Do the same thing with Test Data

In [None]:
X_teste.sample(5)

In [None]:
# Removing the one-hot encoded area_code columns from the test features
X_teste = X_teste.drop(columns = ['area_code_408', 
                                            'area_code_510', 
                                            'area_code_415'])

## Prediction V2 

In [None]:
# Define hyperparameter list
tuned_params_v1 = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 
                   'penalty': ['l1', 'l2']}

In [None]:
# We will create the model with GridSearch 
# Several models will be created with different combinations of hyperparameters
# The grid contains both 'l1' and 'l2' penalties; the liblinear solver supports
# both, whereas the default solver rejects 'l1', which would make every 'l1'
# candidate fail to fit and drop out of the search unnoticed
modelo_v1 = GridSearchCV(LogisticRegression(solver = 'liblinear'), 
                         tuned_params_v1, 
                         scoring = 'roc_auc', 
                         n_jobs = -1)

In [None]:
# Model training
modelo_v1.fit(X_treino, y_treino)

In [None]:
# We select the best model
modelo_v1.best_estimator_

In [None]:
# Predictions with test data, recomputed with the model that was just trained
# (otherwise the confusion matrix, ROC AUC score and accuracy of this section
# would be calculated from the predictions of the previous model)
y_pred_v1 = modelo_v1.predict(X_teste)

# Show the first 10 predictions
y_pred_v1[:10]

In [None]:
# We get the predictions in probability format for each class
y_pred_proba_v1 = modelo_v1.predict_proba(X_teste)

In [None]:
# Show the top 10 predictions
y_pred_proba_v1[:10]

In [None]:
# We get the predictions in probability format by filtering for the positive class
# We need this to calculate the ROC curve
y_pred_proba_v1 = modelo_v1.predict_proba(X_teste)[:,1]

In [None]:
# Show the top 10 predictions
y_pred_proba_v1[:10]

In [None]:
# As an example, let's check one of the data points (change the value of i if you wish)
i = 16 
print('For data point {}, actual class = {}, predicted class = {}, predicted probability = {}'.
      format(i, y_teste.iloc[i], y_pred_v1[i], y_pred_proba_v1[i]))

In [None]:
# Confusion matrix
confusion_matrix(y_teste, y_pred_v1)

In [None]:
# Extracting each value from the CM
tn, fp, fn, tp = confusion_matrix(y_teste, y_pred_v1).ravel()

In [None]:
print(tn, fp, fn, tp)

In [None]:
# Calculate overall AUC (Area Under The Curve) metric with actual data and predictions under test
roc_auc_v1 = roc_auc_score(y_teste, y_pred_v1)
print(roc_auc_v1)

In [None]:
# Calculate overall AUC (Area Under The Curve) metric with actual data and predictions under test
fpr_v1, tpr_v1, thresholds = roc_curve(y_teste, y_pred_proba_v1)

In [None]:
# AUC in test
auc_v1 = auc(fpr_v1, tpr_v1)
print(auc_v1)

In [None]:
# Test Accuracy
acuracia_v1 = accuracy_score(y_teste, y_pred_v1)
print(acuracia_v1)

## Model V1 with 5 variables

In [None]:
'''international_plan, voice_mail_plan, total_day_charge, total_eve_charge, number_customer

voice_mail_plan
total_day_charge
number_customer_service_calls
international_plan
number_vmail_messages'''

In [None]:
X_treino.columns

In [None]:
X_treino = X_treino.drop(columns = ['account_length', 
                                    'total_day_calls', 
                                    'total_eve_calls',
                                    'total_eve_charge',
                                    'total_night_calls',
                                    'total_night_charge',
                                    'total_intl_calls',
                                    'total_intl_charge'])

In [None]:
X_treino.columns

In [None]:
X_teste.columns

In [None]:
X_teste = X_teste.drop(columns = ['account_length', 
                                    'total_day_calls', 
                                    'total_eve_calls',
                                    'total_eve_charge',
                                    'total_night_calls',
                                    'total_night_charge',
                                    'total_intl_calls',
                                    'total_intl_charge'])

In [None]:
# Define hyperparameter list
tuned_params_v1 = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 
                   'penalty': ['l2']}

In [None]:
# We will create the model with GridSearch 
# Several models will be created with different combinations of hyperparameters
modelo_v1 = GridSearchCV(LogisticRegression(), 
                         tuned_params_v1, 
                         scoring = 'roc_auc')

In [None]:
# Model training
modelo_v1.fit(X_treino, y_treino)

In [None]:
# We select the best model
modelo_v1.best_estimator_

In [None]:
# Predictions with test data, recomputed with the model that was just trained
# (otherwise the confusion matrix, ROC AUC score and accuracy of this section
# would be calculated from the predictions of the previous model)
y_pred_v1 = modelo_v1.predict(X_teste)

# Show the first 10 predictions
y_pred_v1[:10]

In [None]:
# We get the predictions in probability format for each class
y_pred_proba_v1 = modelo_v1.predict_proba(X_teste)

In [None]:
# Show the top 10 predictions
y_pred_proba_v1[:10]

In [None]:
# We get the predictions in probability format by filtering for the positive class
# We need this to calculate the ROC curve
y_pred_proba_v1 = modelo_v1.predict_proba(X_teste)[:,1]

In [None]:
# Show the top 10 predictions
y_pred_proba_v1[:10]

In [None]:
# As an example, let's check one of the data points (change the value of i if you wish)
i = 16 
print('For data point {}, actual class = {}, predicted class = {}, predicted probability = {}'.
      format(i, y_teste.iloc[i], y_pred_v1[i], y_pred_proba_v1[i]))

In [None]:
# Confusion matrix
confusion_matrix(y_teste, y_pred_v1)

In [None]:
# Extracting each value from the CM
tn, fp, fn, tp = confusion_matrix(y_teste, y_pred_v1).ravel()

In [None]:
# Calculate overall AUC (Area Under The Curve) metric with actual data and predictions under test
roc_auc_v1 = roc_auc_score(y_teste, y_pred_v1)
print(roc_auc_v1)

In [None]:
# Calculate the ROC curve with data and predictions under test
fpr_v1, tpr_v1, thresholds = roc_curve(y_teste, y_pred_proba_v1)

In [None]:
# AUC in test
auc_v1 = auc(fpr_v1, tpr_v1)
print(auc_v1)

In [None]:
# Test Accuracy
acuracia_v1 = accuracy_score(y_teste, y_pred_v1)
print(acuracia_v1)

In [None]:
# Create a dataframe to receive the metrics for each model
df_modelos = pd.DataFrame()

In [None]:
# Dictionary with model_v1 metrics
dict_modelo_v1 = {'Nome': 'modelo_v1', 
                  'Algoritmo': 'Regressão Logística', 
                  'ROC_AUC Score': roc_auc_v1,
                  'AUC Score': auc_v1,
                  'Acurácia': acuracia_v1}

In [None]:
# Add dict to dataframe
# pd.concat is used because DataFrame.append is no longer available in pandas 2.x
df_modelos = pd.concat([df_modelos, pd.DataFrame([dict_modelo_v1])], ignore_index = True)

In [None]:
display(df_modelos)

# Random Forest Model

In [None]:
X_treino.sample(5)

In [None]:
# Hyperparameter grid
tuned_params_v2 = {'n_estimators': [100, 200, 300, 400, 500], 
                   'min_samples_split': [2, 5, 10], 
                   'min_samples_leaf': [1, 2, 4]}

In [None]:
# Create the model with RandomizedSearchCV to search for the best combination of hyperparameters
# random_state is fixed on both the forest and the search so that the sampled
# hyperparameter combinations and the fitted trees are the same on every run
modelo_v2 = RandomizedSearchCV(RandomForestClassifier(random_state = 42), 
                               tuned_params_v2, 
                               n_iter = 15, 
                               scoring = 'roc_auc', 
                               n_jobs  = -1, 
                               random_state = 42)

In [None]:
# Model training
modelo_v2.fit(X_treino, y_treino)

In [None]:
# Extract the best model
modelo_v2.best_estimator_

In [None]:
# Predictions under test
y_pred_v2 = modelo_v2.predict(X_teste)

In [None]:
# Get the predictions for the positive class
y_pred_proba_v2 = modelo_v2.predict_proba(X_teste)[:,1]

In [None]:
# Confusion matrix
confusion_matrix(y_teste, y_pred_v2)

In [None]:
# ROC curve in data and predictions under test
roc_auc_v2 = roc_auc_score(y_teste, y_pred_v2)
print(roc_auc_v2)

In [None]:
# ROC curve in data and predictions under test
fpr_v2, tpr_v2, thresholds = roc_curve(y_teste, y_pred_proba_v2)

In [None]:
# AUC in test
auc_v2 = auc(fpr_v2, tpr_v2)
print(auc_v2)

In [None]:
# Test Accuracy
acuracia_v2 = accuracy_score(y_teste, y_pred_v2)
print(acuracia_v2)

In [None]:
# Save the model to disk
with open('modelos/modelo_random_forest.pkl', 'wb') as pickle_file:
    # modelo_v2 is the random forest search; modelo_v1 is the logistic regression
    # The dump goes through the handle that is already open instead of re-opening the path
    joblib.dump(modelo_v2, pickle_file)

In [None]:
# Dictionary with model_v2 metrics
dict_modelo_v2 = {'Nome': 'modelo_randomForest', 
                  'Algoritmo': 'Random Forest', 
                  'ROC_AUC Score': roc_auc_v2,
                  'AUC Score': auc_v2,
                  'Acurácia': acuracia_v2}

In [None]:
# Add dict to dataframe
# pd.concat is used because DataFrame.append is no longer available in pandas 2.x
df_modelos = pd.concat([df_modelos, pd.DataFrame([dict_modelo_v2])], ignore_index = True)

In [None]:
display(df_modelos)

# Model 3 with KNN

In [None]:
# List of possible values of K
vizinhos = list(range(1, 20, 2))

In [None]:
# List for the scores
cv_scores = []

In [None]:
# Cross-validation to determine the best value of k
# The list of scores is rebuilt from scratch (one mean accuracy per value in
# `vizinhos`), so re-running this cell cannot leave extra scores behind and
# shift the position used to look up the best k
cv_scores = [
    cross_val_score(KNeighborsClassifier(n_neighbors = k), 
                    X_treino, 
                    y_treino, 
                    cv = 5, 
                    scoring = 'accuracy').mean()
    for k in vizinhos
]

In [None]:
# Adjusting the classification error
erro = [1 - x for x in cv_scores]

In [None]:
# Determining the best value of k (with smallest error)
optimal_k = vizinhos[erro.index(min(erro))]
print('O valor ideal de k é %d' % optimal_k)

In [None]:
# Create the model version 3
modelo_v3 = KNeighborsClassifier(n_neighbors = optimal_k)

In [None]:
# Model training
modelo_v3.fit(X_treino, y_treino)

In [None]:
# Predictions under test
y_pred_v3 = modelo_v3.predict(X_teste)

In [None]:
# Confusion matrix
confusion_matrix(y_teste, y_pred_v3)

In [None]:
# Positive class probability prediction
y_pred_proba_v3 = modelo_v3.predict_proba(X_teste)[:,1]

In [None]:
# Calculate ROC_AUC on test
roc_auc_v3 = roc_auc_score(y_teste, y_pred_v3)
print(roc_auc_v3)

In [None]:
# Calculate ROC curve
fpr_v3, tpr_v3, thresholds = roc_curve(y_teste, y_pred_proba_v3)

In [None]:
# AUC in test
auc_v3 = auc(fpr_v3, tpr_v3)
print(auc_v3)

In [None]:
# Test Accuracy
acuracia_v3 = accuracy_score(y_teste, y_pred_v3)
print(acuracia_v3)

In [None]:
# Save the model to disk
with open('modelos/modelo_knn.pkl', 'wb') as pickle_file:
    # Write through the handle that is already open instead of opening the same path a second time
    joblib.dump(modelo_v3, pickle_file)

In [None]:
# Dictionary with model_v3 metrics
dict_modelo_v3 = {'Nome': 'modelo_knn', 
                  'Algoritmo': 'KNN', 
                  'ROC_AUC Score': roc_auc_v3,
                  'AUC Score': auc_v3,
                  'Acurácia': acuracia_v3}

In [None]:
# Add dict to dataframe
# pd.concat is used because DataFrame.append is no longer available in pandas 2.x
df_modelos = pd.concat([df_modelos, pd.DataFrame([dict_modelo_v3])], ignore_index = True)

In [None]:
display(df_modelos)

# Model 4 with Decision Tree

In [None]:
# Hyperparameters
tuned_params_v4 = {'min_samples_split': [2, 3, 4, 5, 7], 
                   'min_samples_leaf': [1, 2, 3, 4, 6], 
                   'max_depth': [2, 3, 4, 5, 6, 7]}

In [None]:
# Create the model with RandomizedSearchCV
# random_state is fixed on both the tree and the search so that the sampled
# hyperparameter combinations and the fitted tree are the same on every run
modelo_v4 = RandomizedSearchCV(DecisionTreeClassifier(random_state = 42), 
                               tuned_params_v4, 
                               n_iter = 15, 
                               scoring = 'roc_auc', 
                               n_jobs = -1, 
                               random_state = 42)

In [None]:
# Model training
modelo_v4.fit(X_treino, y_treino)

In [None]:
# Extract the best model
modelo_v4.best_estimator_

In [None]:
# Predictions under test
y_pred_v4 = modelo_v4.predict(X_teste)

In [None]:
# Probability predictions
y_pred_proba_v4 = modelo_v4.predict_proba(X_teste)[:,1]

In [None]:
# Confusion matrix
confusion_matrix(y_teste, y_pred_v4)

In [None]:
# Calculates ROC AUC score
roc_auc_v4 = roc_auc_score(y_teste, y_pred_v4)
print(roc_auc_v4)

In [None]:
# ROC Curve
fpr_v4, tpr_v4, thresholds = roc_curve(y_teste, y_pred_proba_v4)

In [None]:
# AUC in test
auc_v4 = auc(fpr_v4, tpr_v4)
print(auc_v4)

In [None]:
# Test Accuracy
acuracia_v4 = accuracy_score(y_teste, y_pred_v4)
print(acuracia_v4)

In [None]:
# Save the model to disk
with open('modelos/modelo_decision_tree.pkl', 'wb') as pickle_file:
    # Write through the handle that is already open instead of opening the same path a second time
    joblib.dump(modelo_v4, pickle_file)

In [None]:
# Dictionary with model_v4 metrics
dict_modelo_v4 = {'Nome': 'modelo_decisionTree', 
                  'Algoritmo': 'Decision Tree', 
                  'ROC_AUC Score': roc_auc_v4,
                  'AUC Score': auc_v4,
                  'Acurácia': acuracia_v4}

In [None]:
# Add dict to dataframe
# pd.concat is used because DataFrame.append is no longer available in pandas 2.x
df_modelos = pd.concat([df_modelos, pd.DataFrame([dict_modelo_v4])], ignore_index = True)

In [None]:
display(df_modelos)

# Model 5 with SVM

In [None]:
# Function for hyperparameter selection
def svc_param_selection(X, y, nfolds):
    """Grid-search C and gamma for an RBF-kernel SVC.

    Parameters
    ----------
    X : array-like
        Input variables the candidate models are fitted on.
    y : array-like
        Labels matching ``X``.
    nfolds : int
        Passed to GridSearchCV as ``cv``.

    Returns
    -------
    dict
        The ``best_params_`` of the fitted search ('C' and 'gamma').
    """
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10], 
                  'gamma': [0.001, 0.01, 0.1, 1]}
    grid_search = GridSearchCV(SVC(kernel = 'rbf'), param_grid, cv = nfolds)
    # Fit on the data that was passed in, not on the notebook-level X_treino / y_treino
    grid_search.fit(X, y)
    return grid_search.best_params_

In [None]:
# Apply the function
svc_param_selection(X_treino, y_treino, 5)

In [None]:
# Create the SVC model with fixed hyperparameters (C = 1, gamma = 1)
# NOTE(review): these values are written by hand, not read from the
# svc_param_selection() result; presumably copied from an earlier run, so
# confirm they still match the search output
# probability = True enables the predict_proba() call made on this model later
modelo_v5 = SVC(C = 1, gamma = 1, probability = True)

In [None]:
# Model training
modelo_v5.fit(X_treino, y_treino)

In [None]:
# Predictions under test
y_pred_v5 = modelo_v5.predict(X_teste)

In [None]:
confusion_matrix(y_teste, y_pred_v5)

In [None]:
# Probability predictions
y_pred_proba_v5 = modelo_v5.predict_proba(X_teste)[:, 1]

In [None]:
# Calculates ROC AUC score
roc_auc_v5 = roc_auc_score(y_teste, y_pred_v5)
print(roc_auc_v5)

In [None]:
# Calculate ROC curve
fpr_v5, tpr_v5, thresholds = roc_curve(y_teste, y_pred_proba_v5)

In [None]:
# AUC in test
auc_v5 = auc(fpr_v5, tpr_v5)
print(auc_v5)

In [None]:
# Test Accuracy
acuracia_v5 = accuracy_score(y_teste, y_pred_v5)
print(acuracia_v5)

In [None]:
# Save the model to disk
with open('modelos/modelo_svm.pkl', 'wb') as pickle_file:
    # Write through the handle that is already open instead of opening the same path a second time
    joblib.dump(modelo_v5, pickle_file)

In [None]:
# Dictionary with model_v5 metrics
dict_modelo_v5 = {'Nome': 'modelo_svm', 
                  'Algoritmo': 'SVM', 
                  'ROC_AUC Score': roc_auc_v5,
                  'AUC Score': auc_v5,
                  'Acurácia': acuracia_v5}

In [None]:
# Add dict to dataframe
# pd.concat is used because DataFrame.append is no longer available in pandas 2.x
df_modelos = pd.concat([df_modelos, pd.DataFrame([dict_modelo_v5])], ignore_index = True)

In [None]:
display(df_modelos)

## Best Model Selection

In [None]:
# We will use the model with the highest AUC Score, because it is a global metric
# The AUC Score is ideal for comparing models from different algorithms
df_melhor_modelo = df_modelos[df_modelos['AUC Score'] == df_modelos['AUC Score'].max()]

In [None]:
df_modelos['AUC Score'].max()

In [None]:
df_melhor_modelo