# Aula prática: Preparação dos modelos

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('churn.csv')

In [3]:
df.head(10)

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107,area_code_415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,no
1,NJ,137,area_code_415,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,no
2,OH,84,area_code_408,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,no
3,OK,75,area_code_415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,no
4,MA,121,area_code_510,no,yes,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,no
5,MO,147,area_code_415,yes,no,0,157.0,79,26.69,103.1,94,8.76,211.8,96,9.53,7.1,6,1.92,0,no
6,LA,117,area_code_408,no,no,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,no
7,WV,141,area_code_415,yes,yes,37,258.6,84,43.96,222.0,111,18.87,326.4,97,14.69,11.2,5,3.02,0,no
8,IN,65,area_code_415,no,no,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,yes
9,RI,74,area_code_415,no,no,0,187.7,127,31.91,163.4,148,13.89,196.0,94,8.82,9.1,5,2.46,0,no


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4250 entries, 0 to 4249
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   state                          4250 non-null   object 
 1   account_length                 4250 non-null   int64  
 2   area_code                      4250 non-null   object 
 3   international_plan             4250 non-null   object 
 4   voice_mail_plan                4250 non-null   object 
 5   number_vmail_messages          4250 non-null   int64  
 6   total_day_minutes              4250 non-null   float64
 7   total_day_calls                4250 non-null   int64  
 8   total_day_charge               4250 non-null   float64
 9   total_eve_minutes              4250 non-null   float64
 10  total_eve_calls                4250 non-null   int64  
 11  total_eve_charge               4250 non-null   float64
 12  total_night_minutes            4250 non-null   f

In [5]:
df.churn.value_counts()

no     3652
yes     598
Name: churn, dtype: int64

## Vamos fazer algumas transformações rápidas...
* Deletar as três primeiras variáveis
* Transformar (binarizar) as variáveis: "international_plan"e "voice_mail_plan"

In [6]:
df.drop(columns=df.columns[:3], inplace=True) # Deletandos as três primeiras colunas (índices 0, 1 e 2)
df = pd.get_dummies(data=df, drop_first=True) # Transformando os atributos categóricos em binários
df.head(10)

Unnamed: 0,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,international_plan_yes,voice_mail_plan_yes,churn_yes
0,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0,1,0
1,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0,0,0
2,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,1,0,0
3,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,1,0,0
4,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,0,1,0
5,0,157.0,79,26.69,103.1,94,8.76,211.8,96,9.53,7.1,6,1.92,0,1,0,0
6,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,0,0,0
7,37,258.6,84,43.96,222.0,111,18.87,326.4,97,14.69,11.2,5,3.02,0,1,1,0
8,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,0,0,1
9,0,187.7,127,31.91,163.4,148,13.89,196.0,94,8.82,9.1,5,2.46,0,0,0,0


## Preparação do modelo para validação
* holdout
* holdout repetido
* k-fold cross validation (kf)
* leave-one-out (loo)

### Primeiro, separar conjuntos em X (input) e y (output class)

In [7]:
X = df.drop(columns="churn_yes")
y = df.churn_yes

In [8]:
np.shape(X), np.shape(y), np.shape(df)

((4250, 16), (4250,), (4250, 17))

### Holdout (80% treino e 20% teste)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
np.shape(X_train), np.shape(X_test), np.shape(y_train), np.shape(y_test)

((3400, 16), (850, 16), (3400,), (850,))

In [11]:
X_train.head(5)

Unnamed: 0,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,international_plan_yes,voice_mail_plan_yes
1340,0,180.0,119,30.6,198.8,126,16.9,217.1,70,9.77,12.4,3,3.35,1,1,0
3790,35,190.7,100,32.42,209.7,107,17.82,202.5,101,9.11,12.4,5,3.35,1,0,1
2647,17,204.9,84,34.83,201.0,102,17.09,219.7,97,9.89,11.3,5,3.05,0,0,1
4009,0,202.0,100,34.34,168.9,90,14.36,231.8,119,10.43,3.4,4,0.92,1,0,0
3537,0,158.1,107,26.88,181.5,101,15.43,200.3,126,9.01,8.3,7,2.24,1,0,0


In [12]:
y_train.head(5)

1340    0
3790    0
2647    0
4009    0
3537    0
Name: churn_yes, dtype: uint8

### Holdout Repetido (10 repetições)

In [13]:
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # Aqui vocês colocam o algoritmo para treinar e testar...
    print(X_train.index)

Int64Index([3513, 1049, 3741, 1638, 1520, 1396,  137, 3413,   51, 1881,
            ...
             552, 2817, 3689, 2841,  358, 2096, 2494, 2660, 3572,  365],
           dtype='int64', length=3400)
Int64Index([2623, 3751, 3265, 1250, 3465, 1112, 3786,  728,   99, 1827,
            ...
            3093,  879, 1793, 3141, 2586, 3205, 1269, 4143,  311, 3663],
           dtype='int64', length=3400)
Int64Index([ 732, 3875, 3479,  985, 2535, 2543, 4087, 1024, 1314, 1939,
            ...
            1315, 2765, 3630,  792, 2850, 2019, 3649, 1985, 3704,  231],
           dtype='int64', length=3400)
Int64Index([ 889, 2220, 1172, 1792, 3535,   76, 3619, 3846,  497, 2890,
            ...
            2193, 2661,  815, 1045, 3488, 2024, 1946, 2183, 2004, 3671],
           dtype='int64', length=3400)
Int64Index([3925,  596, 2891, 1139, 2117, 1176, 3595,  633, 1664, 1680,
            ...
            3067, 1490, 2303, 3488, 2882,  641, 1765, 3878, 3018, 2395],
           dtype='int64', length=3400)


### k-fold cross validation utilizando k = 10

In [14]:
# precisamos importar as bibliotecas kfol, leave-one-out, cross_val_score e também uma técnica de mineração
from sklearn.model_selection import KFold, LeaveOneOut, cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [15]:
DT = DecisionTreeClassifier()

In [16]:
kf = KFold(n_splits=10)
loo = LeaveOneOut()

In [17]:
# Primeiro, utilizando o k-fold
acc_kf = cross_val_score(estimator=DT, X=X_train, y=y_train, cv=kf, scoring="accuracy")

In [18]:
acc_kf

array([0.91764706, 0.94117647, 0.92941176, 0.93529412, 0.93823529,
       0.90882353, 0.92941176, 0.90294118, 0.92058824, 0.90882353])

In [19]:
print(f"A acurácia média obtida pelos modelos é de {np.mean(acc_kf)}")

A acurácia média obtida pelos modelos é de 0.923235294117647


### Leave-one-out (k=n)

acc_loo = cross_val_score(estimator=DT, X=X_train, y=y_train, cv=loo, scoring="accuracy")

acc_loo

print(f"A acurácia média obtida pelos modelos é de {np.mean(acc_loo)}")

## Fazendo o balanceamento de classes (realizar no conjunto de treinamento)
* Random undersampling
* Random oversampling

In [20]:
# !pip install imbalanced-learn

In [21]:
from imblearn import under_sampling, over_sampling

In [23]:
y_train.value_counts()

0    2911
1     489
Name: churn_yes, dtype: int64

In [24]:
489/(2911+489)

0.1438235294117647

### Undersampling

In [25]:
# Retirando amostras da classe mais favorecida (neste caso churn_yes = 0)
under = under_sampling.RandomUnderSampler(sampling_strategy=0.9)

In [26]:
X_under, y_under = under.fit_resample(X_train, y_train)

In [27]:
np.shape(X_under), np.shape(y_under)

((1032, 16), (1032,))

In [28]:
y_under.value_counts()

0    543
1    489
Name: churn_yes, dtype: int64

In [30]:
489/(543)

0.9005524861878453

### Oversampling

In [31]:
# Reamostragem da classe menos favorecida (neste caso churn_yes = 1)
over = over_sampling.RandomOverSampler(sampling_strategy=0.9)

In [32]:
X_over, y_over = over.fit_resample(X_train, y_train)

In [33]:
np.shape(X_over), np.shape(y_over)

((5530, 16), (5530,))

In [34]:
y_over.value_counts()

0    2911
1    2619
Name: churn_yes, dtype: int64

In [35]:
2619/2911

0.8996908278941944

## Testando a acurácia do modelo com árvores de decisão e validação k-fold

In [36]:
acc_under = cross_val_score(estimator=DT, X=X_under, y=y_under, cv=kf, scoring="accuracy")
acc_over = cross_val_score(estimator=DT, X=X_over, y=y_over, cv=kf, scoring="accuracy")

In [37]:
print(f"Acurácia média utilizando conjunto original = {np.mean(acc_kf):.2f}")
print(f"Acurácia média utilizando Undersampling = {np.mean(acc_under):.2f}")
print(f"Acurácia média utilizando Oversampling = {np.mean(acc_over):.2f}")

Acurácia média utilizando conjunto original = 0.92
Acurácia média utilizando Undersampling = 0.78
Acurácia média utilizando Oversampling = 0.97


In [38]:
acc_under

array([0.74038462, 0.74038462, 0.74757282, 0.78640777, 0.81553398,
       0.7961165 , 0.83495146, 0.83495146, 0.80582524, 0.73786408])

In [39]:
acc_over

array([0.94755877, 0.93851718, 0.94213382, 0.95479204, 0.94575045,
       0.960217  , 0.98734177, 1.        , 1.        , 1.        ])