In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# <h4>1. Preparação dos dados</h4>

In [2]:
# carregar os dados
df = pd.read_csv('data1.csv', sep=';', decimal=',')
df.head()

Unnamed: 0,account length,location code,user id,credit card info save,push status,add to wishlist,desktop sessions,app sessions,desktop transactions,total product detail views,session duration,promotion clicks,avg order value,sale product views,discount rate per visited products,product detail view per app session,app transactions,add to cart per session,customer service calls,churn
0,128,415,3824657,no,yes,25,265,45,17,110,197,87,244.7,91,11.01,10.0,3,2.7,1,0
1,107,415,3717191,no,yes,26,162,27,17,123,196,103,254.4,103,11.45,13.7,3,3.7,1,0
2,137,415,3581921,no,no,0,243,41,10,114,121,110,162.6,104,7.32,12.2,5,3.29,0,0
3,84,408,3759999,yes,no,0,299,51,5,71,62,88,196.9,89,8.86,6.6,7,1.78,2,0
4,75,415,3306626,yes,no,0,167,28,13,113,148,122,186.9,121,8.41,10.1,3,2.73,3,0


In [3]:
# Info dos dados
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   account length                       3333 non-null   int64  
 1   location code                        3333 non-null   int64  
 2   user id                              3333 non-null   int64  
 3   credit card info save                3333 non-null   object 
 4   push status                          3333 non-null   object 
 5   add to wishlist                      3333 non-null   int64  
 6   desktop sessions                     3333 non-null   int64  
 7   app sessions                         3333 non-null   int64  
 8   desktop transactions                 3333 non-null   int64  
 9   total product detail views           3333 non-null   int64  
 10  session duration                     3333 non-null   int64  
 11  promotion clicks              

In [4]:
# Removendo a coluna user id
df.drop('user id', axis=1, inplace=True)

In [5]:
# Codificando variáveis categórcias em numéricas
labelencoder = LabelEncoder()
df['credit card info save'] = labelencoder.fit_transform(df['credit card info save'])
df['push status'] = labelencoder.fit_transform(df['push status'])

In [6]:
# Defindo as variáveis preditoras (X) e a alvo (y)
X = df.drop('churn', axis=1)
y = df['churn']

# <h4>2. Separação dos dados em treino e teste e treino dos modelos</h4>

In [7]:
# Separar em dados de treino e teste
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
# Criando e Treinando os modelos

# Regressão Logística
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

# Árvore de Decisão
dec_tree_model = DecisionTreeClassifier()
dec_tree_model.fit(X_train, y_train)

# Random Forest
rnd_forest_model = RandomForestClassifier()
rnd_forest_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


RandomForestClassifier()

# <h4>3. Previsão dos modelos</h4>

In [9]:
logistic_pred = logistic_model.predict(X_test)
dec_tree_pred = dec_tree_model.predict(X_test)
rnd_forest_pred = rnd_forest_model.predict(X_test)

# <h4>4. Avaliação dos modelos</h4>

In [19]:
print('Relatório de Classificação\n')

print(f'Regressão Logística:\n{classification_report(y_test,logistic_pred)}')
print(f'Matriz de confusão:\n{confusion_matrix(y_test, logistic_pred)}')
confusao = confusion_matrix(y_test, logistic_pred)
acertos = confusao[0, 0] + confusao[1, 1]
print(f'Total de acertos: {acertos}\n')

print(f'Árvore de Decisão:\n{classification_report(y_test,dec_tree_pred)}')
print(f'Matriz de confusão:\n{confusion_matrix(y_test, dec_tree_pred)}')
confusao = confusion_matrix(y_test, dec_tree_pred)
acertos = confusao[0, 0] + confusao[1, 1]
print(f'Total de acertos: {acertos}\n')

print(f'Random Forest:\n{classification_report(y_test,rnd_forest_pred)}')
print(f'Matriz de confusão:\n{confusion_matrix(y_test, rnd_forest_pred)}')
confusao = confusion_matrix(y_test, rnd_forest_pred)
acertos = confusao[0, 0] + confusao[1, 1]
print(f'Total de acertos: {acertos}')


Relatório de Classificação

Regressão Logística:
              precision    recall  f1-score   support

           0       0.85      0.99      0.92       566
           1       0.57      0.04      0.07       101

    accuracy                           0.85       667
   macro avg       0.71      0.52      0.50       667
weighted avg       0.81      0.85      0.79       667

Matriz de confusão:
[[563   3]
 [ 97   4]]
Total de acertos: 567

Árvore de Decisão:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       566
           1       0.71      0.74      0.73       101

    accuracy                           0.92       667
   macro avg       0.83      0.84      0.84       667
weighted avg       0.92      0.92      0.92       667

Matriz de confusão:
[[536  30]
 [ 26  75]]
Total de acertos: 611

Random Forest:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       566
           1       0.9

# <h4>4. Interpretação dos Resultados</h4>

Com base nos resultados da avaliação dos modelos, o Random Forest foi o que melhor desempenho teve com acurácia de 94% e um total de acertos de 629.

# <h4>5. Implementação de Estratégias de Retenção</h4>

Com base nos insights obtidos com a análise de churn, podemos desenvolver estratégias para reter clientes. 

Isso pode envolver a criação de programas de fidelidade, melhorias no atendimento ao cliente ou ações direcionadas a clientes em risco de churn, com base nas características identificadas como relevantes pelos modelos. 