### Segmentacao de clientes

Conjunto de dados utilizado: https://www.kaggle.com/datasets/ermismbatuhan/digital-marketing-ecommerce-customer-behavior 

Segmentação de usuários de E-Commerce com base em seus comportamentos e características usando 3 técnicas de machine learning:  Random Forest, XGBoost e Redes Neurais.

Isso pode ajudar na personalização de serviços ou estratégias de marketing específicas para cada grupo.

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Dense


# <h4>1. Carregando o conjunto de dados</h4>

In [2]:
df = pd.read_csv('data1.csv', sep=';', decimal=',')
df.head()

Unnamed: 0,account length,location code,user id,credit card info save,push status,add to wishlist,desktop sessions,app sessions,desktop transactions,total product detail views,session duration,promotion clicks,avg order value,sale product views,discount rate per visited products,product detail view per app session,app transactions,add to cart per session,customer service calls,churn
0,128,415,3824657,no,yes,25,265,45,17,110,197,87,244.7,91,11.01,10.0,3,2.7,1,0
1,107,415,3717191,no,yes,26,162,27,17,123,196,103,254.4,103,11.45,13.7,3,3.7,1,0
2,137,415,3581921,no,no,0,243,41,10,114,121,110,162.6,104,7.32,12.2,5,3.29,0,0
3,84,408,3759999,yes,no,0,299,51,5,71,62,88,196.9,89,8.86,6.6,7,1.78,2,0
4,75,415,3306626,yes,no,0,167,28,13,113,148,122,186.9,121,8.41,10.1,3,2.73,3,0


In [3]:
# Informação sobre o conjunto de dados
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   account length                       3333 non-null   int64  
 1   location code                        3333 non-null   int64  
 2   user id                              3333 non-null   int64  
 3   credit card info save                3333 non-null   object 
 4   push status                          3333 non-null   object 
 5   add to wishlist                      3333 non-null   int64  
 6   desktop sessions                     3333 non-null   int64  
 7   app sessions                         3333 non-null   int64  
 8   desktop transactions                 3333 non-null   int64  
 9   total product detail views           3333 non-null   int64  
 10  session duration                     3333 non-null   int64  
 11  promotion clicks              

In [4]:
# Primeiro vamos remover as colunas que não serão usadas
columns_to_remove = ['account length', 'location code', 'user id']
df = df.drop(columns=columns_to_remove, axis=1)

Transformar colunas categóricas em numéricas

In [5]:
from sklearn.preprocessing import LabelEncoder

categorical_columns = ['credit card info save', 'push status']

for column in categorical_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])

# <h4>2. Separação em treino e teste</h4>

In [6]:
# Primeiro vamos escalonar o conjunto de dados
numeric_columns = df.columns.difference(categorical_columns)

scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])


In [11]:
df.head()

Unnamed: 0,credit card info save,push status,add to wishlist,desktop sessions,app sessions,desktop transactions,total product detail views,session duration,promotion clicks,avg order value,sale product views,discount rate per visited products,product detail view per app session,app transactions,add to cart per session,customer service calls,churn
0,0,1,1.234883,1.56455,1.557193,-0.020265,0.476643,-0.079666,-0.658138,0.866743,-0.465494,0.866029,-0.085008,-0.601195,-0.08569,-0.427932,-0.411672
1,0,1,1.307948,-0.32713,-0.384977,-0.020265,1.124503,-0.099387,0.145038,1.058571,0.147825,1.05939,1.240482,-0.601195,1.241169,-0.427932,-0.411672
2,0,0,-0.59176,1.160502,1.1256,-1.639456,0.675985,-1.57848,0.496427,-0.756869,0.198935,-0.755571,0.703121,0.211534,0.697156,-1.188218,-0.411672
3,1,0,-0.59176,2.188988,2.204583,-2.796021,-1.466936,-2.742033,-0.607939,-0.078551,-0.567714,-0.078806,-1.303026,1.024263,-1.306401,0.332354,-0.411672
4,1,0,-0.59176,-0.235301,-0.277078,-0.945517,0.626149,-1.046007,1.098809,-0.276311,1.067803,-0.276562,-0.049184,-0.601195,-0.045885,1.092641,-0.411672


In [12]:
# Como a coluna churn não está mais com 0 e 1 (binária) vamos refazer o label encoding só para ela
label_encoder = LabelEncoder()
df['churn'] = label_encoder.fit_transform(df['churn'])

In [14]:
# Selecionando as variáveis para Treino e teste
X = df.drop('churn', axis=1)
y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# <h4>3. Segmentação dos Clientes</h4>

1. Criação dos modelos

In [15]:
# K-Means
k = 3

kmeans = KMeans(n_clusters=k, random_state=42)

# XGBoost
xgb_model = XGBClassifier()

# Redes Neurais
rn_model = Sequential()
rn_model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
rn_model.add(Dense(32, activation='relu'))
rn_model.add(Dense(16, activation='relu'))
rn_model.add(Dense(1, activation='sigmoid'))   
rn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])       

2. Treinamento dos modelos

In [19]:
kmeans.fit(X_train, y_train)
#xgb_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
rn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f73d095ff0>

# <h4>4. Previsão</h4>

In [21]:
kmeans_pred = kmeans.predict(X_test)
xgb_pred = xgb_model.predict(X_test)
rn_pred = rn_model.predict(X_test)

# <h4>5. Avaliação dos modelos</h4>

In [25]:
# KMeans
report = classification_report(y_test, kmeans_pred)
print('Relatório de Classificação (KMeans:')
print(report)

Relatório de Classificação (KMeans:
              precision    recall  f1-score   support

           0       0.89      0.36      0.51       566
           1       0.20      0.44      0.27       101
           2       0.00      0.00      0.00         0

    accuracy                           0.37       667
   macro avg       0.36      0.26      0.26       667
weighted avg       0.79      0.37      0.47       667



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
print(f'Relatório de Classificação (XGBoost): {classification_report(y_test, xgb_pred)}')

Relatório de Classificação (XGBoost):               precision    recall  f1-score   support

           0       0.96      0.99      0.97       566
           1       0.92      0.77      0.84       101

    accuracy                           0.96       667
   macro avg       0.94      0.88      0.91       667
weighted avg       0.95      0.96      0.95       667



In [32]:
# Rede Neural
# Aplicar um limiar de decisão (por exemplo, 0.5) para transformar previsões contínuas em binárias
threshold = 0.5
rn_pred_binary = np.where(rn_pred > threshold, 1, 0)

print(f'Matriz de Confusão:\n{confusion_matrix(y_test, rn_pred_binary)}')

Matriz de Confusão:
[[560   6]
 [ 33  68]]


# <h4>6. Validação Cruzada</h4>

In [46]:
# XGBoost
scores = cross_val_score(xgb_model, X, y, cv=5, scoring='accuracy')

#Imprima as pontuações médias
print("Pontuações de validação cruzada:", scores)
print(f"Precisão média: {scores.mean() * 100:.2f}%")

Pontuações de validação cruzada: [0.94602699 0.95352324 0.96101949 0.95945946 0.95945946]
Precisão média: 95.59%


In [56]:
# Rede Neural

# Função para criar e treinar o modelo de rede neural
def train_neural_network(X_train, y_train, X_test, y_test):
    model = Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
    
    # Avalie o modelo no conjunto de teste
    accuracy = model.evaluate(X_test, y_test, verbose=0)[1]
    
    return accuracy

# Crie os k-folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Inicialize uma lista para armazenar as pontuações de validação cruzada
cv_scores = []

# Divida os dados em conjuntos de treinamento e teste para cada fold
for train_idx, test_idx in kfold.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    # Treine e avalie o modelo dentro do loop de validação cruzada
    accuracy = train_neural_network(X_train, y_train, X_test, y_test)
    
    cv_scores.append(accuracy)

# Converta as pontuações em um array numpy
cv_scores = np.array(cv_scores)

# Imprima as pontuações
print("Pontuações de validação cruzada:", cv_scores)
print(f"Precisão média: {cv_scores.mean() * 100:.2f}%")

Pontuações de validação cruzada: [0.92803597 0.94602698 0.91604197 0.9099099  0.91591591]
Precisão média: 92.32%


Pelos resultados vemos que o modelo XGBoots teve o melhor desempenho (95,59%), contra 92,32% da Rede Neural e 37% do KMeans.