In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

# Carrega os dados que geramos
df = pd.read_csv('../data/customers.csv')

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         5000 non-null   object 
 1   TenureMonths       5000 non-null   int64  
 2   ContractType       5000 non-null   object 
 3   MonthlyCharges     5000 non-null   float64
 4   SupportTickets     5000 non-null   int64  
 5   FeatureUsageScore  5000 non-null   int64  
 6   TotalCharges       5000 non-null   float64
 7   Churn              5000 non-null   int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 312.6+ KB


Unnamed: 0,CustomerID,TenureMonths,ContractType,MonthlyCharges,SupportTickets,FeatureUsageScore,TotalCharges,Churn
0,CUST-0000,43,One year,102.791679,3,40,4852.709454,0
1,CUST-0001,66,Month-to-month,86.388938,0,55,6031.000057,0
2,CUST-0002,3,Month-to-month,108.995329,6,63,304.86323,0
3,CUST-0003,45,One year,51.087282,5,29,1849.294654,0
4,CUST-0004,17,One year,64.689449,0,88,1008.802445,0


In [2]:
from sklearn.model_selection import train_test_split

# Define a variável alvo (y) e as features (X)
target = 'Churn'
features = [col for col in df.columns if col not in [target, 'CustomerID']]

X = df[features]
y = df[target]

# Divide os dados em conjuntos de treino e teste
# Usamos random_state para garantir que a divisão seja sempre a mesma (reprodutibilidade)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Formato de X_train:", X_train.shape)
print("Formato de X_test:", X_test.shape)

Formato de X_train: (4000, 6)
Formato de X_test: (1000, 6)


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression # Ou qualquer outro modelo, como RandomForestClassifier

# Identifica as colunas numéricas e categóricas
# Baseado no seu df.info(), 'ContractType' é categórica. As outras são numéricas.
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Cria o transformador de pré-processamento
# Ele aplica transformações diferentes para colunas diferentes
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Cria o pipeline final que primeiro pré-processa os dados e depois treina o modelo
# Usaremos Regressão Logística como exemplo
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression(random_state=42))])

# Treina o pipeline completo com os dados de treino
print("Iniciando o treinamento do pipeline...")
pipeline.fit(X_train, y_train)
print("Treinamento concluído!")

# (Opcional) Avalia o modelo no conjunto de teste para ver a performance
accuracy = pipeline.score(X_test, y_test)
print(f"Acurácia no conjunto de teste: {accuracy:.4f}")

Iniciando o treinamento do pipeline...
Treinamento concluído!
Acurácia no conjunto de teste: 0.8930


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [4]:
import joblib
import os

# Define o caminho de destino para salvar o modelo
# O caminho é '../src/' para sair da pasta 'notebooks' e entrar na 'src'
output_dir = '../src'
model_path = os.path.join(output_dir, 'churn_model.joblib')

# Garante que o diretório de destino exista
os.makedirs(output_dir, exist_ok=True)

# Salva o objeto do pipeline no arquivo
print(f"Salvando o modelo treinado em: {model_path}")
joblib.dump(pipeline, model_path)
print("Modelo salvo com sucesso!")

Salvando o modelo treinado em: ../src/churn_model.joblib
Modelo salvo com sucesso!
