# 📊 Projeto de Previsão de Churn - Telco Customer Churn Dataset
Este notebook realiza uma análise exploratória e previsão de churn usando dados reais de clientes de telecomunicações. Inclui limpeza, modelagem e visualização.

In [None]:
# Importação das bibliotecas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

# Ask the user to upload the dataset through the Google Colab file picker.
# `uploaded` holds whatever `files.upload()` returns for the chosen file(s).
from google.colab import files
uploaded = files.upload()

# Read the uploaded CSV from the working directory into the main DataFrame,
# report its dimensions, and preview the first rows as the cell output.
df = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-Telco-Customer-Churn.csv')
print("Shape do dataset:", df.shape)
df.head(n=5)


## 🔍 Análise Exploratória (EDA)

In [None]:
# Structural overview. DataFrame.info() writes its report to stdout and
# returns None, so it is called directly instead of being wrapped in print()
# (which would emit a stray "None" line after the report).
df.info()
print(df.describe())

# Class balance of the target, as proportions.
print("\nTaxa de Churn:")
print(df['Churn'].value_counts(normalize=True))

plt.figure(figsize=(6,4))
sns.countplot(x='Churn', data=df)
plt.title('Distribuição de Churn')
plt.show()

# Coerce TotalCharges to numeric so it can be plotted; values that cannot be
# parsed become NaN. NOTE: this overwrites the column in `df` (it is not a
# temporary copy), and it runs after describe() above, so describe() still
# sees the column with its original dtype.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# One histogram (with KDE overlay) per numeric feature; NaNs are dropped
# before plotting.
for col in ['tenure', 'MonthlyCharges', 'TotalCharges']:
    plt.figure()
    sns.histplot(df[col].dropna(), kde=True)
    plt.title(f'Distribuição de {col}')
    plt.show()

# Churn counts split by contract type. An explicit figure is opened, as for
# the other plots in this cell, so the chart never lands on a leftover figure.
plt.figure()
sns.countplot(x='Contract', hue='Churn', data=df)
plt.title('Churn por Tipo de Contrato')
plt.show()


## 🧹 Limpeza e Preparação dos Dados

In [None]:
# Drop the customer identifier column; it is not used as a model feature.
df = df.drop(columns='customerID')

# Coerce TotalCharges to numeric (unparseable values become NaN) and fill the
# gaps with the column median. The result is assigned back to the column:
# `df[col].fillna(..., inplace=True)` is chained in-place assignment, which
# raises a FutureWarning in pandas 2.x and does not modify `df` at all when
# Copy-on-Write is enabled.
# NOTE: the median is computed on the full dataset, i.e. before the
# train/test split performed later in the notebook.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Encode the target as integers.
df['Churn'] = LabelEncoder().fit_transform(df['Churn'])

# Snapshot the remaining object-typed columns up front so that the frame is
# not rebound while its own column index is being iterated.
object_cols = [col for col in df.columns if df[col].dtype == 'object']
binary_cols = [col for col in object_cols if df[col].nunique() == 2]
multi_cols = [col for col in object_cols if col not in binary_cols]

# Columns with exactly two distinct values: integer label encoding in place.
for col in binary_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# Every other object column: one-hot encode, dropping the first level of
# each. A single get_dummies call yields the same columns, in the same order,
# as encoding them one at a time, without copying the frame per column.
if multi_cols:
    df = pd.get_dummies(df, columns=multi_cols, drop_first=True)


## 🤖 Treinamento dos Modelos

In [None]:
# Separate the target column from the predictor matrix.
y = df['Churn']
X = df.drop(columns='Churn')

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise the continuous features. The scaler is fitted on the training
# rows only and then applied to both partitions.
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler().fit(X_train[numeric_features])
X_train[numeric_features] = scaler.transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# Logistic regression, with a raised iteration cap.
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)

# Random forest with 200 trees and a fixed seed.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


## 📈 Avaliação e Resultados

In [None]:
# Positive-class probabilities on the test set, computed once per model and
# reused below (the forest's probabilities feed the AUC, the ROC curve and
# the legend label).
log_proba = log_model.predict_proba(X_test)[:, 1]
rf_proba = rf_model.predict_proba(X_test)[:, 1]
rf_auc = roc_auc_score(y_test, rf_proba)

print("=== Regressão Logística ===")
print(classification_report(y_test, y_pred_log))
print("AUC:", roc_auc_score(y_test, log_proba))

print("\n=== Random Forest ===")
print(classification_report(y_test, y_pred_rf))
print("AUC:", rf_auc)

# Side by side: confusion matrix (left) and ROC curve (right) for the forest.
fig, ax = plt.subplots(1, 2, figsize=(12,5))
sns.heatmap(confusion_matrix(y_test, y_pred_rf), annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Matriz de Confusão - Random Forest')

fpr, tpr, _ = roc_curve(y_test, rf_proba)
ax[1].plot(fpr, tpr, label=f'ROC (AUC = {rf_auc:.3f})')
ax[1].plot([0,1],[0,1],'--')  # diagonal reference line
ax[1].legend()
plt.show()

# Ten largest random-forest feature importances, drawn on a fresh figure so
# the bars can never be plotted onto the ROC axes above.
importances = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure()
importances.head(10).plot(kind='barh')
plt.title('Top 10 Variáveis mais Importantes - Random Forest')
plt.gca().invert_yaxis()  # largest importance at the top
plt.show()
