# 02: EDA, Cleaning, and Baseline Model (Decision Tree)

## Objective
Perform exploratory data analysis (EDA), clean the dataset, and establish a baseline performance benchmark with a Decision Tree classifier.

## Methodology
1. **Standardized Cleaning**: Apply `clean_data` from `src.preprocessing` to the raw dataset.
2. **Visual EDA**: Use the `src.eda` plotting helpers for the churn distribution, the correlation heatmap, and feature distributions by churn (e.g. `tenure_days`).
3. **Baseline Model**: Train a simple Decision Tree to set a performance benchmark.

In [None]:
# 1. Setup
!pip install -q pandas numpy scikit-learn matplotlib seaborn transformers torch

import pandas as pd
from src.preprocessing import clean_data
from src.eda import plot_churn_distribution, plot_correlation_heatmap, plot_distribution_by_churn

# Load the raw dataset. The path is relative, so the CSV must sit in the
# notebook's working directory.
df_raw = pd.read_csv("ecommerce_churn_llm_final.csv")
# Apply the project's cleaning routine (src.preprocessing.clean_data).
# `df` is the frame that the EDA and modelling cells below operate on.
df = clean_data(df_raw)
# Kept as the last expression of the cell so the notebook renders a preview
# of the first rows.
df.head()

# 2. Exploratory Data Analysis

In [None]:
# Target distribution first: how balanced `churn` is affects how the
# baseline's metrics further down should be read.
plot_churn_distribution(df)
# Correlation heatmap of the cleaned frame; which columns are included is
# decided inside the src.eda helper (not visible here).
plot_correlation_heatmap(df)
# Distribution of `tenure_days`, presumably split by churn status as the
# helper's name suggests. TODO confirm against src.eda.
plot_distribution_by_churn(df, "tenure_days")

# 3. Baseline Model: Decision Tree

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Build the feature matrix: drop the identifier, the target, and the free-text
# columns. errors="ignore" keeps the cell runnable when some of these columns
# are absent from the cleaned frame.
X = df.drop(
    columns=[
        "customer_id",
        "churn",
        "customer_feedback",
        "support_chat_excerpt",
        "reason_for_low_activity",
    ],
    errors="ignore",
)
y = df["churn"]

categorical_cols = X.select_dtypes(include=['object', 'string', 'category']).columns.tolist()
# 'number' does not match boolean columns. They are listed explicitly because
# ColumnTransformer drops every column it is not told about (remainder='drop'
# by default), which would silently remove boolean flags from the baseline.
numerical_cols = X.select_dtypes(include=['number', 'bool']).columns.tolist()

# Surface anything that is still left out (e.g. datetime columns) instead of
# letting it disappear unnoticed.
selected_cols = set(categorical_cols) | set(numerical_cols)
unused_cols = [col for col in X.columns if col not in selected_cols]
if unused_cols:
    print(f"Columns excluded from the baseline (unsupported dtype): {unused_cols}")

preprocessor = ColumnTransformer(
    transformers=[
        # Scaling does not change a tree's splits; it is kept so the same
        # preprocessor can be reused with scale-sensitive models.
        ('num', StandardScaler(), numerical_cols),
        # handle_unknown='ignore': categories seen only in the test split are
        # encoded as all zeros instead of raising at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Stratified 80/20 split so both partitions keep the same churn rate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Preprocessing lives inside the pipeline, so the scaler and encoder are fit
# on the training split only (no leakage from the test split).
dt_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(max_depth=5, random_state=42))
])

dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)

print(classification_report(y_test, y_pred))
# Column 1 of predict_proba is the probability of the second entry in
# classes_; this assumes a binary target whose positive class sorts last.
print(f"ROC AUC: {roc_auc_score(y_test, dt_model.predict_proba(X_test)[:, 1]):.3f}")