# In [None]:
# churn_analysis.ipynb


# 1️⃣ Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# 2️⃣ Load dataset
# NOTE(review): the path ends in a doubled ".csv" extension — confirm it
# matches the file name on disk.
raw_path = "../data/churn.csv.csv"
df = pd.read_csv(raw_path)

# 3️⃣ Show the header exactly as it was read from the file
print("Original columns:", list(df.columns))

# 4️⃣ Normalise the header: trim surrounding whitespace, then lowercase, so the
# later name checks ('churn', 'customerid') ignore case and padding
trimmed_names = df.columns.str.strip()
df.columns = trimmed_names.str.lower()
print("Cleaned columns:", list(df.columns))

# 5️⃣ Detect target column
# Any column whose name contains 'churn' is a candidate; the first one in
# column order becomes the target.
target_candidates = [name for name in df.columns if 'churn' in name]
if not target_candidates:
    raise ValueError("No column containing 'churn' found. Check your dataset.")
target_col = target_candidates[0]
print(f"Using '{target_col}' as target column")

# 6️⃣ Basic data check
# Preview the first rows to eyeball the parsed values.
print(df.head())
# DataFrame.info() writes its summary straight to stdout and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
df.info()
# Number of missing values per column.
print(df.isnull().sum())

# 7️⃣ Target distribution plot
# Bar chart of how many rows fall into each value of the target column
ax = sns.countplot(data=df, x=target_col)
ax.set_title("Churn Distribution")
plt.show()

# 8️⃣ Correlation heatmap (numeric columns)
# Pairwise correlation is computed over the numeric columns only. 'number'
# selects every numeric dtype rather than just int64/float64, so narrower
# integer/float columns are not silently left out of the heatmap.
numeric_df = df.select_dtypes(include='number')
if numeric_df.shape[1] == 0:
    # Nothing to correlate (e.g. every column was parsed as text): skip the
    # plot instead of attempting to draw an empty matrix.
    print("No numeric columns found; skipping correlation heatmap.")
else:
    plt.figure(figsize=(12, 8))
    sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
    plt.title("Feature Correlation Heatmap")
    plt.show()

# 9️⃣ Preprocessing
# Remove the customer identifier when the dataset has one (presumably a
# per-row key with no predictive value — confirm against the data)
if 'customerid' in df.columns:
    df = df.drop(columns=['customerid'])

# Replace every object-dtype (text) column with integer codes. This includes
# the target column when it is stored as text. The single encoder is re-fitted
# for each column, so afterwards it only holds the classes of the last column
# processed.
le = LabelEncoder()
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    codes = le.fit_transform(df[col])
    df[col] = codes

# Detect target column: the first column whose name contains 'churn'
target_candidates = [col for col in df.columns if 'churn' in col]
if not target_candidates:
    raise ValueError("No column containing 'churn' found. Check your dataset.")
target_col = target_candidates[0]
y = df[target_col]

# Map the two target values to 0 and 1. The values are sorted first so the
# assignment is deterministic (smaller value -> 0, larger value -> 1) instead
# of depending on which class happens to appear first in the data, which could
# silently invert labels that are already encoded as 0/1.
unique_vals = y.unique()
if len(unique_vals) != 2:
    raise ValueError(f"Target column '{target_col}' is not binary. Unique values: {unique_vals}")
negative_val, positive_val = sorted(unique_vals)
mapping = {negative_val: 0, positive_val: 1}
y = y.map(mapping)
print(f"Target column '{target_col}' mapped to binary: {y.unique()}")


# 1️⃣0️⃣ Train-test split
# Features are every column except the target. y keeps the 0/1 mapping built
# above; re-reading df[target_col] here would discard that mapping.
X = df.drop(target_col, axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


# 1️⃣1️⃣ Train Random Forest model
# Fit a 100-tree random forest on the training split (seeded for
# reproducibility)
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Held-out predictions: hard class labels, plus the probability assigned to
# the second class (column 1 of predict_proba) for the ROC-AUC computation
y_pred = model.predict(X_test)
class_probs = model.predict_proba(X_test)
y_proba = class_probs[:, 1]

# 1️⃣2️⃣ Evaluation
# Per-class precision / recall / F1 on the held-out rows
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)

# Ranking quality of the predicted probabilities
auc_value = roc_auc_score(y_test, y_proba)
print("ROC-AUC Score:", auc_value)

# Confusion Matrix, drawn as an annotated heatmap of integer counts
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_title("Confusion Matrix")
plt.show()

# 1️⃣3️⃣ Feature Importance
# Pair each importance score from the fitted forest with its feature name
importances = model.feature_importances_
features = X.columns
feat_importances = pd.Series(data=importances, index=features)

# Sort ascending so the most important feature ends up at the top of the
# horizontal bar chart
ranked = feat_importances.sort_values()
ax = ranked.plot(kind='barh', figsize=(10, 8))
ax.set_title("Feature Importance")
plt.show()
