# 📉 Customer Churn Prediction Project

### 🎯 Objective
Build a classification model to predict whether a customer will churn.

In [None]:
# 📦 Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")

In [None]:
# 📥 Load Dataset
# Read the customer CSV export into a DataFrame, then preview its first rows.
df = pd.read_csv(filepath_or_buffer="Customer_data - customer_data.csv")
df.head(n=5)

## 🔍 Data Exploration

In [None]:
# Print column dtypes and non-null counts, then summarise every column
# (include="all" covers non-numeric columns as well as numeric ones).
df.info()
df.describe(include="all")

In [None]:
# Count the missing entries in each column
df.isna().sum()

## 🧹 Data Preprocessing

In [None]:
# Clean the raw table in one chain:
#   1. coerce TotalCharges to a numeric dtype (unparseable entries become NaN),
#   2. discard every row that contains a missing value,
#   3. remove the customerID column.
df = (
    df.assign(TotalCharges=pd.to_numeric(df["TotalCharges"], errors="coerce"))
    .dropna()
    .drop(columns="customerID")
)

In [None]:
# Encode binary categorical features
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
yes_no_codes = {'Yes': 1, 'No': 0}
for col in binary_cols:
    encoded = df[col].map(yes_no_codes)
    # Series.map yields NaN for any value absent from the mapping (including
    # already-encoded 0/1 values when this cell is re-run). Fail loudly here
    # instead of letting NaNs flow silently into the features and target.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, col].astype(str).unique())
        raise ValueError(
            f"Column {col!r} contains values other than 'Yes'/'No': {unexpected}"
        )
    df[col] = encoded

# One-hot encode remaining categoricals
# (drop_first=True drops one dummy level per categorical column)
df = pd.get_dummies(df, drop_first=True)

## ✂️ Feature Selection & Splitting

In [None]:
# Define features and target
X = df.drop(columns='Churn')
y = df['Churn']

# Train-test split: hold out 20% of the rows with a fixed seed.
# stratify=y keeps the churn / non-churn ratio the same in both partitions,
# so the evaluation is not skewed by an unlucky draw of the (presumably
# imbalanced) target classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## 🤖 Model Training

In [None]:
# Logistic Regression
# The features are not scaled, so the solver can easily exhaust the default
# limit of 100 iterations before converging; the resulting ConvergenceWarning
# would be hidden by the global warnings filter set at the top of the file.
# Allow more iterations so the fitted coefficients are actually converged.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [None]:
# Random Forest Classifier
# Seed the forest (same seed as the train/test split) so that the metrics and
# the feature importances are reproducible from run to run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

## 📊 Evaluation

In [None]:
def evaluate_model(y_true, y_pred, model_name):
    """Print a classification report and plot the confusion matrix.

    Args:
        y_true: Ground-truth labels, passed to the sklearn metric functions.
        y_pred: Predicted labels to compare against ``y_true``.
        model_name: Label used in the printed header and the plot title.
    """
    header = f"\n📈 {model_name} Evaluation Report:"
    print(header)
    print(classification_report(y_true, y_pred))

    # Draw the confusion matrix as an annotated heatmap with integer counts.
    matrix = confusion_matrix(y_true, y_pred)
    axes = sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    axes.set_title(f"{model_name} - Confusion Matrix")
    axes.set_xlabel("Predicted")
    axes.set_ylabel("Actual")
    plt.show()

# Report both classifiers against the same held-out test labels.
evaluate_model(y_true=y_test, y_pred=y_pred_lr, model_name="Logistic Regression")
evaluate_model(y_true=y_test, y_pred=y_pred_rf, model_name="Random Forest")

## 💡 Insights

In [None]:
# Feature Importance from Random Forest
# Label each importance score with its feature column name, then plot the
# ten largest scores as a horizontal bar chart.
importances = pd.Series(data=rf.feature_importances_, index=X.columns)
importances.nlargest(n=10).plot.barh()
plt.title("Top 10 Important Features")
plt.show()

### ✅ Conclusion
- Built Logistic Regression and Random Forest classifiers to predict customer churn.
- Identified key churn drivers such as tenure, contract type, and monthly charges.
- Random Forest provided better performance.