# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Load Dataset

In [None]:
# Read the Telco customer-churn CSV (path is relative to the working directory)
csv_path = '../../datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)

# Confirm the load and report the (rows, columns) shape
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")

# Preview the first rows (rendered as the cell output)
df.head()

# Data Cleaning

In [None]:
# Check for missing values.
# NOTE: this report runs before TotalCharges is parsed below, so entries that
# are only turned into NaN by errors='coerce' are not counted here.
print("Missing values:")
print(df.isnull().sum())

# Convert TotalCharges to numeric; values that cannot be parsed become NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Fill missing TotalCharges with 0 (assuming new customers).
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: that chained in-place form acts on an intermediate object,
# emits a FutureWarning on pandas 2.x and does not update df under Copy-on-Write.
df['TotalCharges'] = df['TotalCharges'].fillna(0)

# Remove duplicates if any
df.drop_duplicates(inplace=True)

# Check data types
print("\nData types:")
print(df.dtypes)

# Basic statistics (rendered as the cell output)
df.describe()

# Exploratory Data Analysis

In [None]:
# Percentage of customers whose Churn value is 'Yes'
churn_rate = 100 * df['Churn'].value_counts(normalize=True).loc['Yes']
print(f"Overall churn rate: {churn_rate:.2f}%")

# Churn counts split by gender, contract type and internet service:
# one count plot per (column, title) pair, shown in that order
for column, plot_title in [
    ('gender', 'Churn by Gender'),
    ('Contract', 'Churn by Contract Type'),
    ('InternetService', 'Churn by Internet Service'),
]:
    sns.countplot(data=df, x=column, hue='Churn')
    plt.title(plot_title)
    plt.show()

# Pairwise correlation of the numeric columns, drawn as an annotated heatmap
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
corr = df.loc[:, numeric_cols].corr()
sns.heatmap(corr, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap')
plt.show()

# Feature Engineering

In [None]:
# Create tenure buckets.
# pd.cut intervals are right-closed by default, so with a lowest edge of 0 a
# tenure of exactly 0 would fall outside every bin and become NaN;
# include_lowest=True keeps those rows in the first bucket.
# The open-ended upper edge keeps the bins strictly increasing whatever the
# maximum tenure in the data is.
tenure_labels = ['0-3 months', '4-6 months', '7-12 months', '12+ months']
df['tenure_bucket'] = pd.cut(
    df['tenure'],
    bins=[0, 3, 6, 12, float('inf')],
    labels=tenure_labels,
    include_lowest=True,
)

# Columns to encode as integers for the model
nominal_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
ordinal_cols = ['tenure_bucket']
categorical_cols = nominal_cols + ordinal_cols

# Encode a copy so that df keeps its original category labels for the plots
# and group-bys that follow, and so that re-running this cell is harmless.
encoded = df.copy()

# Encode categorical variables
le = LabelEncoder()
for col in nominal_cols:
    encoded[col] = le.fit_transform(encoded[col])

# LabelEncoder orders labels as strings, which would place '12+ months'
# between '0-3 months' and '4-6 months'. The category codes follow the bucket
# order given to pd.cut instead (0 = shortest tenure).
for col in ordinal_cols:
    encoded[col] = encoded[col].cat.codes

# Prepare features and target.
# Drop the ID and the target. TotalCharges is dropped as well
# NOTE(review): presumably because it is close to tenure x MonthlyCharges and
# therefore largely redundant -- confirm the intent.
X = encoded.drop(['customerID', 'Churn', 'TotalCharges'], axis=1)
y = df['Churn'].map({'Yes': 1, 'No': 0})

# Split data; stratify so train and test keep the same churn proportion
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Churn Prediction Model

In [None]:
# Fit a logistic regression classifier on the training split
# (fit returns the fitted estimator, so it can be bound directly)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out split: second column of predict_proba, then hard labels
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Model Evaluation

In [None]:
# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix (rows are actual classes, columns are predicted classes)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC-AUC score, computed from the predicted probabilities
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC Score: {roc_auc:.2f}")

# Feature importance (coefficients for Logistic Regression).
# Rank by absolute value: a large negative coefficient matters as much as a
# large positive one, and sorting on the signed value would leave those
# features out of the top 10. The sign is kept in the 'importance' column.
# NOTE: the features are not standardised, so coefficient magnitudes also
# reflect each feature's scale; treat the ranking as a rough guide.
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': model.coef_[0]})
feature_importance = feature_importance.sort_values(
    'importance', key=lambda s: s.abs(), ascending=False
)
print("Top 10 Feature Importances:")
print(feature_importance.head(10))

# Visualizations

In [None]:
# Stacked histogram of tenure, coloured by churn status
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='tenure', hue='Churn', bins=30, multiple='stack', ax=ax)
ax.set_title('Churn Distribution by Tenure')
ax.set_xlabel('Tenure (months)')
ax.set_ylabel('Count')
plt.show()

# Box plot of monthly charges for each churn status
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Churn', y='MonthlyCharges', ax=ax)
ax.set_title('Monthly Charges by Churn Status')
plt.show()

# Share of each Churn value within every contract type, as stacked bars
churn_share = df.groupby('Contract')['Churn'].value_counts(normalize=True)
contract_churn = churn_share.unstack()
ax = contract_churn.plot.bar(stacked=True)
ax.set_title('Churn by Contract Type')
ax.set_ylabel('Proportion')
plt.show()

# Insights and Recommendations

## Key Findings
- Customers with month-to-month contracts are more likely to churn compared to those with longer-term contracts.
- Higher monthly charges correlate with higher churn rates, which suggests price sensitivity (this is a correlation, not proof that price causes churn).
- Customers early in their tenure (0-3 months) have the highest churn rate.

## Recommendations
- Encourage customers to switch to longer-term contracts with incentives like discounts.
- Review pricing strategy to ensure competitiveness, especially for high-charge plans.
- Implement targeted onboarding and support for new customers to improve retention in the first few months.