# In [None]:  (Jupyter cell prompt left over from the notebook export)
# ============================================
# DATA MINING & BUSINESS INTELLIGENCE
# TELCO CUSTOMER CHURN PREDICTION
# ============================================

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns

# ============================================
# PART A: DATA LOADING & PREPROCESSING
# ============================================

# 1. Load dataset
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Display first 10 rows
print("First 10 rows:")
print(df.head(10))

# 2. Inspect datatypes and missing values
print("\nDataset Info:")
df.info()

print("\nMissing Values:")
print(df.isnull().sum())

# Convert TotalCharges to numeric and handle missing values.
# errors='coerce' turns unparseable entries into NaN, which are then replaced
# by the column median. The result is assigned back to the column rather than
# using fillna(inplace=True) on the selection: that chained in-place form is
# deprecated in pandas 2.x and does not modify the DataFrame when
# Copy-on-Write is enabled.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# 5. Convert churn to binary
# Done before one-hot encoding so the target stays a single numeric column
# named 'Churn' instead of being turned into a dummy column.
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

# 3. Encode categorical variables
# customerID is excluded from the encoded frame: it is an identifier, not a
# feature, and one-hot encoding it would add about one dummy column per
# customer. It remains available in `df` for reporting.
# errors='ignore' keeps this working if the column is absent.
df_encoded = pd.get_dummies(
    df.drop(columns=['customerID'], errors='ignore'),
    drop_first=True,
)

# 4. Standardize numerical variables
# NOTE(review): the scaler is fitted on the full dataset, i.e. before any
# train/test split performed later; fit it on the training rows only if a
# strictly leak-free evaluation is required.
scaler = StandardScaler()
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])

print("\nPreprocessed dataset shape:", df_encoded.shape)

# ============================================
# PART B: EXPLORATORY DATA ANALYSIS
# ============================================

# 1. Summary statistics for the numeric columns of the un-encoded frame
print("\nSummary Statistics:")
_summary_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
print(df[_summary_columns].describe())

# 2. Histograms: one 30-bin figure per column, same layout for each
for _column, _label in (('tenure', "Tenure"), ('MonthlyCharges', "Monthly Charges")):
    plt.figure()
    plt.hist(df[_column], bins=30)
    plt.title(f"{_label} Distribution")
    plt.xlabel(_label)
    plt.ylabel("Frequency")
    plt.show()

# 3. Churn rate by contract type
# normalize='index' makes every contract-type row sum to 1, so the bars show
# proportions rather than raw counts.
contract_churn = pd.crosstab(
    index=df['Contract'],
    columns=df['Churn'],
    normalize='index',
)
contract_churn.plot.bar()
plt.title("Churn Rate by Contract Type")
plt.ylabel("Proportion")
plt.show()

# 4. Correlation heatmap of the numeric features together with the target
plt.figure()
_heatmap_columns = [*num_cols, 'Churn']
sns.heatmap(df_encoded[_heatmap_columns].corr(), annot=True)
plt.title("Correlation Heatmap")
plt.show()

# ============================================
# PART D: MODELING & EVALUATION
# ============================================

# Separate the feature matrix from the binary churn target.
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']

# Hold out 20% of the rows for testing. stratify=y keeps the proportion of
# churners the same in the training and test partitions, so the reported
# precision/recall are not skewed by an unlucky split of the classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Logistic Regression
# max_iter is raised to 1000 to give the solver more iterations to converge.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Random Forest
# random_state is fixed so repeated runs build the same forest.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluation function
def evaluate_model(y_true, y_pred, model_name):
    """Print classification metrics for one model's predictions.

    Prints a header containing ``model_name``, then accuracy, precision,
    recall and F1 score, and finally the confusion matrix, all computed
    from ``y_true`` and ``y_pred``. Nothing is returned.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        model_name: Name shown in the printed header.
    """
    print(f"\n{model_name} Results")

    # Each scalar metric shares the (y_true, y_pred) call signature, so they
    # are reported from a single table in a fixed order.
    scalar_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for label, metric in scalar_metrics:
        print(label, metric(y_true, y_pred))

    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

# Evaluate models on the held-out test labels, one report per model
for _model_name, _predictions in (
    ("Logistic Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
):
    evaluate_model(y_test, _predictions, _model_name)

# Feature importance
# Label the forest's importances with the feature names, then keep the ten
# largest in descending order.
_all_importances = pd.Series(rf.feature_importances_, index=X.columns)
feature_importance = _all_importances.sort_values(ascending=False).head(10)

print("\nTop 10 Features Influencing Churn:")
print(feature_importance)

plt.figure()
feature_importance.plot.bar()
plt.title("Top Features Influencing Churn")
plt.show()

# ============================================
# PART E: DASHBOARD & INSIGHTS
# ============================================

# 1. Churn distribution: share of each Churn value as a pie chart
plt.figure()
_churn_counts = df['Churn'].value_counts()
_churn_counts.plot.pie(autopct='%1.1f%%')
plt.title("Churn Distribution")
plt.ylabel("")  # blank out the y-axis label on the pie chart
plt.show()

# 2. Churn by contract type (reuses the normalized crosstab computed earlier)
contract_churn.plot.bar()
plt.title("Churn by Contract Type")
plt.show()

# 3. Scatter plot: Tenure vs Monthly Charges
plt.figure()
plt.scatter(x=df['tenure'], y=df['MonthlyCharges'])
plt.title("Tenure vs Monthly Charges")
plt.xlabel("Tenure")
plt.ylabel("Monthly Charges")
plt.show()

# 4. High-risk customer identification
# Store the forest's hard 0/1 prediction and, alongside it, the predicted
# probability of the churn class so that flagged customers can be ranked.
# The probability column is looked up via rf.classes_ rather than assumed to
# be at a fixed position.
_churn_class_index = list(rf.classes_).index(1)
df['Churn_Prediction_RF'] = rf.predict(X)
df['Churn_Probability_RF'] = rf.predict_proba(X)[:, _churn_class_index]

# Show the ten flagged customers with the highest predicted churn
# probability instead of the first ten flagged rows in file order.
# kind='stable' keeps the original row order among equal probabilities.
# NOTE(review): X contains the rows the forest was trained on, so the
# probabilities for those rows are optimistic.
high_risk_customers = (
    df[df['Churn_Prediction_RF'] == 1]
    .sort_values('Churn_Probability_RF', ascending=False, kind='stable')
    .head(10)
)

print("\nSample High-Risk Customers:")
print(high_risk_customers)
