Data Collection

In [None]:
import pandas as pd
# Load the churn dataset from the CSV file in the working directory.
df = pd.read_csv('Telco-Customer-Churn.csv')

# Quick look at the first rows.
print(df.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()

Feature Engineering

In [None]:
# Clean TotalCharges: any entry that cannot be parsed as a number becomes NaN
# (errors='coerce'), and those gaps are then imputed with the column median.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Assign the filled column back explicitly. Calling fillna(..., inplace=True)
# on a column selection operates on an intermediate object: it is deprecated
# in pandas 2.x and does not update `df` when Copy-on-Write is enabled.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Collapse the 'No internet service' / 'No phone service' labels into a plain
# 'No' for the listed service columns.
columns_to_clean = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                    'TechSupport', 'StreamingTV', 'StreamingMovies']
df[columns_to_clean] = df[columns_to_clean].replace(
    {'No internet service': 'No', 'No phone service': 'No'}
)

# Feature: Average Monthly Spend
# A tenure of 0 is replaced by 1 in the divisor to avoid division by zero, so
# for those rows AvgMonthlySpend equals TotalCharges.
df['AvgMonthlySpend'] = df['TotalCharges'] / df['tenure'].replace(0, 1)

# Feature: Tenure Grouping
def tenure_group(tenure):
    """Bucket a tenure value into a coarse label.

    Returns 'New' when tenure <= 6, 'Mid-term' when tenure <= 24, and
    'Long-term' for anything else.
    """
    # Thresholds are checked in ascending order; the first one that the
    # value does not exceed decides the label.
    for upper_bound, label in ((6, 'New'), (24, 'Mid-term')):
        if tenure <= upper_bound:
            return label
    return 'Long-term'
# Label every row with its tenure bucket.
df['TenureGroup'] = df['tenure'].map(tenure_group)

# Encoding Binary Variables
# One shared lookup covers both the Yes/No columns and gender; any value that
# is not a key of the lookup ends up as NaN after .map().
binary_encoding = {'Yes': 1, 'No': 0, 'Male': 0, 'Female': 1}
binary_cols = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn',
    'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
]
for col in binary_cols:
    df[col] = df[col].map(binary_encoding)

# One-Hot Encoding
# drop_first=True omits the first level of each categorical column.
categorical_cols = ['InternetService', 'Contract', 'PaymentMethod', 'TenureGroup']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Drop Irrelevant Column
df = df.drop(columns='customerID')

# Scale Important Features
from sklearn.preprocessing import StandardScaler
# Standardise the continuous columns with StandardScaler.
# NOTE(review): the scaler is fitted on every row of `df`; if the data is split
# into train and test sets later on, the test rows influence the scaling
# statistics. Consider fitting on the training rows only.
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlySpend']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_values

print("Feature Engineering Complete")


Model Selection (Train/Test Split)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature matrix.
y = df['Churn']
X = df.drop(columns='Churn')

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


# Fit both classifiers on the training partition. `fit` returns the estimator
# itself, so construction and fitting are chained in one expression.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

print("Models Trained Successfully")


Model Evaluation

In [None]:
from sklearn.metrics import classification_report, accuracy_score


# Predict on the held-out partition with each fitted model.
y_pred_lr = lr.predict(X_test)
y_pred_rf = rf.predict(X_test)

# Print accuracy and the per-class report for each model, in the same order
# and with the same headings.
for heading, predictions in (
    ("\nLogistic Regression Performance:", y_pred_lr),
    ("\nRandom Forest Performance:", y_pred_rf),
):
    print(heading)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))


Churn Prediction and Action

In [None]:

# Take the second column of the random forest's predict_proba output for each
# test row.
churn_probabilities = rf.predict_proba(X_test)[:, 1]

# Build a results table: the test features plus the true label and the
# predicted probability (assign returns a new frame, X_test is untouched).
results = X_test.assign(
    Actual_Churn=y_test.values,
    Predicted_Probability=churn_probabilities,
)

# Keep the rows whose predicted probability exceeds 0.6.
is_high_risk = results['Predicted_Probability'] > 0.6
high_risk_customers = results.loc[is_high_risk]
print("High Risk Customers Detected:\n", high_risk_customers.head())


Export the churn predictions to a CSV file

In [None]:
# Write the results table to disk without the row index.
# NOTE(review): the row index is not written, so the exported rows carry no
# identifier of their own — confirm downstream consumers do not need one.
results.to_csv(path_or_buf='churn_predictions.csv', index=False)

Feature Importance for Power BI Visualization

In [13]:
import pandas as pd
import matplotlib.pyplot as plt

# Pair each feature column with the importance reported by the random forest.
feature_names = X.columns
importances = rf.feature_importances_
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

# Export the table ordered from most to least important; importance_df itself
# keeps the original column order.
ranked_importance = importance_df.sort_values(by='Importance', ascending=False)
ranked_importance.to_csv('feature_importance.csv', index=False)
