In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import joblib
import warnings
warnings.filterwarnings('ignore')

In [None]:
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"

# Load the churn CSV into `df`. A failed download is re-raised instead of
# being swallowed: every later cell needs `df`, so carrying on would only
# surface as an unrelated NameError a few lines further down.
try:
    df = pd.read_csv(url)
except Exception as exc:
    print(" Could not load from URL")
    raise RuntimeError(f"Failed to load dataset from {url}") from exc
else:
    print(f" Dataset loaded successfully: {df.shape[0]} rows, {df.shape[1]} columns")

print("\nFirst 3 rows:")
print(df.head(3))

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line.
df.info()


In [None]:
# Class balance of the target column: absolute counts first, then the
# percentage of rows labelled 'Yes'.
print("\nTarget Distribution:")
churn_counts = df['Churn'].value_counts()
print(churn_counts)

churn_share = df['Churn'].value_counts(normalize=True)
print(f"Churn Rate: {churn_share['Yes']*100:.2f}%")

In [None]:
# Drop customerID (presumably a per-row identifier rather than a feature —
# TODO confirm). errors='ignore' keeps the cell re-runnable: without it a
# second execution raises KeyError because the column is already gone.
df = df.drop(columns='customerID', errors='ignore')


In [None]:
# Report missing values: count nulls per column once, then keep only the
# columns that actually contain at least one null.
null_counts = df.isnull().sum()
columns_with_nulls = null_counts[null_counts > 0]
print(f"Missing values:\n{columns_with_nulls}")

In [None]:
# Convert TotalCharges to numeric. errors='coerce' turns entries that cannot
# be parsed as numbers into NaN, which are then filled with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the `df['TotalCharges']` selection is chained assignment, which pandas
# deprecates and which does not modify `df` under Copy-on-Write.
# NOTE(review): the median is computed over the full dataset, before the
# train/test split.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges.fillna(total_charges.median())

# Encode target variable as 1 (Yes) / 0 (No). Only map while the column still
# holds the raw labels; mapping an already-encoded column would turn every
# value into NaN when this cell is re-run.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})


In [None]:
# Split the columns by type: three explicitly listed numeric columns, and
# every remaining column except the target is treated as categorical.
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
non_categorical_columns = set(numeric_features) | {'Churn'}
categorical_features = [
    column for column in df.columns
    if column not in non_categorical_columns
]

In [None]:

# Summarise both feature lists; only the first five categorical names are shown.
numeric_count = len(numeric_features)
categorical_count = len(categorical_features)
categorical_preview = categorical_features[:5]

print(f"\nNumeric features ({numeric_count}): {numeric_features}")
print(f"Categorical features ({categorical_count}): {categorical_preview}...")

In [None]:
# Create new features
# ChargesPerMonth: total charges spread over tenure; the +1 avoids division
# by zero for rows with tenure == 0.
df['ChargesPerMonth'] = df['TotalCharges'] / (df['tenure'] + 1)
# IsNewCustomer: 1 when tenure is at most 12 (presumably months — TODO confirm).
df['IsNewCustomer'] = (df['tenure'] <= 12).astype(int)
# HasMultipleServices: 1 when the customer has both an internet service and
# a phone service.
df['HasMultipleServices'] = (
    (df['InternetService'] != 'No') & (df['PhoneService'] == 'Yes')
).astype(int)

engineered_features = ['ChargesPerMonth', 'IsNewCustomer', 'HasMultipleServices']

# Add new features to numeric list, skipping names that are already present.
# A plain extend() would append the same names again each time this cell is
# re-run, handing duplicate column names to the ColumnTransformer.
newly_registered = [
    name for name in engineered_features if name not in numeric_features
]
numeric_features.extend(newly_registered)

print(f" Created {len(engineered_features)} new features")
print(f"Total features: {len(numeric_features) + len(categorical_features)}")

In [None]:
# Separate the target column (y) from the predictor columns (X).
y = df['Churn']
X = df.drop(columns=['Churn'])

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

n_train_rows, n_feature_columns = X_train.shape
n_test_rows = X_test.shape[0]

print(f"Training set: {n_train_rows} samples")
print(f"Test set: {n_test_rows} samples")
print(f"Feature columns: {n_feature_columns}")

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Numeric columns are standardised. The single step stays wrapped in a
# Pipeline so it remains addressable by its step name.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns are one-hot encoded into a dense array. drop='first'
# removes one level per column; handle_unknown='ignore' encodes categories
# unseen during fit as all zeros instead of raising.
onehot_encoder = OneHotEncoder(
    drop='first',
    sparse_output=False,
    handle_unknown='ignore',
)
categorical_transformer = Pipeline([('onehot', onehot_encoder)])

# Route each column group to its transformer. The 'num' group is listed
# before 'cat', which fixes the order of the transformed output columns.
column_routing = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routing)



In [None]:
# Both candidate models share the same preprocessing definition and differ
# only in the final 'classifier' step.

# Pipeline 1: Logistic Regression
lr_classifier = LogisticRegression(random_state=42, max_iter=1000)
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', lr_classifier),
])

# Pipeline 2: Random Forest (n_jobs=-1 uses all available cores)
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier),
])

In [None]:
# Hyper-parameter grids. Keys use the '<step>__<param>' form so GridSearchCV
# routes each value to the 'classifier' step of the pipeline.

# Logistic Regression parameter grid
lr_param_grid = {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l2'],
    'classifier__solver': ['lbfgs']
}

# Random Forest parameter grid
rf_param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5]
}

# GridSearch for Logistic Regression: 5-fold CV, scored by F1
print("\nTuning Logistic Regression...")
lr_grid = GridSearchCV(
    lr_pipeline,
    lr_param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=0
)
lr_grid.fit(X_train, y_train)

# Report the winning configuration, mirroring the Random Forest search output
print(f" Best LR params: {lr_grid.best_params_}")
print(f" Best LR CV F1-score: {lr_grid.best_score_:.4f}")


In [None]:
# Configure (but do not yet fit) the Random Forest grid search:
# 5-fold cross-validation scored by F1, using all available cores.
print("\nTuning Random Forest...")
rf_search_options = dict(cv=5, scoring='f1', n_jobs=-1, verbose=0)
rf_grid = GridSearchCV(rf_pipeline, rf_param_grid, **rf_search_options)

In [None]:
# Run the Random Forest search on the training split, then report the best
# parameter combination and its mean cross-validated F1.
rf_grid.fit(X_train, y_train)

rf_best_params = rf_grid.best_params_
rf_best_cv_f1 = rf_grid.best_score_
print(f" Best RF params: {rf_best_params}")
print(f" Best RF CV F1-score: {rf_best_cv_f1:.4f}")

In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    """Print test-set metrics for a fitted classifier and return its predictions.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test
        Feature rows passed to ``model.predict``.
    y_test
        True labels the predictions are compared against.
    model_name : str
        Label printed in the results header.

    Returns
    -------
    The predictions produced by ``model.predict(X_test)``, so callers can
    compute further metrics without predicting again.
    """
    y_pred = model.predict(X_test)

    print(f"{model_name} - Test Set Results")

    print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall:    {recall_score(y_test, y_pred):.4f}")
    print(f"F1-Score:  {f1_score(y_test, y_pred):.4f}")

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report:")
    # target_names assumes exactly two classes, ordered as 0 then 1.
    print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

    return y_pred

In [None]:
# Evaluate both tuned pipelines on the held-out test set
lr_pred = evaluate_model(lr_grid.best_estimator_, X_test, y_test, "Logistic Regression")
rf_pred = evaluate_model(rf_grid.best_estimator_, X_test, y_test, "Random Forest")

# Compare models on test-set F1
lr_f1 = f1_score(y_test, lr_pred)
rf_f1 = f1_score(y_test, rf_pred)

# Random Forest is chosen only on a strictly higher F1; a tie goes to
# Logistic Regression.
# NOTE(review): the winner is selected using the test set, so the F1 printed
# below is an optimistic estimate for the chosen model.
if rf_f1 > lr_f1:
    best_model_name = "Random Forest"
    best_model = rf_grid.best_estimator_
else:
    best_model_name = "Logistic Regression"
    best_model = lr_grid.best_estimator_

print(f"WINNER: {best_model_name} (F1-Score: {max(lr_f1, rf_f1):.4f})")


In [None]:
# Feature importances are only reported when the Random Forest was selected.
if 'Random Forest' in best_model_name:
    rf_best = rf_grid.best_estimator_

    # Rebuild the post-preprocessing column names: numeric columns pass
    # through under their own names, followed by the one-hot encoder's
    # expanded names (same order as the ColumnTransformer: 'num' then 'cat').
    fitted_onehot = (
        rf_best.named_steps['preprocessor']
        .named_transformers_['cat']
        .named_steps['onehot']
    )
    encoded_names = list(fitted_onehot.get_feature_names_out(categorical_features))
    feature_names = numeric_features + encoded_names

    # Importances reported by the fitted forest, one per transformed column
    importances = rf_best.named_steps['classifier'].feature_importances_

    # Tabulate and rank from most to least important
    importance_table = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances,
    })
    feature_importance_df = importance_table.sort_values('Importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(feature_importance_df.head(10).to_string(index=False))

In [None]:
import os

# Save the best model (the full fitted pipeline: preprocessing + classifier)
pipeline_filename = 'churn_prediction_pipeline.pkl'
joblib.dump(best_model, pipeline_filename)

# Report where the pipeline was written and how large the file is
file_size = os.path.getsize(pipeline_filename) / (1024 * 1024)  # bytes -> MB
print(f" Pipeline saved to {pipeline_filename} ({file_size:.2f} MB)")

# Save preprocessing info: the feature lists the pipeline was built with,
# which model won, and the winner's test-set F1.
preprocessing_info = {
    'numeric_features': numeric_features,
    'categorical_features': categorical_features,
    'model_type': best_model_name,
    'test_f1_score': max(lr_f1, rf_f1)
}
joblib.dump(preprocessing_info, 'preprocessing_info.pkl')


In [None]:
# Round-trip check: reload the saved pipeline from disk and score a few rows.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
loaded_pipeline = joblib.load(pipeline_filename)

# Use the first five test rows as a smoke-test sample
sample_data = X_test.iloc[:5]
predictions = loaded_pipeline.predict(sample_data)
probabilities = loaded_pipeline.predict_proba(sample_data)

print("\nSample Predictions:")
for customer_no, (label, class_probs) in enumerate(zip(predictions, probabilities), start=1):
    verdict = 'CHURN' if label == 1 else 'NO CHURN'
    # class_probs[1] is the predicted probability of the positive (churn) class
    print(f"Customer {customer_no}: {verdict} (Probability: {class_probs[1]:.2%})")
