# Customer Churn Prediction
## Data Analysis, Feature Engineering, Model Development, and Model Saving

In [None]:
# Step 1: Load Libraries and Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import pickle

# Load dataset
# Read the raw customer table from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data.csv')
# Preview the first five rows (rendered when this is the cell's last expression).
data.head(n=5)

In [None]:
# Step 2: Feature Engineering
# Ratio features. The "+ 1" in each denominator means a raw value of 0
# (e.g. 0 years with the company, 0 purchases) does not divide by zero.
data['income_per_year'] = data['annual_income'] / (data['years_with_company'] + 1)
data['purchases_per_year'] = data['total_purchases'] / (data['years_with_company'] + 1)
data['spend_per_purchase'] = data['total_spent'] / (data['total_purchases'] + 1)

# Encode gender: 'Male' -> 1, every other value (including missing) -> 0.
data['gender'] = data['gender'].apply(lambda x: 1 if x == 'Male' else 0)

# Polynomial Features
# Degree-2 expansion of the three ratio features: squares and pairwise products.
ratio_cols = ['income_per_year', 'purchases_per_year', 'spend_per_purchase']
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(data[ratio_cols])
# Reuse data's index so the concat below aligns row-for-row even if the
# frame does not carry a default RangeIndex.
poly_df = pd.DataFrame(
    poly_features,
    columns=poly.get_feature_names_out(),
    index=data.index,
)
# The expansion also returns the degree-1 terms, i.e. the three ratio columns
# that already exist in `data`. Drop them before concatenating; otherwise the
# frame ends up with duplicate column labels and redundant features.
data = pd.concat([data, poly_df.drop(columns=ratio_cols)], axis=1)

# The identifier carries no predictive signal; remove it before modelling.
data.drop(columns=['customer_id'], inplace=True)
data.head()

In [None]:
# Step 3: Train-Test Split
# Separate the target column from the predictor columns.
y = data['is_active']
X = data.drop(columns=['is_active'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scaling
# Fit the scaler on the training rows only, then apply the same
# transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Step 4: Model Training with Hyperparameter Tuning
# Logistic Regression: default hyperparameters, fitted on the scaled features.
lr = LogisticRegression().fit(X_train_scaled, y_train)
lr_preds = lr.predict(X_test_scaled)

# Random Forest: 3-fold grid search over forest size and tree depth.
# Note that it is fitted on the unscaled X_train.
rf = RandomForestClassifier(random_state=42)
rf_params = dict(n_estimators=[10, 50, 100], max_depth=[3, 5, 10])
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    cv=3,
    scoring='accuracy',
).fit(X_train, y_train)

# Gradient Boosting: 3-fold grid search over ensemble size and learning rate.
# Also fitted on the unscaled X_train.
gb = GradientBoostingClassifier(random_state=42)
gb_params = dict(n_estimators=[50, 100], learning_rate=[0.05, 0.1])
gb_grid = GridSearchCV(
    estimator=gb,
    param_grid=gb_params,
    cv=3,
    scoring='accuracy',
).fit(X_train, y_train)

In [None]:
# Step 5: Model Evaluation
models = {'Logistic Regression': lr, 'Random Forest': rf_grid.best_estimator_, 'Gradient Boosting': gb_grid.best_estimator_}
# Each model must be scored on the same feature representation it was fitted
# on. Logistic Regression was trained on the scaled matrix, whereas the two
# tree ensembles were trained on the unscaled X_train; feeding them scaled
# values would produce meaningless predictions.
test_inputs = {
    'Logistic Regression': X_test_scaled,
    'Random Forest': X_test,
    'Gradient Boosting': X_test,
}
for name, model in models.items():
    X_eval = test_inputs[name]
    preds = model.predict(X_eval)
    # ROC AUC is a ranking metric, so it needs continuous scores rather than
    # hard labels. Column 1 of predict_proba is the probability of the
    # greater class label (assumes a binary target).
    scores = model.predict_proba(X_eval)[:, 1]
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy_score(y_test, preds)}")
    print(f"ROC AUC: {roc_auc_score(y_test, scores)}")
    print(classification_report(y_test, preds))
    print("-" * 50)

In [None]:
# Step 6: Save the Best Model
# NOTE(review): the winner is hard-coded here rather than chosen from the
# Step 5 metrics; confirm Gradient Boosting actually scored best before
# relying on this file.
best_model = gb_grid.best_estimator_  # Assuming Gradient Boosting performed best
# Serialize the fitted estimator to disk in binary mode.
with open('best_model.pkl', mode='wb') as file:
    pickle.Pickler(file).dump(best_model)
print("Best model saved as best_model.pkl")