In [2]:
# Import necessary libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training data straight from the project's GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/mayankjr98/Customer-Churn-Prediction-Anaysis/refs/heads/main/CustomerChurnTrainingData.csv'
df = pd.read_csv(DATA_URL)

# Preview the first 10 rows (left as the final expression so the notebook renders the table).
df.head(10)

In [5]:
# Check the shape of the dataframe: a (number of rows, number of columns) tuple.
df.shape

(5000, 7)

In [None]:
# Check the data type pandas inferred for each column when reading the CSV.
df.dtypes

In [None]:
# Summary statistics; with default arguments pandas reports count, mean, std,
# min, quartiles and max for the numeric columns.
df.describe()

#### 2. Univariate Analysis:

Moving forward to the univariate analysis, we will generate **histograms for each numeric feature in the dataset** to help us understand their distributions.

In [None]:
# Univariate analysis: one 50-bin histogram per numeric column, drawn on a 10x10 inch grid.
df.hist(bins=50, figsize=(10, 10))
plt.show()

#### 3. Bivariate Analysis:

For the Bivariate Analysis we will generate a correlation heatmap, allowing us to understand the relationships between different features of the dataset.

In [None]:
# Bivariate analysis: pairwise correlation between the columns of the dataframe.
corr = df.corr()

# Draw the matrix as an annotated heatmap on a diverging colour scale.
sns.heatmap(data=corr, cmap='coolwarm', annot=True)
plt.show()

#### 4. Missing Value Treatment:

For the last part of the Data Exploration section, we aim to identify the number of missing values in each feature. Depending on the results, we can determine how to handle the missing values.

In [None]:
# Count the missing entries in every column (`isna` is the same check as `isnull`).
df.isna().sum()

#### Feature Engineering

For the Feature Engineering step, we will create the ratio of `sum_collect_points` to `sum_redeem_points` as an indicator of a customer's tendency to save points.

Then we will interpret the results of this ratio.

In [None]:
# Feature engineering: ratio of collected points to redeemed points.
#
# The ratio is cleaned as a standalone Series and assigned to the dataframe
# once. Calling `.replace(..., inplace=True)` / `.fillna(..., inplace=True)` on
# `df[col]` is chained assignment: it is deprecated in recent pandas and does
# not modify `df` at all once Copy-on-Write is enabled.
points_ratio = df['sum_collect_points'] / df['sum_redeem_points']

df['collect_to_redeem_ratio'] = (
    points_ratio
    .replace([np.inf, -np.inf], 0)  # x / 0 yields +/-inf; treat it as "no ratio".
    .fillna(0)                      # 0 / 0 or missing inputs yield NaN; same treatment.
)

# Summarise the distribution of the new feature.
df['collect_to_redeem_ratio'].describe()

In [12]:
# Drop the engineered ratio column before modelling (the model features are
# every column of `df` except `state`).
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
df = df.drop(columns=['collect_to_redeem_ratio'], errors='ignore')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Separate the predictors from the `state` target column.
features = df.drop('state', axis=1)
target = df['state']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=556555,
)

# Baseline logistic regression with scikit-learn's default settings.
lr_model = LogisticRegression()

# Train on the training split (kept as the final expression so the fitted estimator is displayed).
lr_model.fit(X_train, y_train)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

# Evaluate the baseline logistic regression on the held-out test set.
y_test_pred = lr_model.predict(X_test)
print("Test accuracy: ", accuracy_score(y_test, y_test_pred), "\n")

# Per-class precision, recall and F1.
print("\nClassification Report:\n\n", classification_report(y_test, y_test_pred), "\n")

print("Confusion matrix:")

# Compute the confusion matrix once and reuse it for the table, the heatmap
# and the misclassification rate. Rows are actual classes, columns are
# predicted classes.
cm = confusion_matrix(y_test, y_test_pred)
confusion_mat = cm  # Same matrix; the name is kept for the heatmap below.

# Labelled copy for display.
# NOTE(review): the labels assume the classes are ordered Active, then Lapsed —
# confirm against how `state` is encoded.
cm_df = pd.DataFrame(cm, index=['Active (Actual)', 'Lapsed (Actual)'],
                     columns=['Active (Predicted)', 'Lapsed (Predicted)'])
display(cm_df)

# Blank line between outputs.
print("\n")

# Confusion matrix as a heatmap, titled like the decision tree / random forest plots.
sns.heatmap(confusion_mat, annot=True, fmt="d", cmap='Reds')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion matrix for Logistic Regression')
plt.show()

print("\n")

# ROC curve: score each test row with the predicted probability of the
# second class (column 1 of predict_proba).
y_scores = lr_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_scores)
fpr, tpr, thresholds = roc_curve(y_test, y_scores)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # Diagonal = a classifier that guesses at random.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

print("\n")

# Misclassification rate: off-diagonal counts divided by all test rows.
misclassification_rate_lr = (cm[0][1] + cm[1][0]) / cm.sum()
print("\nLogistic Regression Model")
print("Misclassification rate: ", misclassification_rate_lr)

print("\n")

# Feature importance: one coefficient per training column.
# Rank by absolute value so that strong negative effects are not pushed to the
# bottom of the table; the sign is still visible in the `importance` column.
# NOTE: this model is fitted on unscaled features, so coefficient magnitudes
# also reflect each feature's units.
importance = lr_model.coef_[0]
feature_importance = pd.DataFrame({'feature': X_train.columns, 'importance': importance})
feature_importance.sort_values(by='importance', key=lambda col: col.abs(), ascending=False)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree with a fixed seed so the run is repeatable.
dt_model = DecisionTreeClassifier(random_state=556555)
dt_model.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_dt_pred = dt_model.predict(X_test)

# Accuracy and per-class metrics.
print("\nDecision Tree Model")
print("Test accuracy: ", accuracy_score(y_test, y_dt_pred), "\n")
print(classification_report(y_test, y_dt_pred))

# Blank line between outputs.
print("\n")

# Confusion matrix heatmap; seaborn returns the Axes it drew on, so label that directly.
confusion_mat_dt = confusion_matrix(y_test, y_dt_pred)
cm_ax_dt = sns.heatmap(confusion_mat_dt, annot=True, fmt="d", cmap='Blues')
cm_ax_dt.set_xlabel('Predicted')
cm_ax_dt.set_ylabel('True')
cm_ax_dt.set_title('Confusion matrix for Decision Tree')
plt.show()

# ROC inputs: predicted probability of the second class for every test row.
y_scores_dt = dt_model.predict_proba(X_test)[:, 1]
roc_auc_dt = roc_auc_score(y_test, y_scores_dt)
fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test, y_scores_dt)

# Blank line between outputs.
print("\n")

# ROC curve with the chance diagonal for reference.
roc_fig_dt, roc_ax_dt = plt.subplots()
roc_ax_dt.plot(fpr_dt, tpr_dt, label='Decision Tree (area = %0.2f)' % roc_auc_dt)
roc_ax_dt.plot([0, 1], [0, 1], 'r--')
roc_ax_dt.set_xlim([0.0, 1.0])
roc_ax_dt.set_ylim([0.0, 1.05])
roc_ax_dt.set_xlabel('False Positive Rate')
roc_ax_dt.set_ylabel('True Positive Rate')
roc_ax_dt.set_title('Receiver Operating Characteristic')
roc_ax_dt.legend(loc="lower right")
plt.show()

# Misclassification rate: off-diagonal counts over the total number of test rows.
dt_errors = confusion_mat_dt[0, 1] + confusion_mat_dt[1, 0]
misclassification_rate_dt = dt_errors / confusion_mat_dt.sum()
print("\nDecision Tree Model")
print("Misclassification rate: ", misclassification_rate_dt)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest with a fixed seed so the run is repeatable.
rf_model = RandomForestClassifier(random_state=556555)
rf_model.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_rf_pred = rf_model.predict(X_test)

# Accuracy and per-class metrics.
print("\nRandom Forest Model")
print("Test accuracy: ", accuracy_score(y_test, y_rf_pred), "\n")
print(classification_report(y_test, y_rf_pred))

# Blank line between outputs.
print("\n")

# Confusion matrix heatmap; seaborn returns the Axes it drew on, so label that directly.
confusion_mat_rf = confusion_matrix(y_test, y_rf_pred)
cm_ax_rf = sns.heatmap(confusion_mat_rf, annot=True, fmt="d", cmap='Greens')
cm_ax_rf.set_xlabel('Predicted')
cm_ax_rf.set_ylabel('True')
cm_ax_rf.set_title('Confusion matrix for Random Forest')
plt.show()

# Blank line between outputs.
print("\n")

# ROC inputs: predicted probability of the second class for every test row.
y_scores_rf = rf_model.predict_proba(X_test)[:, 1]
roc_auc_rf = roc_auc_score(y_test, y_scores_rf)
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_scores_rf)

# ROC curve with the chance diagonal for reference.
roc_fig_rf, roc_ax_rf = plt.subplots()
roc_ax_rf.plot(fpr_rf, tpr_rf, label='Random Forest (area = %0.2f)' % roc_auc_rf)
roc_ax_rf.plot([0, 1], [0, 1], 'r--')
roc_ax_rf.set_xlim([0.0, 1.0])
roc_ax_rf.set_ylim([0.0, 1.05])
roc_ax_rf.set_xlabel('False Positive Rate')
roc_ax_rf.set_ylabel('True Positive Rate')
roc_ax_rf.set_title('Receiver Operating Characteristic')
roc_ax_rf.legend(loc="lower right")
plt.show()

# Misclassification rate: off-diagonal counts over the total number of test rows.
rf_errors = confusion_mat_rf[0, 1] + confusion_mat_rf[1, 0]
misclassification_rate_rf = rf_errors / confusion_mat_rf.sum()
print("\nRandom Forest Model")
print("Misclassification rate: ", misclassification_rate_rf)

---
### Model Evaluation and Optimization
---

After we have built our initial Logistic Regression model, it is crucial to evaluate its performance and optimize it. We'll use cross-validation for a robust estimate of the model's performance and hyperparameter tuning for model optimization.

In [None]:
# Import necessary libraries
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV

# Standardise the features, then fit logistic regression with the iteration cap raised to 1000.
# NOTE: this rebinds `lr_model` from the bare LogisticRegression fitted earlier
# to a Pipeline; the earlier evaluation cell reads `lr_model.coef_` and will not
# work if it is re-run after this cell.
lr_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),
)
lr_model.fit(X_train, y_train)

# 5-fold cross-validated scores on the training split.
cv_scores = cross_val_score(lr_model, X_train, y_train, cv=5)
print("Cross-validation scores: ", cv_scores)
print("Mean cross-validation score: ", cv_scores.mean())

# Candidate regularisation strengths. make_pipeline names the step
# 'logisticregression', hence the 'logisticregression__' prefix on the parameter.
c_candidates = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'logisticregression__C': c_candidates}

# Exhaustive 5-fold search over the grid.
grid_search = GridSearchCV(estimator=lr_model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Report the winning setting.
print("Best parameters: ", grid_search.best_params_)

In [None]:
# Use the pipeline selected by the grid search rather than hard-coding C, so
# this cell stays correct if the data or the parameter grid changes.
# `grid_search` was built with the default refit=True, so `best_estimator_` is
# the best-scoring pipeline already refitted on the whole training split.
best_lr_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test set.
y_test_pred = best_lr_model.predict(X_test)
print("Test accuracy: ", accuracy_score(y_test, y_test_pred), "\n")

# Per-class precision, recall and F1.
print("\nClassification Report:\n\n", classification_report(y_test, y_test_pred), "\n")

print("Confusion matrix:")

# Compute the confusion matrix once and reuse it for the table, the heatmap
# and the misclassification rate. Rows are actual classes, columns are
# predicted classes.
cm = confusion_matrix(y_test, y_test_pred)
confusion_mat = cm  # Same matrix; the name is kept for the heatmap below.

# Labelled copy for display.
# NOTE(review): the labels assume the classes are ordered Active, then Lapsed —
# confirm against how `state` is encoded.
cm_df = pd.DataFrame(cm, index=['Active (Actual)', 'Lapsed (Actual)'],
                     columns=['Active (Predicted)', 'Lapsed (Predicted)'])
display(cm_df)

# Blank line between outputs.
print("\n")

# Confusion matrix as a heatmap, titled like the decision tree / random forest plots.
sns.heatmap(confusion_mat, annot=True, fmt="d", cmap='Purples')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion matrix for tuned Logistic Regression')
plt.show()

print("\n")

# ROC curve: score each test row with the predicted probability of the
# second class (column 1 of predict_proba).
y_scores = best_lr_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_scores)
fpr, tpr, thresholds = roc_curve(y_test, y_scores)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # Diagonal = a classifier that guesses at random.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

print("\n")

# Misclassification rate: off-diagonal counts divided by all test rows.
misclassification_rate_lr = (cm[0][1] + cm[1][0]) / cm.sum()
print("\nLogistic Regression Model")
print("Misclassification rate: ", misclassification_rate_lr)

print("\n")

# Feature importance: coefficients of the logistic-regression step of the
# pipeline, one per training column. Rank by absolute value so that strong
# negative effects are not pushed to the bottom of the table; the sign is
# still visible in the `importance` column.
importance = best_lr_model.named_steps['logisticregression'].coef_[0]
feature_importance = pd.DataFrame({'feature': X_train.columns, 'importance': importance})
feature_importance.sort_values(by='importance', key=lambda col: col.abs(), ascending=False)

In [None]:
# Initialize the random forest classifier with a fixed seed so repeated runs
# build the same forest.
rf_model = RandomForestClassifier(random_state=556555)

# Fit the model on the training split. As the cell's final expression, the
# fitted estimator is also displayed.
rf_model.fit(X_train, y_train)

In [20]:
# Make hard class predictions for the held-out test rows; the result feeds the
# accuracy, classification report and confusion matrix cells that follow.
y_rf_pred = rf_model.predict(X_test)

In [None]:
# Random forest test-set accuracy followed by the per-class report.
rf_test_accuracy = accuracy_score(y_test, y_rf_pred)
rf_report = classification_report(y_test, y_rf_pred)

print("\nRandom Forest Model")
print("Test accuracy: ", rf_test_accuracy, "\n")
print(rf_report)

In [None]:
# Confusion matrix for the random forest: rows are true classes, columns are predicted classes.
confusion_mat_rf = confusion_matrix(y_test, y_rf_pred)

# seaborn returns the Axes it drew on, so the labels and title are set on it directly.
cm_ax_rf = sns.heatmap(confusion_mat_rf, annot=True, fmt="d", cmap='Greens')
cm_ax_rf.set_xlabel('Predicted')
cm_ax_rf.set_ylabel('True')
cm_ax_rf.set_title('Confusion matrix for Random Forest')
plt.show()

In [None]:
# ROC inputs: predicted probability of the second class for every test row.
y_scores_rf = rf_model.predict_proba(X_test)[:, 1]
roc_auc_rf = roc_auc_score(y_test, y_scores_rf)
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_scores_rf)

# ROC curve with the chance diagonal for reference.
roc_fig_rf, roc_ax_rf = plt.subplots()
roc_ax_rf.plot(fpr_rf, tpr_rf, label='Random Forest (area = %0.2f)' % roc_auc_rf)
roc_ax_rf.plot([0, 1], [0, 1], 'r--')
roc_ax_rf.set_xlim([0.0, 1.0])
roc_ax_rf.set_ylim([0.0, 1.05])
roc_ax_rf.set_xlabel('False Positive Rate')
roc_ax_rf.set_ylabel('True Positive Rate')
roc_ax_rf.set_title('Receiver Operating Characteristic')
roc_ax_rf.legend(loc="lower right")
plt.show()

In [None]:
# Share of test rows the random forest got wrong: off-diagonal counts over the total.
rf_errors = confusion_mat_rf[0, 1] + confusion_mat_rf[1, 0]
misclassification_rate_rf = rf_errors / confusion_mat_rf.sum()

print("\nRandom Forest Model")
print("Misclassification rate: ", misclassification_rate_rf)

In [None]:
# Importances reported by the fitted random forest, one value per training column.
importances = rf_model.feature_importances_

# Build the table and sort it in one chained expression. Avoiding
# `inplace=True` keeps the cell idempotent and yields a fresh, sorted frame
# each run; the plotting cell below relies on this descending order.
feature_importances = (
    pd.DataFrame({'feature': X_train.columns, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Leave the frame as the final expression so the notebook renders it as a
# table rather than plain printed text.
feature_importances

In [None]:
# Horizontal bar chart of the random forest feature importances.
plt.figure(figsize=(12, 4))
bar_plot = sns.barplot(data=feature_importances, x='importance', y='feature')

# Annotate every bar with its importance rounded to two decimals; bars are
# drawn in row order, so the row position doubles as the y coordinate.
for row_pos, score in enumerate(feature_importances['importance']):
    bar_plot.text(x=score, y=row_pos, s='{:.2f}'.format(score), va='center')

# Axis labels and title, set on the Axes seaborn returned.
bar_plot.set_xlabel('Importance')
bar_plot.set_ylabel('Feature')
bar_plot.set_title('Feature Importance from Random Forest')
plt.tight_layout()

# Render the figure.
plt.show()