<a href="https://colab.research.google.com/github/muajnstu/Customer_Churn_Prediction/blob/main/Customer_Churn_Prediction_Using_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.feature_selection import mutual_info_classif, SelectKBest, f_classif




In [None]:
# Read the preprocessed churn dataset directly from the project's GitHub repository.
df = pd.read_csv(
    "https://raw.githubusercontent.com/muajnstu/Customer_Churn_Prediction/refs/heads/main/preprocessed_customer_churn_dataset_.csv"
)
df  # bare last expression -> rich display of the frame

Unnamed: 0,Age,Gender,Annual_Income,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Num_of_Returns,Num_of_Support_Contacts,Satisfaction_Score,Last_Purchase_Days_Ago,Email_Opt_In,Promotion_Response,Target_Churn
0,62,0,45.15,5892.58,5,22,453.80,2,0,3,129,True,0,True
1,65,1,79.51,9025.47,13,77,22.90,2,2,3,227,False,0,False
2,18,1,29.19,618.83,13,71,50.53,5,2,2,283,False,0,True
3,21,0,79.63,9110.30,3,33,411.83,5,3,5,226,True,1,True
4,21,0,77.66,5390.88,15,43,101.19,3,0,5,242,False,2,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,54,1,143.72,1089.09,2,29,77.75,0,3,2,88,True,1,False
996,19,1,164.19,3700.24,9,90,34.45,6,4,4,352,False,0,True
997,47,2,113.31,705.85,17,69,187.37,7,3,1,172,True,2,False
998,23,1,72.98,3891.60,7,31,483.80,1,2,5,55,False,0,True


In [None]:
# Separate the label column from the feature matrix.
y = df['Target_Churn']
X = df.drop(columns='Target_Churn')

In [None]:
def drop_extreme_correlation_columns(df, target_column, lower_threshold=0.01, upper_threshold=0.01):
    """Drop features whose Spearman correlation with the target is too weak or too strong.

    The absolute Spearman correlation between every column and ``target_column``
    is computed with ``DataFrame.corr('spearman')``. A feature is removed when
    that value is below ``lower_threshold``, above ``upper_threshold``, or
    undefined (NaN, e.g. for a constant column).

    Args:
        df: DataFrame containing the features and the target column. It is not
            modified; a new frame is returned.
        target_column: Name of the target column (never dropped).
        lower_threshold: Features with |rho| below this value are dropped.
        upper_threshold: Features with |rho| above this value are dropped.
            NOTE: the defaults are equal (0.01), so with default arguments every
            feature whose |rho| is not exactly 0.01 is dropped. Pass an explicit
            upper bound (e.g. 0.8) for a meaningful filter.

    Returns:
        A new DataFrame without the dropped columns (a plain copy when nothing
        qualifies for dropping).

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    abs_corr = df.corr('spearman')[target_column].abs().drop(target_column)  # exclude the target itself

    # Comparisons against NaN are always False, so a feature with an undefined
    # correlation (zero variance) would slip through both threshold tests.
    # Treat it explicitly as "no measurable association" and drop it.
    is_extreme = (
        (abs_corr < lower_threshold)
        | (abs_corr > upper_threshold)
        | abs_corr.isna()
    )
    columns_to_drop = abs_corr[is_extreme].index.tolist()

    if not columns_to_drop:
        print("No columns meet the criteria for dropping based on the thresholds.")
        return df.copy()

    print(f"Columns being considered for dropping based on thresholds ({lower_threshold} and {upper_threshold}): {columns_to_drop}")
    return df.drop(columns=columns_to_drop)

def subdataset_by_kbest(data, target_column, k=5):
    """Keep the ``k`` features with the highest ANOVA F-score against the target.

    Args:
        data: DataFrame holding the features and the target column.
        target_column: Name of the target column inside ``data``.
        k: Number of features to keep (forwarded to ``SelectKBest``).

    Returns:
        A new DataFrame with the selected feature columns followed by
        ``target_column``.

    Raises:
        ValueError: If ``target_column`` is not a column of ``data``.
    """
    if target_column not in data.columns:
        raise ValueError(f"Dataset must contain a '{target_column}' column as the target variable.")

    # Derive features and target from the argument itself. Reading the
    # notebook-level X / y here would silently ignore `data` and make the
    # result depend on which cells were executed before.
    features = data.drop(columns=[target_column])
    target = data[target_column]

    # Score every feature and keep the k best.
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(features, target)

    # Boolean support mask -> names of the retained columns.
    selected_features = features.columns[selector.get_support()]

    # Selected features plus the target, as an independent copy.
    reduced_data = features[selected_features].copy()
    reduced_data[target_column] = target.values

    return reduced_data

def drop_low_mi_features(df, target_column, n_features_to_keep, random_state=None):
    """Keep the features with the highest mutual information with the target.

    Args:
        df: DataFrame holding the features and the target column.
        target_column: Name of the (discrete) target column.
        n_features_to_keep: Number of top-scoring features to retain.
        random_state: Optional seed forwarded to ``mutual_info_classif``. The
            estimator is stochastic, so the ranking can change between runs
            unless a seed is supplied. ``None`` keeps the unseeded behaviour.

    Returns:
        A new DataFrame with the retained features followed by
        ``target_column``.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    # Features and target
    features = df.drop(columns=[target_column])
    target = df[target_column]

    # mutual_info_classif is suitable for discrete target variables
    mi_scores = mutual_info_classif(features, target, random_state=random_state)

    # Label the scores by feature name, highest score first
    mi_series = pd.Series(mi_scores, index=features.columns).sort_values(ascending=False)

    # Names of the features to keep, plus the target
    top_scores = mi_series.head(n_features_to_keep)
    features_to_keep = top_scores.index.tolist()
    if target_column not in features_to_keep:
        features_to_keep.append(target_column)

    # Independent copy so later column assignments cannot touch `df`
    reduced_df = df[features_to_keep].copy()

    print(f"\nMutual Information Scores (Top {n_features_to_keep}):")
    print(top_scores)
    print(f"\nDataFrame after dropping features with low MI scores (keeping top {n_features_to_keep}):")
    # Preview the frame built here, not an unrelated notebook-level frame.
    print(reduced_df.head())

    return reduced_df


# Feature Transformation

def minmax_scaler_func(X):
    """Min-max scale the feature columns and attach the ``Target_Churn`` column.

    Args:
        X: DataFrame of features. It may also contain a ``Target_Churn``
            column; that column is then excluded from scaling and carried over
            unchanged. When it is absent, the notebook-level ``y`` is attached
            instead (aligned on the index).

    Returns:
        A new DataFrame with the scaled features followed by ``Target_Churn``.

    Note:
        The scaler is fit on every row passed in. Fit on the training rows only
        if a leakage-free held-out evaluation is required.
    """
    if 'Target_Churn' in X.columns:
        # Labels travel with the frame: never scale them, never rely on globals.
        target = X['Target_Churn']
        features = X.drop(columns=['Target_Churn'])
    else:
        # Feature-only input: fall back to the notebook-level target series.
        target = y
        features = X

    minmax_scaler = MinMaxScaler()
    X_minmax_scaled = minmax_scaler.fit_transform(features)
    df_minmax_scaled = pd.DataFrame(X_minmax_scaled, columns=features.columns, index=features.index)
    df_minmax_scaled['Target_Churn'] = target
    return df_minmax_scaled

def standard_scaler_func(X):
    """Standardize the feature columns and attach the ``Target_Churn`` column.

    Args:
        X: DataFrame of features. It may also contain a ``Target_Churn``
            column; that column is then excluded from scaling and carried over
            unchanged. When it is absent, the notebook-level ``y`` is attached
            instead (aligned on the index).

    Returns:
        A new DataFrame with the standardized features followed by
        ``Target_Churn``.

    Note:
        The scaler is fit on every row passed in. Fit on the training rows only
        if a leakage-free held-out evaluation is required.
    """
    if 'Target_Churn' in X.columns:
        # Labels travel with the frame: never scale them, never rely on globals.
        target = X['Target_Churn']
        features = X.drop(columns=['Target_Churn'])
    else:
        # Feature-only input: fall back to the notebook-level target series.
        target = y
        features = X

    standard_scaler = StandardScaler()
    X_standard_scaled = standard_scaler.fit_transform(features)
    df_standard_scaled = pd.DataFrame(X_standard_scaled, columns=features.columns, index=features.index)
    df_standard_scaled['Target_Churn'] = target
    return df_standard_scaled

def log_transformer_func(X):
    """Apply ``log(1 + x)`` to the feature columns and attach ``Target_Churn``.

    Args:
        X: DataFrame of features. It may also contain a ``Target_Churn``
            column; that column is then excluded from the transform and carried
            over unchanged. When it is absent, the notebook-level ``y`` is
            attached instead (aligned on the index).

    Returns:
        A new DataFrame with the transformed features followed by
        ``Target_Churn``.
    """
    if 'Target_Churn' in X.columns:
        # Labels travel with the frame: never transform them, never rely on globals.
        target = X['Target_Churn']
        features = X.drop(columns=['Target_Churn'])
    else:
        # Feature-only input: fall back to the notebook-level target series.
        target = y
        features = X

    # Using np.log1p which is log(1+x) to handle potential zero values
    X_log_transformed = np.log1p(features)
    df_log_transformed = pd.DataFrame(X_log_transformed, columns=features.columns, index=features.index)
    df_log_transformed['Target_Churn'] = target
    return df_log_transformed


In [None]:
def split_train_evaluate(dataframe, target_column, test_size=0.25, random_state=42, verbose=True):
    """Split the data, train six baseline classifiers and report their test metrics.

    Args:
        dataframe: DataFrame holding the features and the target column.
        target_column: Name of the target column.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the stratified train/test split and for the
            seeded models (Random Forest, Decision Tree), so one argument
            controls the reproducibility of the whole run.
        verbose: When True, print progress, accuracy and the classification
            report for every model.

    Returns:
        dict mapping model name to
        ``{'accuracy': float, 'classification_report': str}``.
    """
    # Split the data into features (X) and the target variable (y)
    X = dataframe.drop(columns=[target_column])
    y = dataframe[target_column]

    # Stratify so both splits keep the class proportions of `y`.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Define models; seeded estimators follow `random_state` instead of a
    # hard-coded constant (identical to the previous behaviour at the default 42).
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(random_state=random_state),
        "Support Vector Machine": SVC(),
        "K-Nearest Neighbors": KNeighborsClassifier(),
        "Decision Tree": DecisionTreeClassifier(random_state=random_state),
        "Gaussian Naive Bayes": GaussianNB()
    }

    # Store results
    results = {}

    for name, model in models.items():
        if verbose:
            print(f"\nTraining {name}...")

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)

        # zero_division=0 reports 0 for a class that is never predicted.
        report = classification_report(y_test, y_pred, zero_division=0)

        results[name] = {
            'accuracy': accuracy,
            'classification_report': report
        }

        if verbose:
            print(f"Accuracy: {accuracy:.4f}")
            print(f"Classification Report:\n{report}")

    return results


In [None]:

# Feature set 1: filter features by their Spearman correlation with the target.
df_1 = drop_extreme_correlation_columns(
    df.copy(),
    target_column="Target_Churn",
    lower_threshold=0.01,
    upper_threshold=0.8,
)
df_1.head()



Columns being considered for dropping based on thresholds (0.01 and 0.8): ['Age', 'Num_of_Returns', 'Num_of_Support_Contacts', 'Email_Opt_In']


Unnamed: 0,Gender,Annual_Income,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Satisfaction_Score,Last_Purchase_Days_Ago,Promotion_Response,Target_Churn
0,0,45.15,5892.58,5,22,453.8,3,129,0,True
1,1,79.51,9025.47,13,77,22.9,3,227,0,False
2,1,29.19,618.83,13,71,50.53,2,283,0,True
3,0,79.63,9110.3,3,33,411.83,5,226,1,True
4,0,77.66,5390.88,15,43,101.19,5,242,2,False


In [None]:
# Feature set 2: SelectKBest (ANOVA F-score) with the function's default k.
df_2 = subdataset_by_kbest(data=df, target_column='Target_Churn')
df_2.head()

Unnamed: 0,Annual_Income,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Target_Churn
0,45.15,5892.58,5,22,453.8,True
1,79.51,9025.47,13,77,22.9,False
2,29.19,618.83,13,71,50.53,True
3,79.63,9110.3,3,33,411.83,True
4,77.66,5390.88,15,43,101.19,False


In [None]:
# Feature set 3: the three features with the highest mutual information.
df_3 = drop_low_mi_features(df, target_column='Target_Churn', n_features_to_keep=3)


Mutual Information Scores (Top 3):
Last_Purchase_Days_Ago    0.036439
Num_of_Returns            0.035312
Satisfaction_Score        0.012508
dtype: float64

DataFrame after dropping features with low MI scores (keeping top 3):
   Annual_Income  Total_Spend  Years_as_Customer  Num_of_Purchases  \
0          45.15      5892.58                  5                22   
1          79.51      9025.47                 13                77   
2          29.19       618.83                 13                71   
3          79.63      9110.30                  3                33   
4          77.66      5390.88                 15                43   

   Average_Transaction_Amount  Target_Churn  
0                      453.80          True  
1                       22.90         False  
2                       50.53          True  
3                      411.83          True  
4                      101.19         False  


In [None]:
# Transform the complete feature matrix with each of the three transformers.
df_minmax_scaled, df_standard_scaled, df_log_transformed = [
    transform(X)
    for transform in (minmax_scaler_func, standard_scaler_func, log_transformer_func)
]

In [None]:
# MinMaxScaler
df_1_minmax_scaled, df_2_minmax_scaled, df_3_minmax_scaled = [
    minmax_scaler_func(subset) for subset in (df_1, df_2, df_3)
]

# StandardScaler
df_1_standard_scaled, df_2_standard_scaled, df_3_standard_scaled = [
    standard_scaler_func(subset) for subset in (df_1, df_2, df_3)
]

# Log transformation
df_1_log_transformed, df_2_log_transformed, df_3_log_transformed = [
    log_transformer_func(subset) for subset in (df_1, df_2, df_3)
]

#Generate Results

In [None]:
# Correlation-filtered features, standard-scaled (this rebinds `results`).
results = split_train_evaluate(df_1_standard_scaled, target_column='Target_Churn')


Training Logistic Regression...
Accuracy: 0.5120
Classification Report:
              precision    recall  f1-score   support

       False       0.47      0.22      0.30       119
        True       0.52      0.78      0.63       131

    accuracy                           0.51       250
   macro avg       0.50      0.50      0.46       250
weighted avg       0.50      0.51      0.47       250


Training Random Forest...
Accuracy: 0.4920
Classification Report:
              precision    recall  f1-score   support

       False       0.46      0.43      0.45       119
        True       0.51      0.55      0.53       131

    accuracy                           0.49       250
   macro avg       0.49      0.49      0.49       250
weighted avg       0.49      0.49      0.49       250


Training Support Vector Machine...
Accuracy: 0.5480
Classification Report:
              precision    recall  f1-score   support

       False       0.54      0.35      0.43       119
        True       0.

In [None]:
# Correlation-filtered features, min-max scaled (this rebinds `results`).
results = split_train_evaluate(df_1_minmax_scaled, target_column='Target_Churn')


Training Logistic Regression...
Accuracy: 0.5080
Classification Report:
              precision    recall  f1-score   support

       False       0.46      0.21      0.29       119
        True       0.52      0.78      0.62       131

    accuracy                           0.51       250
   macro avg       0.49      0.49      0.46       250
weighted avg       0.49      0.51      0.46       250


Training Random Forest...
Accuracy: 0.4960
Classification Report:
              precision    recall  f1-score   support

       False       0.47      0.43      0.45       119
        True       0.52      0.56      0.54       131

    accuracy                           0.50       250
   macro avg       0.49      0.49      0.49       250
weighted avg       0.49      0.50      0.49       250


Training Support Vector Machine...
Accuracy: 0.5160
Classification Report:
              precision    recall  f1-score   support

       False       0.49      0.32      0.39       119
        True       0.

In [None]:
# Correlation-filtered features, log1p-transformed (this rebinds `results`).
results = split_train_evaluate(df_1_log_transformed, target_column='Target_Churn')


Training Logistic Regression...
Accuracy: 0.5200
Classification Report:
              precision    recall  f1-score   support

       False       0.49      0.21      0.29       119
        True       0.53      0.80      0.64       131

    accuracy                           0.52       250
   macro avg       0.51      0.51      0.47       250
weighted avg       0.51      0.52      0.47       250


Training Random Forest...
Accuracy: 0.5080
Classification Report:
              precision    recall  f1-score   support

       False       0.48      0.45      0.47       119
        True       0.53      0.56      0.54       131

    accuracy                           0.51       250
   macro avg       0.51      0.51      0.51       250
weighted avg       0.51      0.51      0.51       250


Training Support Vector Machine...
Accuracy: 0.5240
Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       119
        True       0.

In [None]:
# SelectKBest features, standard-scaled (this rebinds `results`).
results = split_train_evaluate(df_2_standard_scaled, target_column='Target_Churn')


Training Logistic Regression...
Accuracy: 0.5160
Classification Report:
              precision    recall  f1-score   support

       False       0.48      0.22      0.30       119
        True       0.53      0.79      0.63       131

    accuracy                           0.52       250
   macro avg       0.50      0.50      0.47       250
weighted avg       0.50      0.52      0.47       250


Training Random Forest...
Accuracy: 0.4720
Classification Report:
              precision    recall  f1-score   support

       False       0.44      0.40      0.42       119
        True       0.50      0.53      0.51       131

    accuracy                           0.47       250
   macro avg       0.47      0.47      0.47       250
weighted avg       0.47      0.47      0.47       250


Training Support Vector Machine...
Accuracy: 0.5000
Classification Report:
              precision    recall  f1-score   support

       False       0.45      0.23      0.30       119
        True       0.

In [None]:
# SelectKBest features, min-max scaled (this rebinds `results`).
results = split_train_evaluate(df_2_minmax_scaled, target_column='Target_Churn')


Training Logistic Regression...
Accuracy: 0.5080
Classification Report:
              precision    recall  f1-score   support

       False       0.46      0.18      0.26       119
        True       0.52      0.80      0.63       131

    accuracy                           0.51       250
   macro avg       0.49      0.49      0.45       250
weighted avg       0.49      0.51      0.46       250


Training Random Forest...
Accuracy: 0.4760
Classification Report:
              precision    recall  f1-score   support

       False       0.45      0.41      0.43       119
        True       0.50      0.53      0.52       131

    accuracy                           0.48       250
   macro avg       0.47      0.47      0.47       250
weighted avg       0.47      0.48      0.47       250


Training Support Vector Machine...
Accuracy: 0.5000
Classification Report:
              precision    recall  f1-score   support

       False       0.45      0.23      0.30       119
        True       0.

In [None]:
# SelectKBest features, log1p-transformed (this rebinds `results`).
results = split_train_evaluate(df_2_log_transformed, target_column='Target_Churn')


Training Logistic Regression...
Accuracy: 0.5400
Classification Report:
              precision    recall  f1-score   support

       False       0.56      0.17      0.26       119
        True       0.54      0.88      0.67       131

    accuracy                           0.54       250
   macro avg       0.55      0.52      0.46       250
weighted avg       0.55      0.54      0.47       250


Training Random Forest...
Accuracy: 0.4960
Classification Report:
              precision    recall  f1-score   support

       False       0.47      0.43      0.45       119
        True       0.52      0.56      0.54       131

    accuracy                           0.50       250
   macro avg       0.49      0.49      0.49       250
weighted avg       0.49      0.50      0.49       250


Training Support Vector Machine...
Accuracy: 0.5400
Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.03      0.07       119
        True       0.

In [None]:
# Mutual-information features, standard-scaled (this rebinds `results`).
results = split_train_evaluate(df_3_standard_scaled, target_column='Target_Churn')


Training Logistic Regression...
Accuracy: 0.5080
Classification Report:
              precision    recall  f1-score   support

       False       0.36      0.04      0.08       119
        True       0.52      0.93      0.66       131

    accuracy                           0.51       250
   macro avg       0.44      0.49      0.37       250
weighted avg       0.44      0.51      0.38       250


Training Random Forest...
Accuracy: 0.5240
Classification Report:
              precision    recall  f1-score   support

       False       0.50      0.46      0.48       119
        True       0.54      0.58      0.56       131

    accuracy                           0.52       250
   macro avg       0.52      0.52      0.52       250
weighted avg       0.52      0.52      0.52       250


Training Support Vector Machine...
Accuracy: 0.5640
Classification Report:
              precision    recall  f1-score   support

       False       0.56      0.37      0.45       119
        True       0.

In [None]:
# Mutual-information features, min-max scaled (this rebinds `results`).
results = split_train_evaluate(df_3_minmax_scaled, target_column='Target_Churn')


Training Logistic Regression...
Accuracy: 0.5080
Classification Report:
              precision    recall  f1-score   support

       False       0.33      0.03      0.06       119
        True       0.52      0.94      0.67       131

    accuracy                           0.51       250
   macro avg       0.43      0.49      0.36       250
weighted avg       0.43      0.51      0.38       250


Training Random Forest...
Accuracy: 0.5320
Classification Report:
              precision    recall  f1-score   support

       False       0.51      0.48      0.49       119
        True       0.55      0.58      0.57       131

    accuracy                           0.53       250
   macro avg       0.53      0.53      0.53       250
weighted avg       0.53      0.53      0.53       250


Training Support Vector Machine...
Accuracy: 0.5560
Classification Report:
              precision    recall  f1-score   support

       False       0.55      0.38      0.45       119
        True       0.

In [None]:
# Mutual-information features, log1p-transformed (this rebinds `results`).
results = split_train_evaluate(df_3_log_transformed, target_column='Target_Churn')


Training Logistic Regression...
Accuracy: 0.4960
Classification Report:
              precision    recall  f1-score   support

       False       0.40      0.12      0.18       119
        True       0.51      0.84      0.64       131

    accuracy                           0.50       250
   macro avg       0.46      0.48      0.41       250
weighted avg       0.46      0.50      0.42       250


Training Random Forest...
Accuracy: 0.5160
Classification Report:
              precision    recall  f1-score   support

       False       0.49      0.46      0.48       119
        True       0.54      0.56      0.55       131

    accuracy                           0.52       250
   macro avg       0.51      0.51      0.51       250
weighted avg       0.51      0.52      0.51       250


Training Support Vector Machine...
Accuracy: 0.5080
Classification Report:
              precision    recall  f1-score   support

       False       0.42      0.09      0.15       119
        True       0.