# PROJECT SCOPE

The scope of this project in this code base is to build an engagement score model that enables businesses to know and understand their customers and their level of engagement with the business. Here, we use the bank churn data, and for this model we focus on companies in the financial services sector. For this and the remaining models, we use the BankChurners.csv file saved in the Datasets folder. After the model is run, it saves the processed data into its own folder inside the Datasets folder; that data contains the engagement classification score generated by this model, where 0 means the customer is not engaged and 1 means they are.

In [None]:
#Importing the necessary packages
import os

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

# Data Ingestion
# Read the BankChurners CSV from its fixed location into a DataFrame.
file_path = r'/Users/abduljalaalabubakar/Desktop/Projects/Symply Finance/Customer Insight Model/Fintech Customer Insight Model/Datasets/Bank Churn Dataset/BankChurners.csv'
data = pd.read_csv(file_path)

# Data Investigation
# Per-column null counts and the first rows, taken before any column is removed.
null_counts = data.isna().sum()
preview_rows = data.head()
print("\n### Data Investigation ###")
print("Missing Values:\n", null_counts)
print("\nSample Data:\n", preview_rows)

# Remove the CLIENTNUM column so it is not part of the frame used from here on
data = data.drop(columns=['CLIENTNUM'])

# Percentage share of each 'Attrition_Flag' value
attrition_share = data['Attrition_Flag'].value_counts(normalize=True).mul(100)
print("\nClass Distribution (%):\n", attrition_share)

# Encoding categorical variables
# One LabelEncoder per object-dtype column; the fitted encoders are kept by
# column name in `label_encoders`.
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    data[name] = encoder.fit_transform(data[name])

# Defining Target Variable
# The raw engagement score is the plain sum of three activity columns, rescaled
# to the 0-100 range with a min-max transform.
# NOTE(review): a larger Months_Inactive_12_mon raises this score - confirm that
# is the intended direction for an "engagement" measure.
engagement_raw_score = (
    data['Months_Inactive_12_mon']
    .add(data['Contacts_Count_12_mon'])
    .add(data['Total_Relationship_Count'])
)
score_scaler = MinMaxScaler()
unit_scaled_score = score_scaler.fit_transform(engagement_raw_score.to_numpy().reshape(-1, 1))
data['Engagement_Score'] = unit_scaled_score * 100

# Displaying Engagement Score Distribution
score_summary = data['Engagement_Score'].describe()
print("\nEngagement Score Distribution (0-100):\n", score_summary)

# Defining engagement levels using quartiles
level_names = ['Low', 'Medium', 'High', 'Very High']
data['Engagement_Level'] = pd.qcut(data['Engagement_Score'], q=4, labels=level_names)

# Mapping the engagement levels to numeric values for classification
# (Low -> 0, Medium -> 1, High -> 2, Very High -> 3)
engagement_level_mapping = dict(zip(level_names, range(4)))
data['Engagement_Level_Numeric'] = data['Engagement_Level'].map(engagement_level_mapping)

# Data Preprocessing
# Features are every column except the three engagement columns created above.
# NOTE(review): the three columns summed into the raw score are still part of X,
# so the target is a deterministic function of the features - confirm intended.
target_columns = ['Engagement_Score', 'Engagement_Level', 'Engagement_Level_Numeric']
X = data.drop(columns=target_columns)  # Features
y = data['Engagement_Level_Numeric']  # Target

# Train-test split
# Hold out the test rows BEFORE any resampling or scaling. Previously SMOTE and
# the scaler were fitted on the full dataset and the split came last, so
# synthetic samples interpolated from test rows (and test-set statistics) leaked
# into training and inflated the evaluation metrics.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Handling class imbalance using SMOTE (training rows only; the test set keeps
# its original, un-resampled rows)
print("\nBalancing the dataset using SMOTE...")
smote = SMOTE(random_state=42)
X_train_balanced, y_train = smote.fit_resample(X_train_raw, y_train_raw)

# Standardizing numerical features
# The scaler learns its mean/variance from the training data only and is then
# applied unchanged to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_balanced)
X_test = scaler.transform(X_test_raw)

# Balanced view of the whole dataset, scaled with the training-fitted scaler.
# These names are consumed by the prediction/export step further down; they are
# not used for fitting or evaluating the models.
X_balanced, y_balanced = smote.fit_resample(X, y)
X_scaled = scaler.transform(X_balanced)

# Defining Models
# Candidate classifiers keyed by the display name used in logs and results.
# A shared seed makes the stochastic models repeatable, matching the seeded
# train/test split and SMOTE step.
MODEL_SEED = 42
models = {
    # One-vs-rest logistic regression. The `multi_class='ovr'` argument of
    # LogisticRegression is deprecated in scikit-learn, and wrapping the estimator
    # in OneVsRestClassifier is the documented replacement.
    "Logistic Regression": OneVsRestClassifier(LogisticRegression(max_iter=300)),
    "Random Forest": RandomForestClassifier(random_state=MODEL_SEED),
    "Gradient Boosting": GradientBoostingClassifier(random_state=MODEL_SEED),
    "Neural Network": MLPClassifier(max_iter=300, random_state=MODEL_SEED),
}

# Train and Evaluate Models
# Folders for the persisted model artefacts and for the scored dataset; both are
# created if they do not exist yet.
model_folder_path = r'/Users/abduljalaalabubakar/Desktop/Projects/Symply Finance/Customer Insight Model/Fintech Customer Insight Model/Engagement_Scoring_Best_Models'
results_folder_path = r'/Users/abduljalaalabubakar/Desktop/Projects/Symply Finance/Customer Insight Model/Fintech Customer Insight Model/Datasets/Engagement_Scoring_Results'
os.makedirs(model_folder_path, exist_ok=True)
os.makedirs(results_folder_path, exist_ok=True)

# Class names in numeric-label order (0..3). Passing the labels explicitly keeps
# the report valid even if a class is absent from the test split or predictions.
class_names = ['Low', 'Medium', 'High', 'Very High']
class_labels = list(range(len(class_names)))

results = {}
best_model = None
# Start below any attainable accuracy so the first fitted model always becomes
# the incumbent; starting at 0 left best_model as None when nothing scored > 0.
best_score = float("-inf")

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    # Evaluate metrics
    accuracy = accuracy_score(y_test, y_pred)
    # One-vs-rest multiclass AUC needs class probabilities; None is recorded
    # for a model that cannot provide them.
    auc = roc_auc_score(y_test, y_prob, multi_class='ovr') if y_prob is not None else None

    # Saving the results
    results[name] = {
        "Accuracy": accuracy,
        "AUC": auc,
        "Classification Report": classification_report(
            y_test, y_pred, labels=class_labels, target_names=class_names
        ),
    }

    # Tracking the best model (highest test accuracy so far)
    if accuracy > best_score:
        best_score = accuracy
        best_model = model

# Refuse to pickle None: this only happens when `models` is empty.
if best_model is None:
    raise RuntimeError("No model was trained; nothing to save.")

# Saving the best model
best_model_path = os.path.join(model_folder_path, "best_engagement_score_model.pkl")
joblib.dump(best_model, best_model_path)
print(f"\nBest model saved to: {best_model_path}")

# Saving the fitted preprocessing objects next to the model. The model was
# trained on label-encoded, standardised features, so it cannot score new raw
# rows without the same encoders and scaler.
preprocessing_path = os.path.join(model_folder_path, "engagement_score_preprocessing.pkl")
joblib.dump({"scaler": scaler, "label_encoders": label_encoders}, preprocessing_path)
print(f"Preprocessing objects saved to: {preprocessing_path}")

# Score every row of the balanced, scaled feature matrix with the selected model
engagement_level_predictions = best_model.predict(X_scaled)

# Build the export frame: the balanced features, the resampled target (numeric
# level codes, stored under 'Engagement_Level') and the model's predicted code.
balanced_data = pd.DataFrame(X_balanced, columns=X.columns).assign(
    Engagement_Level=y_balanced,
    Predicted_Engagement_Level=engagement_level_predictions,
)

# Readable label for each predicted code
level_name_by_code = dict(enumerate(['Low', 'Medium', 'High', 'Very High']))
predicted_codes = balanced_data['Predicted_Engagement_Level']
balanced_data['Predicted_Engagement_Level_Label'] = predicted_codes.map(level_name_by_code)

# Saving the dataset with predictions to a new file
output_file_path = os.path.join(results_folder_path, "Balanced_BankChurners_with_Engagement_Score.csv")
balanced_data.to_csv(output_file_path, index=False)
print(f"\nProcessed data with predictions saved to: {output_file_path}")

# Displaying the results
# Print accuracy, AUC (when one was computed) and the classification report for
# every evaluated model, in the order the models were trained.
print("\n### Model Evaluation Results ###")
for name, metrics in results.items():
    print(f"\nModel: {name}")
    print(f"Accuracy: {metrics['Accuracy']}")
    # Compare against None explicitly: an AUC of 0.0 is a valid value but is
    # falsy, so a plain truthiness check would silently hide it.
    if metrics["AUC"] is not None:
        print(f"AUC: {metrics['AUC']}")
    print(f"Classification Report:\n{metrics['Classification Report']}")


### Data Investigation ###
Missing Values:
 CLIENTNUM                                                                                                                             0
Attrition_Flag                                                                                                                        0
Customer_Age                                                                                                                          0
Gender                                                                                                                                0
Dependent_count                                                                                                                       0
Education_Level                                                                                                                       0
Marital_Status                                                                                                                        0
Inc



Training Gradient Boosting...
Training Neural Network...
