In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from joblib import dump

In [5]:
# Step 1: Read the customer-segmentation CSV into the working DataFrame `df`.
# The relative path is resolved against the current working directory; no cleaning happens here.
df = pd.read_csv(filepath_or_buffer='data/customer_segmentation.csv')

In [6]:
# Step 2: Preprocessing - integer-encode the categorical feature columns.
# Every listed column of `df` is replaced by the codes of its own LabelEncoder;
# the fitted encoders are kept in `label_encoders`, keyed by column name.
# NOTE(review): `df` is overwritten in place, so re-running this cell without
# reloading `df` would refit the encoders on the integer codes.
categorical_cols = [
    'Gender',
    'Income_Level',
    'Occupation',
    'Marital_Status',
    'Education_Level',
    'Location',
    'Policy_Type',
    'Claim_History',
    'Payment_Preferences',
    'Marketing_Engagement',
    'Customer_Feedback',
    'Risk_Appetite',
]
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

In [7]:
# Separate the target column (Renewal_Behavior) from the features:
# `y` is that single column, `X` is every other column of `df`.
y = df['Renewal_Behavior']
X = df.drop(columns=['Renewal_Behavior'])

In [8]:
# Encode the target labels as integer class ids (one id per distinct label value;
# this does not collapse the classes into a binary target).
# The encoder is fitted on the untouched `df` column rather than on `y` itself so that
# the cell is safe to re-run: refitting on an already-encoded `y` would replace
# `le_target.classes_` with integer codes and lose the original label names.
le_target = LabelEncoder()
y = le_target.fit_transform(df['Renewal_Behavior'])

In [9]:
# Step 3: Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on `y` - consider `stratify=y` if classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Step 4: Standardize the features.
# The scaler is fitted on the training split only and then applied to both splits,
# so no statistics from the test rows are used for fitting.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Step 5: Instantiate the five candidate classifiers (fitting happens in the next cell).
# Keys are the display names used when reporting scores and naming the saved file.
models = dict(
    RandomForest=RandomForestClassifier(random_state=42),
    LogisticRegression=LogisticRegression(max_iter=1000, random_state=42),
    KNeighbors=KNeighborsClassifier(),
    SVC=SVC(probability=True, random_state=42),
    DecisionTree=DecisionTreeClassifier(random_state=42),
)

In [12]:
# Fit every candidate on the scaled training split and score it on the scaled test split.
# `model_scores` maps model name -> {'accuracy': ..., 'f1_score': ...} (F1 is class-weighted).
model_scores = {}

for name, model in models.items():
    # fit() returns the estimator itself, so fitting and predicting can be chained
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    model_scores[name] = dict(accuracy=accuracy, f1_score=f1)

    print(f"{name} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

RandomForest - Accuracy: 0.3130, F1 Score: 0.3126
LogisticRegression - Accuracy: 0.3250, F1 Score: 0.3189
KNeighbors - Accuracy: 0.3350, F1 Score: 0.3286
SVC - Accuracy: 0.3380, F1 Score: 0.3355
DecisionTree - Accuracy: 0.3410, F1 Score: 0.3412


In [13]:
# Step 6: Pick the model with the highest weighted F1 score on the test split.
# On a tie, the first model in insertion order wins.
# NOTE(review): the same test split is used both to score and to select the model.
best_model_name = max(model_scores.items(), key=lambda entry: entry[1]['f1_score'])[0]
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name} with F1 Score: {model_scores[best_model_name]['f1_score']:.4f}")


Best Model: DecisionTree with F1 Score: 0.3412


In [16]:
# Step 7: Save the best model under the 'model/' folder as <model name>.joblib
model_path = f"model/{best_model_name}.joblib"
# dump() does not create missing directories, so make sure the target folder exists first;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(best_model, model_path)
print(f"Best model saved to {model_path}")
# NOTE(review): only the estimator is persisted. It was trained on label-encoded,
# standardized features, so `scaler`, `label_encoders` and `le_target` are also needed
# to use it on new data - consider saving them as well.

Best model saved to model/DecisionTree.joblib
