Assignment 3 - Support Vector Machines

In [1]:
import os

# Location of the electronic-sales CSV used by the rest of the notebook.
# The default is the original author's local download path; set the
# SALES_DATA_CSV environment variable to point at the file on another machine
# without having to edit the notebook.
file_path = os.environ.get(
    "SALES_DATA_CSV",
    "C:/Users/johne/Downloads/Electronic_sales_Sep2023-Sep2024.csv",
)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Load the dataset
sales_data = pd.read_csv(file_path)

# Data preprocessing: add customer-segmentation features to every row.
#
# Both features are per-customer aggregates that ``transform`` broadcasts back
# onto each of that customer's rows, so the frame keeps one row per purchase.
# NOTE(review): the aggregates are computed over the full dataset before the
# train/test split, and the split is row-wise, so rows of the same customer can
# end up on both sides -- consider a group-aware split if that matters.

# Customer Lifetime Value (CLV): sum of 'Total Price' over the customer's rows
sales_data['Customer_Lifetime_Value'] = (
    sales_data.groupby('Customer ID')['Total Price'].transform('sum')
)

# Purchase Frequency: number of rows (purchases) recorded for the customer
sales_data['Purchase_Frequency'] = (
    sales_data.groupby('Customer ID')['Customer ID'].transform('count')
)

# Verify the new columns. A bare ``.head()`` expression in the middle of a cell
# is evaluated and discarded, so print it explicitly.
print(sales_data[['Customer ID', 'Customer_Lifetime_Value', 'Purchase_Frequency']].head())

# Binary outcome variable for loyalty membership: 'Yes' -> 1, anything else
# (including 'No' and missing values) -> 0. Vectorized comparison instead of a
# per-row lambda; the resulting 0/1 values are the same.
sales_data['Loyalty Member'] = sales_data['Loyalty Member'].eq('Yes').astype(int)

# Select features (more columns can be added to this list if needed)
feature_columns = ['Age', 'Quantity', 'Purchase_Frequency', 'Customer_Lifetime_Value']
X = sales_data[feature_columns]
y = sales_data['Loyalty Member']

# Split the data into training and testing sets.
# stratify=y keeps the loyalty-member ratio identical in both splits, so the
# test metrics are not skewed by an uneven draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features. Both SMOTE (nearest-neighbour interpolation) and the
# SVM work on distances/margins, so without scaling the feature with the largest
# numeric range would dominate the others. The scaler is fitted on the training
# split only and then applied to the test split to avoid leaking test statistics.
# X_train / X_test are re-bound to their scaled versions (same columns and index)
# so every later step that uses them sees data in the space the model was fit on.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Apply SMOTE to balance the training data (the test split is left untouched)
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Initialize the Support Vector Classifier (SVC) with balanced class weights.
# probability=True enables predict_proba, which the threshold tuning relies on.
# NOTE(review): after SMOTE the classes should already be balanced, so
# class_weight='balanced' is presumably redundant here -- kept as in the original.
svm_model_balanced = SVC(kernel='linear', class_weight='balanced', random_state=42, probability=True)

# Train the model on the resampled training data
svm_model_balanced.fit(X_train_res, y_train_res)

# Predict the probability of the positive class (column 1) on the test set;
# these scores feed the custom decision threshold applied next.
y_pred_proba = svm_model_balanced.predict_proba(X_test)[:, 1]

# Custom decision threshold: a row is labelled 1 when its predicted probability
# is at least `threshold`. 0.4 is more lenient towards the positive class than
# the conventional 0.5; adjust it to trade precision against recall.
threshold = 0.4
y_pred_custom_threshold = (y_pred_proba >= threshold).astype(int)

# Evaluate the model performance with the custom threshold.
# zero_division=0: if a class is never predicted its precision is undefined;
# report that as 0 rather than 1 so the report does not overstate performance.
conf_matrix_custom = confusion_matrix(y_test, y_pred_custom_threshold)
class_report_custom = classification_report(y_test, y_pred_custom_threshold, zero_division=0)

# Output the confusion matrix and classification report
print("Confusion Matrix:\n", conf_matrix_custom)
print("\nClassification Report:\n", class_report_custom)

# Plot the evaluation results: the ROC curve first, then the confusion matrix.

# --- ROC curve -------------------------------------------------------------
# roc_curve computes false/true positive rates from the predicted
# probabilities; auc integrates the resulting curve.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

roc_ax = plt.figure().add_subplot(111)
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.legend(loc="lower right")
plt.show()

# --- Confusion matrix heatmap ----------------------------------------------
# Integer cell annotations, no colour bar; sns.heatmap returns the Axes it
# drew on, which is then labelled directly.
heatmap_ax = sns.heatmap(conf_matrix_custom, annot=True, fmt="d", cmap="Blues", cbar=False)
heatmap_ax.set_title('Confusion Matrix')
heatmap_ax.set_ylabel('True Label')
heatmap_ax.set_xlabel('Predicted Label')
plt.show()
