<a href="https://colab.research.google.com/github/omdgn/Ecommerce-_recommendation-_system/blob/main/commerce_rec_machine_learning_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Step 1: Load the Dataset
# The CSV is read from the Colab working directory into a single DataFrame.
file_path = '/content/shopping_trends_updated.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Step 2: Problem Type and Missing Data Handling
# Problem Type: Classification (Recommended / Not Recommended)
# Only these columns are carried forward into encoding and modelling.
selected_columns = [
    'Previous Purchases',
    'Review Rating',
    'Frequency of Purchases',
    'Season',
    'Category',
    'Item Purchased',
    'Purchase Amount (USD)',
    'Subscription Status',
]

# Restrict the frame to the chosen columns (raises KeyError if one is absent).
df_selected = df.loc[:, selected_columns]

# Per-column count of missing entries; this step only reports the counts,
# it does not impute or drop anything.
missing_values = df_selected.isna().sum()
print("Missing Values Before Handling:\n", missing_values)

# Step 3: Encode Categorical Data and Create Labels
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the dummies are not perfectly collinear.
encoded_columns = [
    'Frequency of Purchases',
    'Season',
    'Category',
    'Item Purchased',
    'Subscription Status',
]
df_encoded = pd.get_dummies(df_selected, columns=encoded_columns, drop_first=True)

# A row is labelled "Recommended" only when all three thresholds are exceeded.
# NOTE(review): the three columns used here are not encoded away, so they stay
# in df_encoded next to the label that is derived from them -- confirm this is
# intended, since the label is then a deterministic function of the features.
rating_above_threshold = df_selected['Review Rating'].gt(3.0)
enough_previous_purchases = df_selected['Previous Purchases'].gt(10)
amount_above_threshold = df_selected['Purchase Amount (USD)'].gt(50)
meets_all_criteria = (
    rating_above_threshold & enough_previous_purchases & amount_above_threshold
)

# Store the label directly as descriptive text.
df_encoded['Recommended'] = meets_all_criteria.map(
    {True: "Recommended", False: "Not Recommended"}
)

# Step 4: Split the Dataset, then Normalize
# The split is done BEFORE scaling so that the scaler's per-column min/max are
# learned from the training rows only. Fitting the scaler on the full dataset
# would leak test-set statistics into the training features.
feature_frame = df_encoded.drop(columns=['Recommended'])
label_series = df_encoded['Recommended']

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    feature_frame, label_series, test_size=0.2, random_state=42
)

# Fit on the training rows, then apply the same transformation to the test rows.
# Index and column names are passed explicitly so the scaled frames stay aligned
# with y_train / y_test and do not depend on the label being the last column.
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

# Full scaled dataset (using the train-fitted scaler), kept under the original
# names so any code that reads them keeps working.
scaled_features = scaler.transform(feature_frame)
df_encoded_scaled = pd.DataFrame(
    scaled_features,
    columns=feature_frame.columns,
    index=feature_frame.index,
)
df_encoded_scaled['Recommended'] = label_series
X = df_encoded_scaled.drop(columns=['Recommended'])
y = df_encoded_scaled['Recommended']

print(f"Training Set Size: {X_train.shape}")
print(f"Test Set Size: {X_test.shape}\n")

# Step 5: Train and Evaluate Models
def _fit_and_report(title, model, X_fit, y_fit, X_eval, y_eval):
    """Fit `model`, print its classification report, and return the results.

    Parameters:
        title: Model name used in the printed "<title> Performance:" header.
        model: Unfitted estimator exposing fit() and predict(); fitted in place.
        X_fit, y_fit: Training features and labels.
        X_eval, y_eval: Held-out features and labels used for the report.

    Returns:
        (predictions, fit_seconds): predictions for X_eval and the wall-clock
        duration of the fit() call in seconds.
    """
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # interval cannot be distorted by system clock adjustments.
    started = time.perf_counter()
    model.fit(X_fit, y_fit)
    fit_seconds = time.perf_counter() - started

    predictions = model.predict(X_eval)

    print(f"{title} Performance:")
    print(classification_report(y_eval, predictions))
    print(f"Training Time: {fit_seconds:.4f} seconds\n")
    return predictions, fit_seconds


# Logistic Regression
lr_model = LogisticRegression()
y_pred_lr, lr_train_time = _fit_and_report(
    "Logistic Regression", lr_model, X_train, y_train, X_test, y_test
)

# SVM with RBF kernel
# NOTE(review): per the scikit-learn docs, probability=True makes fit() run an
# additional internal cross-validation, so the SVM training time reported here
# includes that overhead. Drop the flag if predict_proba is never needed.
svm_model = SVC(probability=True, kernel='rbf')
y_pred_svm, svm_train_time = _fit_and_report(
    "SVM", svm_model, X_train, y_train, X_test, y_test
)

# K-Nearest Neighbors (KNN)
knn_model = KNeighborsClassifier(n_neighbors=3)
y_pred_knn, knn_train_time = _fit_and_report(
    "K-Nearest Neighbors (KNN)", knn_model, X_train, y_train, X_test, y_test
)

# Step 6: Performance Comparison
print("\n--- Performance Comparison ---")

# Test-set predictions keyed by the column header each model gets in the table.
predictions_by_model = {
    "Logistic Regression": y_pred_lr,
    "SVM": y_pred_svm,
    "KNN": y_pred_knn,
}

# One row per metric; each model adds a column with its scores in the same
# metric order. Precision, recall and F1 treat "Recommended" as the positive class.
comparison_metrics = {"Metric": ["Accuracy", "Precision", "Recall", "F1-Score"]}
for model_label, model_predictions in predictions_by_model.items():
    comparison_metrics[model_label] = [
        accuracy_score(y_test, model_predictions),
        precision_score(y_test, model_predictions, pos_label="Recommended"),
        recall_score(y_test, model_predictions, pos_label="Recommended"),
        f1_score(y_test, model_predictions, pos_label="Recommended"),
    ]

comparison_df = pd.DataFrame(comparison_metrics)
print(comparison_df)


Missing Values Before Handling:
 Previous Purchases        0
Review Rating             0
Frequency of Purchases    0
Season                    0
Category                  0
Item Purchased            0
Purchase Amount (USD)     0
Subscription Status       0
dtype: int64
Training Set Size: (3120, 40)
Test Set Size: (780, 40)

Logistic Regression Performance:
                 precision    recall  f1-score   support

Not Recommended       0.84      0.89      0.86       485
    Recommended       0.80      0.72      0.76       295

       accuracy                           0.83       780
      macro avg       0.82      0.80      0.81       780
   weighted avg       0.82      0.83      0.82       780

Training Time: 0.0317 seconds

SVM Performance:
                 precision    recall  f1-score   support

Not Recommended       0.87      0.92      0.89       485
    Recommended       0.85      0.78      0.81       295

       accuracy                           0.86       780
      macro avg   