In [2]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt


# Location of the shopping-trends dataset, relative to the notebook's working directory.
CSV_PATH = 'shopping_trends_updated.csv'

# Read the whole file into a DataFrame and preview its first rows.
data = pd.read_csv(CSV_PATH)
data.head()



Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Customer ID,Age,Purchase Amount (USD),Review Rating,Previous Purchases
count,3900.0,3900.0,3900.0,3900.0,3900.0
mean,1950.5,44.068462,59.764359,3.749949,25.351538
std,1125.977353,15.207589,23.685392,0.716223,14.447125
min,1.0,18.0,20.0,2.5,1.0
25%,975.75,31.0,39.0,3.1,13.0
50%,1950.5,44.0,60.0,3.7,25.0
75%,2925.25,57.0,81.0,4.4,38.0
max,3900.0,70.0,100.0,5.0,50.0


In [64]:

# Target: the item each customer purchased.
y = data['Item Purchased']

# Features: every other column of the dataset.
X = data.drop(columns=['Item Purchased'])

# One-hot encode the features only (all dummy levels kept), then discard the
# customer identifier if it is present.
X_encoded = (
    pd.get_dummies(X, drop_first=False)
    .drop(columns=['Customer ID'], errors='ignore')
)

In [65]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


# Learn the item-name -> integer mapping from the target, then apply it.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# The encoded labels are integers starting from 0.
print("Encoded categories:", y_encoded)



Encoded categories: [ 2 23 11 ...  1 17  7]


In [66]:
# Display the raw (un-encoded) 'Item Purchased' labels for comparison with y_encoded.
y

0         Blouse
1        Sweater
2          Jeans
3        Sandals
4         Blouse
          ...   
3895      Hoodie
3896    Backpack
3897        Belt
3898       Shoes
3899     Handbag
Name: Item Purchased, Length: 3900, dtype: object

In [67]:
import numpy as np
from sklearn.model_selection import train_test_split


# Hold out 25% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): the split is not stratified on the target - consider
# stratify=y_encoded if per-class balance between train and test matters.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.25, random_state=42)

# Number of distinct item labels, derived from the encoded target.
num_classes = len(np.unique(y_encoded))

# Multi-class gradient-boosted trees. `random_state` is the documented seed
# parameter of XGBoost's scikit-learn wrapper (rather than the legacy `seed`
# keyword).
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=num_classes,
    max_depth=3,
    n_estimators=50,
    random_state=42,
)

xgb_model.fit(X_train, y_train)

In [68]:
# Predict on the training partition first, then on the held-out partition.
y_train_pred, y_test_pred = (
    xgb_model.predict(features) for features in (X_train, X_test)
)

# Fraction of exactly matching labels on each partition.
train_accuracy, test_accuracy = (
    accuracy_score(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print(f"Train Accuracy: {train_accuracy:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

Train Accuracy: 0.67
Test Accuracy: 0.14


In [69]:
# Per-column importance scores reported by the fitted model.
importances = xgb_model.feature_importances_

# Tabulate every encoded feature column next to its importance score.
feature_importance_df = pd.DataFrame({'Feature': X_train.columns}).assign(
    Importance=importances
)

In [70]:
# Pair every encoded column with the importance the fitted model assigned to it.
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': xgb_model.feature_importances_
})


def _original_feature(encoded_name, source_columns):
    """Return the pre-encoding column that `encoded_name` was derived from.

    pd.get_dummies() leaves numeric columns unchanged and names each dummy
    column '<column>_<value>'. Matching against the real column names (longest
    first, so the most specific name wins) keeps column names that themselves
    contain '_' intact, which a plain split('_')[0] would truncate.

    Parameters:
        encoded_name: a column name of the one-hot encoded feature matrix.
        source_columns: the column names of the frame before encoding.

    Returns:
        The matching source column, or the text before the first '_' when no
        source column matches.
    """
    for column in sorted(source_columns, key=len, reverse=True):
        if encoded_name == column or encoded_name.startswith(f"{column}_"):
            return column
    # No known column matches: fall back to the text before the first '_'.
    return encoded_name.split('_')[0]


# Sum the importance of one-hot features for each original feature
original_feature_importances = feature_importance_df.copy()
original_feature_importances['OriginalFeature'] = feature_importance_df['Feature'].map(
    lambda name: _original_feature(name, X.columns)
)
original_feature_importances = (
    original_feature_importances
    .groupby('OriginalFeature')
    .agg({'Importance': 'sum'})
    .reset_index()
)

# Sort by importance
original_feature_importances = original_feature_importances.sort_values(by='Importance', ascending=False)

print(original_feature_importances)

           OriginalFeature  Importance
1                 Category    0.400960
6                 Location    0.221255
2                    Color    0.137295
4   Frequency of Purchases    0.051957
13           Shipping Type    0.041007
7           Payment Method    0.038144
14                    Size    0.033930
12                  Season    0.025890
3         Discount Applied    0.008486
10   Purchase Amount (USD)    0.007723
11           Review Rating    0.007334
8       Previous Purchases    0.007177
0                      Age    0.007035
15     Subscription Status    0.006159
5                   Gender    0.005648
9          Promo Code Used    0.000000


In [None]:
# After mapping the one-hot encoded columns back to their original features, we listed the importance of each variable.

In [40]:
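# We found that "Category" and "Location" are the most important features for predicting which item a customer will purchase.
# Caveat: test accuracy is only 0.14 (vs. 0.67 on train), so these importances come from a model that generalizes poorly.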

In [None]:
# Use the trained model to recommend an item to a new customer.

In [77]:
# Attributes of the single customer we want a recommendation for.
customer_profile = {
    'Age': 25,
    'Gender': 'Female',
    'Category': 'Clothing',
    'Location': 'Oregon',
    'Size': 'M',
    'Color': 'Red',
    'Season': 'Winter',
    'Subscription Status': 'Yes',
    'Discount Applied': 'Yes',
    'Frequency of Purchases': 'Annually',
    'Review Rating': 5,
    'Shipping Type': 'Free Shipping',
    'Payment Method': 'Credit Card',
    'Previous Purchases': 20,
    'Purchase Amount (USD)': 90,
    'Promo Code Used': 'No',
}

# Wrap each value in a one-element list so the mapping describes one row.
new_customer_data = {field: [value] for field, value in customer_profile.items()}

# Build the one-row DataFrame for the new customer.
new_data_df = pd.DataFrame(new_customer_data)

In [78]:
# One-hot encode the new record, then align it to the training columns:
# columns absent from the record are filled with 0, and columns the training
# matrix does not have are dropped.
new_data_encoded = pd.get_dummies(new_data_df).reindex(
    columns=X_train.columns, fill_value=0
)

In [79]:
# Predict the encoded class for the aligned customer record.
predicted_class = xgb_model.predict(new_data_encoded)

# Translate the integer class back into its item name and keep the first entry.
decoded_items = label_encoder.inverse_transform(predicted_class)
predicted_item = decoded_items[0]

print(f"Predicted Item Purchased: {predicted_item}")

Predicted Item Purchased: T-shirt
