# Customer Shopping Trends EDA/ML Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings 
# warnings.filterwarnings('ignore')

In [2]:
import kagglehub
import os

# Identifier of the Kaggle dataset handed to kagglehub
DATASET_HANDLE = "bhadramohit/customer-shopping-latest-trends-dataset"

# Fetch the latest version of the dataset; `path` is the local directory
# that holds the downloaded files.
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /home/dude/.cache/kagglehub/datasets/bhadramohit/customer-shopping-latest-trends-dataset/versions/1


In [3]:
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is the same on every run.
sorted(os.listdir(path))

['shopping_trends.csv']

In [4]:
# Load the CSV and discard the 'Customer ID' column in one expression.
# os.path.join builds the file path without hand-written separators, and
# drop() returns a new frame, so nothing is mutated in place.
df = pd.read_csv(os.path.join(path, 'shopping_trends.csv')).drop(columns='Customer ID')

In [11]:
from sklearn.model_selection import train_test_split

# 80/10/10 split: carve off 20% of the rows as a holdout, then halve the
# holdout into test and validation sets. random_state is fixed so the split
# is reproducible.
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=1)
test_df, val_df = train_test_split(holdout_df, test_size=0.5, random_state=1)

# The three splits together must account for every row of the original frame
assert len(train_df) + len(test_df) + len(val_df) == len(df)

# Make sure data is split correctly
print(f"Train Shape {train_df.shape}")
print(f"Test Shape {test_df.shape}")
print(f"Validation Shape {val_df.shape}")

Train Shape (3120, 18)
Test Shape (390, 18)
Validation Shape(390, 18)


In [12]:
# Summarise the training split: column names, non-null counts and dtypes
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3120 entries, 3304 to 1061
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       3120 non-null   int64  
 1   Gender                    3120 non-null   object 
 2   Item Purchased            3120 non-null   object 
 3   Category                  3120 non-null   object 
 4   Purchase Amount (USD)     3120 non-null   int64  
 5   Location                  3120 non-null   object 
 6   Size                      3120 non-null   object 
 7   Color                     3120 non-null   object 
 8   Season                    3120 non-null   object 
 9   Review Rating             3120 non-null   float64
 10  Subscription Status       3120 non-null   object 
 11  Payment Method            3120 non-null   object 
 12  Shipping Type             3120 non-null   object 
 13  Discount Applied          3120 non-null   object 
 14  Promo Code

In [13]:
# Preview the first rows of the training split
train_df.head()

Unnamed: 0,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Payment Method,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Preferred Payment Method,Frequency of Purchases
3304,30,Female,Handbag,Accessories,79,Mississippi,XL,Orange,Fall,4.8,No,Debit Card,Next Day Air,No,No,36,Bank Transfer,Every 3 Months
2756,42,Female,Hat,Accessories,23,Minnesota,L,Violet,Winter,4.1,No,Bank Transfer,Free Shipping,No,No,41,PayPal,Monthly
516,34,Male,Hoodie,Clothing,41,West Virginia,L,Indigo,Fall,2.8,Yes,Cash,2-Day Shipping,Yes,Yes,19,Debit Card,Bi-Weekly
3621,38,Female,Sneakers,Footwear,32,New Jersey,XL,Silver,Summer,4.0,No,Venmo,2-Day Shipping,No,No,5,Cash,Fortnightly
2872,63,Female,Jewelry,Accessories,48,Connecticut,L,Olive,Fall,2.6,No,Venmo,Next Day Air,No,No,5,Cash,Annually


## Plans

1. <b>Do simple EDA to try to answer some questions:</b>
    <br>- What are the most common payment types?
    <br>- Which seasons lead to the most money spent?
    <br>- Do reviews impact how much a customer spends?
2. <b>Do clustering to try to find useful, specific categories of customers</b>
3. <b>Create a total value feature and use it as the target for a regression</b>

## Creating total value feature<br>
This feature may be useful in EDA, and it will be necessary in the final step of the project.

In [15]:
# Show the training-split schema again (same call as the earlier info() cell)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3120 entries, 3304 to 1061
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       3120 non-null   int64  
 1   Gender                    3120 non-null   object 
 2   Item Purchased            3120 non-null   object 
 3   Category                  3120 non-null   object 
 4   Purchase Amount (USD)     3120 non-null   int64  
 5   Location                  3120 non-null   object 
 6   Size                      3120 non-null   object 
 7   Color                     3120 non-null   object 
 8   Season                    3120 non-null   object 
 9   Review Rating             3120 non-null   float64
 10  Subscription Status       3120 non-null   object 
 11  Payment Method            3120 non-null   object 
 12  Shipping Type             3120 non-null   object 
 13  Discount Applied          3120 non-null   object 
 14  Promo Code

This feature will be a predicted total customer value 5 years into the future. It will include the customer's total value to date plus their predicted value over the following 5 years. I believe this feature could help the business make beneficial decisions.

In [16]:
""" 
Features needed for this engineered feature: purchase amount, frequency of purchases, and previous purchases
"""

# Pull the three source columns out of the training split in one step
amount, prev_purchases, freq_purchases = (
    train_df[column]
    for column in (
        'Purchase Amount (USD)',
        'Previous Purchases',
        'Frequency of Purchases',
    )
)

In [21]:
# Distinct frequency labels present in the training split.
# The variable keeps its original spelling so any other reference to it
# still resolves.
unique_ferq = freq_purchases.unique().tolist()
unique_ferq

['Every 3 Months',
 'Monthly',
 'Bi-Weekly',
 'Fortnightly',
 'Annually',
 'Quarterly',
 'Weekly']

In [23]:
# Lookup table: purchase-frequency label -> number of purchases per year
WEEKS_PER_YEAR = 52
MONTHS_PER_YEAR = 12
QUARTERS_PER_YEAR = 4
FORTNIGHTS_PER_YEAR = WEEKS_PER_YEAR // 2  # 26

freq_map = dict([
    ('Every 3 Months', QUARTERS_PER_YEAR),
    ('Monthly', MONTHS_PER_YEAR),
    ('Bi-Weekly', FORTNIGHTS_PER_YEAR),
    ('Fortnightly', FORTNIGHTS_PER_YEAR),  # means the same thing as bi-weekly
    ('Annually', 1),
    ('Quarterly', QUARTERS_PER_YEAR),      # same thing as every 3 months
    ('Weekly', WEEKS_PER_YEAR),
])