<a href="https://colab.research.google.com/github/llawlaw23/Recommendation-System/blob/main/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import os
import gc
import re

In [None]:
os.makedirs("cleaned", exist_ok = True)

In [50]:
props1 = pd.read_csv("item_properties_part1.1.csv")
props2 = pd.read_csv("item_properties_part2.csv")
props = pd.concat([props1, props2])

In [11]:
props.shape

(20275902, 4)

In [None]:
props.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [None]:
events = pd.read_csv("events.csv")
cats = pd.read_csv("category_tree.csv")

### Data Understanding & Preprocessing

#### 1. Data Overview
- Brief description of the dataset ( size, number of features, number of columns and rows, and observations)
- Types of variables (numerical, categorical, datetime, etc.)
- Initial observations about the data (e.g., imbalance, missing data, outliers)

#### 2. Data Quality Checks
- Check for missing values and filling or dropping them.
- dropping columns.
- checking for outliers.

In [None]:
events["timestamp"] = pd.to_datetime(events["timestamp"], unit = "ms")
props["timestamp"] = pd.to_datetime(props["timestamp"], unit = "ms")

In [None]:
events.shape

(2756101, 5)

In [None]:
events.isnull().sum()

Unnamed: 0,0
timestamp,0
visitorid,0
event,0
itemid,0
transactionid,2733644


In [None]:
events = events.drop(columns = ["transactionid"])

In [None]:
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid
0,2015-06-02 05:02:12.117,257597,view,355908
1,2015-06-02 05:50:14.164,992329,view,248676
2,2015-06-02 05:13:19.827,111016,view,318965
3,2015-06-02 05:12:35.914,483717,view,253185
4,2015-06-02 05:02:17.106,951259,view,367447


In [None]:
# map view = 1 add to chart = 3 and purchase  = 5, replacing it in the old column. dataype to int
event_weights = {"view": 1, "addtocart": 3, "transaction": 5}
events["event"] = events["event"].map(event_weights)

In [None]:
# change datatype of event to int
events["event"] = events["event"].fillna(0).astype(int)

In [None]:
events.head(20)

Unnamed: 0,timestamp,visitorid,event,itemid
0,2015-06-02 05:02:12.117,257597,1,355908
1,2015-06-02 05:50:14.164,992329,1,248676
2,2015-06-02 05:13:19.827,111016,1,318965
3,2015-06-02 05:12:35.914,483717,1,253185
4,2015-06-02 05:02:17.106,951259,1,367447
5,2015-06-02 05:48:06.234,972639,1,22556
6,2015-06-02 05:12:03.240,810725,1,443030
7,2015-06-02 05:34:51.897,794181,1,439202
8,2015-06-02 04:54:59.221,824915,1,428805
9,2015-06-02 05:00:04.592,339335,1,82389


In [None]:
# min in visitorid
events["visitorid"].min()

0

checking for outliers and clearing them

In [None]:
visitor_event = events.groupby("visitorid")["event"].count()

In [None]:
# print in descending order
visitor_event = visitor_event.sort_values(ascending = False)

In [None]:
visitor_event.head(20)

Unnamed: 0_level_0,event
visitorid,Unnamed: 1_level_1
1150086,7757
530559,4328
152963,3024
895999,2474
163561,2410
371606,2345
286616,2252
684514,2246
892013,2024
861299,1991


In [None]:
Q1 = visitor_event.quantile(0.25)
Q3 = visitor_event.quantile(0.75)
IQR = Q3 - Q1

In [None]:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [None]:
upper_bound

np.float64(3.5)

In [None]:
lower_bound

np.float64(-0.5)

In [None]:
outlier_visitor_ids = visitor_event[(visitor_event < lower_bound) | (visitor_event > upper_bound)].index

In [None]:
is_outlier = events["visitorid"].isin(outlier_visitor_ids)

In [None]:
print (is_outlier)

0          False
1           True
2          False
3          False
4          False
           ...  
2756096    False
2756097    False
2756098    False
2756099    False
2756100    False
Name: visitorid, Length: 2756101, dtype: bool


In [None]:
events_F = events[~is_outlier]

In [None]:
events_F.head()

Unnamed: 0,timestamp,visitorid,event,itemid
0,2015-06-02 05:02:12.117,257597,1,355908
2,2015-06-02 05:13:19.827,111016,1,318965
3,2015-06-02 05:12:35.914,483717,1,253185
4,2015-06-02 05:02:17.106,951259,1,367447
5,2015-06-02 05:48:06.234,972639,1,22556


In [None]:
events_F.shape

(1652380, 4)

In [27]:
props.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [51]:
itemid_value = props.groupby("itemid")["value"].count()

In [52]:
Q1 = itemid_value.quantile(0.25)
Q3 = itemid_value.quantile(0.75)
IQR = Q3 - Q1

In [53]:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [54]:
outlier_itemid_value = itemid_value[(itemid_value < lower_bound) | (itemid_value > upper_bound)].index

In [55]:
is_outlier = props["itemid"].isin(outlier_itemid_value)

In [56]:
props = props[~is_outlier]

In [57]:
props.shape

(17520979, 4)

In [58]:
# clean the value column and keep values with n attached to them and add them if they are multiple
def sum_n_values(text):
    nums = re.findall(r"n([\d\.]+)", str(text))
    nums = [float(x) for x in nums]
    return sum(nums) if nums else None

props["value_n"] = props["value"].apply(sum_n_values)

In [59]:
props = props.drop(columns = ["value"])

In [45]:
props.head()

Unnamed: 0,timestamp,itemid,property,value_n
0,1435460400000,460429,categoryid,
1,1441508400000,206783,888,277.2
2,1439089200000,395014,400,1272.0
3,1431226800000,59481,790,15360.0
4,1431831600000,156781,917,


In [60]:
# Full null with zero meaning the item as no price
props["value_n"] = props["value_n"].fillna(0)

In [61]:
props.head()

Unnamed: 0,timestamp,itemid,property,value_n
0,1435460400000,460429,categoryid,0.0
1,1441508400000,206783,888,277.2
2,1439089200000,395014,400,1272.0
3,1431226800000,59481,790,15360.0
4,1431831600000,156781,917,0.0


In [None]:
cats.isnull().sum()

Unnamed: 0,0
categoryid,0
parentid,25


In [None]:
cats = cats.dropna()

In [62]:
# saved the clean csv as an new file in the cleaned folder and delete the previous one
props.to_csv("props_cleaned.csv", index = False)
del props; gc.collect()

0

In [None]:
# saved the clean csv as an new file in the cleaned folder and delete the previous one
events_F.to_csv("events_cleaned.csv", index = False)
del events_F; gc.collect()

In [None]:
# saved the clean csv as an new file in the cleaned folder and delete the previous one
cats.to_csv("cats_cleaned.csv", index = False)
del cats; gc.collect()

In [63]:
# Loading cleaned csv files
props_cleaned = pd.read_csv("props_cleaned.csv")

events_cleaned = pd.read_csv("events_cleaned.csv")

cats_cleaned = pd.read_csv("cats_cleaned.csv")

Creating My user-item matric and item feature Matrix

In [4]:
from scipy.sparse import csr_matrix
import pandas as pd
# from implicit.als import AlternatingLeastSquares
from scipy.sparse import coo_matrix
import numpy as np

In [153]:
events_cleaned.head()

Unnamed: 0,timestamp,visitorid,event,itemid
0,2015-06-02 05:02:12.117,257597,1,355908
1,2015-06-02 05:13:19.827,111016,1,318965
2,2015-06-02 05:12:35.914,483717,1,253185
3,2015-06-02 05:02:17.106,951259,1,367447
4,2015-06-02 05:48:06.234,972639,1,22556


In [5]:
visitor2_idx = {v: i for i, v in enumerate(events_cleaned["visitorid"].unique())}
item2_idx = {v: i for i, v in enumerate(events_cleaned["itemid"].unique())}

In [6]:
events_cleaned["visitor_idx"] = events_cleaned["visitorid"].map(visitor2_idx,)

events_cleaned["item_idx"] = events_cleaned["itemid"].map(item2_idx)

In [7]:
user_item_matrix = coo_matrix(
    (events_cleaned["event"], (events_cleaned["visitor_idx"], events_cleaned["item_idx"]))
)

In [9]:
categories = props["property"].unique().compute().tolist()

NameError: name 'props' is not defined

In [None]:
props["property"] = props["property"].astype(pd.CategoricalDtype(categories = categories))

In [None]:
item_feature_matrix = props.pivot_table(
    index = "itemid",
    columns = "property",
    values = "value",
    aggfunc = "first"
)

In [None]:
print(item_feature_matrix)

Dask DataFrame Structure:
                   888      400      790      451        0      566      134       19      881      188      663      619       60      917      202      719      964      681      810      175      631     1075      720      689      314      784      807      671      931      283      891      726      607      544      942      758     1083      225      987      204      569     1058     1066      141      884      761      674     1054      389      104      441      452      221      976      456      423      281      468      470      319      749      994      900      603      805      981      102      307      909      669      771      465      586      546      275      495      638      685      206      768      372      120       96      936       69      841      594      611      562      961      966      654      373      542       42      419      795       30      734      953      645      277      803       98      140      220       

Merging With Item properties and categories

In [None]:
coo = user_item_matrix.tocoo()

In [None]:
visitor_ids = events_F["visitorid"].unique().compute()
item_ids = events_F["itemid"].unique().compute()

In [None]:
events_idx = events_F.compute()
events_idx["visitor_idx"] = events_idx["visitor_idx"].astype("int32")
events_idx["item_idx"] = events_idx["item_idx"].astype("int32")

In [None]:
events_idx = events_idx.dropna(subset=["visitor_idx", "item_idx"])

In [None]:
coo = coo_matrix(
    (events_idx["interaction_strength"],
     (events_idx["visitor_idx"], events_idx["item_idx"]))
)

In [None]:
df = pd.DataFrame({
    "visitor_idx": coo.row,
    "item_idx": coo.col,
    "interaction_strength": coo.data
})

In [None]:
events_idx["visitor_idx"] = events_idx["visitorid"].map(visitor2idx)
events_idx["item_idx"] = events_idx["itemid"].map(item2idx)

In [None]:
item_feature_matrix = item_feature_matrix.compute()

user_item_features = df.merge(
    item_feature_matrix,
    on="itemid",
    how="left"
)

MemoryError: Unable to allocate 1.87 GiB for an array with shape (251065907,) and data type int64

model fitting

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_df, test_df = train_test_split(
    user_item_features,
    test_size=0.2,
    random_state=42
)

In [None]:

X_train = train_df.drop(["visitorid", "itemid", "interaction_strength"], axis=1)
y_train = train_df["interaction_strength"]

In [None]:
X_test = test_df.drop(["visitorid", "itemid", "interaction_strength"], axis=1)
y_test = test_df["interaction_strength"]

In [None]:
als_model = AlternatingLeastSquares(factors = 50, regularization = 0.1, iterations = 20)
als_model.fit(user_item_matrix)

In [None]:
user_recs = {}
N = 10

for user_idx in range(user_item_matrix.shape[0]):
    recommended = als_model.recommend(user_idx, user_item_matrix[user_idx], N = N)
    user_recs[user_idx] = [(item_idx, score) for item_idx, score in recommended]

In [None]:
recs_df = pd.DataFrame(
    [(user, item, score) for user, recs in user_recs.items() for item, score in recs],
    columns = ["visitor_idx", "item_idx", "score"]
)

### Data Visualization and Analysing Business Question




- How do user demographics influence their preferences and interactions?

- What products are frequently purchased or viewed together?

- How do popularity metrics evolve monthly or seasonally?

- Which items frequently co-occur in transactions?

- How do preferences differ across age, location, or other demographics?

In [162]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

Q1
- How many unique visitors in a month?

In [156]:
events_cleaned["timestamp"] = pd.to_datetime(events_cleaned["timestamp"])

In [157]:
events_cleaned.head()

Unnamed: 0,timestamp,visitorid,event,itemid
0,2015-06-02 05:02:12.117,257597,1,355908
1,2015-06-02 05:13:19.827,111016,1,318965
2,2015-06-02 05:12:35.914,483717,1,253185
3,2015-06-02 05:02:17.106,951259,1,367447
4,2015-06-02 05:48:06.234,972639,1,22556


In [158]:
events_cleaned["year_month"] = events_cleaned["timestamp"].dt.to_period("M")

In [159]:
monthly_visitors = events_cleaned.groupby("year_month")["visitorid"].nunique().reset_index()

In [160]:

print(monthly_visitors.head(10))

  year_month  visitorid
0    2015-05     274302
1    2015-06     276649
2    2015-07     337179
3    2015-08     279387
4    2015-09     155952


In [164]:
monthly_visitors["year_month"] = monthly_visitors["year_month"].astype(str)

In [170]:
color_map = {
    "2015-05": "#F564A9",
    "Jun 2015": "#98A1BC",
    "Jul 2015": "#DED3C4",
    "Aug 2015": "#264653",
    "Sep 2015": "#670D2F"
}

fig = px.bar(
    monthly_visitors,
    x = "year_month",
    y = "visitorid",
    color_discrete_map = color_map,
    title = "Unique Visitors per Month",
    labels = {"year_month": "Month", "visitorid": "Unique Visitors"},
    text = "visitorid"
)
fig.update_layout(
    legend_title_text = "Month",
    xaxis_title = "Month",
    yaxis = dict(
        showticklabels = False
    )
)

fig.show()

Q2
- What patterns exist in user-item interactions over time?

In [172]:
monthly_interactions = (
    events_cleaned.groupby("year_month")["visitorid"]
    .count()
    .reset_index(name="total_interactions")
)

In [174]:
monthly_interactions["year_month"] = monthly_interactions["year_month"].dt.to_timestamp()


In [175]:

fig = px.bar(
    monthly_interactions,
    x = "year_month",
    y = "total_interactions",
    title = "Monthly User-Item Interactions",
    labels = {"year_month": "Month", "total_interactions": "Number of Interactions"},
)

fig.update_layout(
    legend_title_text = "Month",
    xaxis_title = "Month",
    yaxis = dict(
        showticklabels = False
    )
)

fig.show()
