## Import libraries

In [None]:
from google.colab import drive
import numpy as np
import pandas as pd
import os
from pathlib import Path
import glob

## Extract data

In [None]:
# 1) Imports & mount
# (`drive` is also imported in the imports cell; repeating it here is harmless.)
from google.colab import drive
drive.mount("/content/drive")

# 2) Paths -- all inputs and outputs of this step hang off one Drive folder
BASE = Path("/content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data")
PRODUCT_TXT = BASE.joinpath("product_meta_data.txt")
USER_TXT = BASE.joinpath("user_behavior_data.txt")
CHUNKS_DIR = BASE.joinpath("product_chunks")
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)

# 3) Load small user data
# Everything is read as text (dtype=str) so the underscore-joined list
# columns and the IDs are not reinterpreted as numbers.
user_df = pd.read_csv(
    USER_TXT,
    sep="\t",
    dtype=str,
    low_memory=False,
)
print("user_behaviour shape:", user_df.shape)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
user_behaviour shape: (173831, 7)


In [None]:
# 4) Stream big product TSV → parquet chunks
chunk_size = 200_000

# Open the reader first so that a missing/unreadable TSV fails *before* any
# existing chunk is touched; the `with` block closes the file handle afterwards.
with pd.read_csv(PRODUCT_TXT, sep="\t", dtype=str, chunksize=chunk_size, low_memory=False) as reader:
    # Remove chunk files left by an earlier run. A run with a larger
    # chunk_size writes fewer files, and the leftover higher-numbered chunks
    # would otherwise be globbed by the next cell and concatenated as
    # duplicate rows.
    for stale in CHUNKS_DIR.glob("product_chunk_*.parquet"):
        stale.unlink()

    for i, chunk in enumerate(reader):
        out = CHUNKS_DIR / f"product_chunk_{i:05d}.parquet"
        chunk.to_parquet(out, index=False)
        print(f"Saved chunk {i} ({len(chunk)} rows)")

In [None]:
# 5) Read all product chunks as one dataframe
# Sorting the paths keeps the row order equal to the original chunk order.
chunk_files = glob.glob(str(CHUNKS_DIR / "*.parquet"))
chunk_files.sort()
# Frames are read lazily through map() and stitched together in one concat.
product_df = pd.concat(map(pd.read_parquet, chunk_files), ignore_index=True)
print(product_df.shape)

(12141247, 13)


In [None]:
# Preview the first five user-behaviour rows.
user_df.head(n=5)

Unnamed: 0,query,candidate_wid_list,candidate_label_list,history_qry_list,history_wid_list,history_type_list,history_time_list
0,63995226602574196100244625575117,36098226_17729172_34921664_68317377_693764_360...,1.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0....,-1_-1_-1_68784030_36547097602574193663289_95...,9227588_52788392_4585436_87422901_57364733_210...,ORD_ORD_CART_CART_CART_CART_CART_CART_CART_CAR...,0_0_22967659_82448_432_10315_51_188_51_254173_...
1,1254963427438884,30857980_44577431_1485384_63372823_19754701_29...,1.0_1.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0....,9755613572694219_46916741_-1_-1_-1_-1_-1_4927...,18256630_46206270_65230659_86878921_32743602_5...,ORD_ORD_ORD_CART_CART_CART_CART_ORD_CART_CART_...,0_4813478_1115463_14061983_920_467229_304_1197...
2,24374376067768939206305,42155506_90092899_4336287_74203839_39478459_37...,1.0_1.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0....,-1_-1,8133925_59478082,CLICK_CLICK,0_27_49660
3,29463948632095349356845164880503931105432...,97234676_89827009_19973201_83859812_22172053_3...,1.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0....,3426234151020854598336931355834712920960_3...,49854616_56947170_16552251_91485122_25087051_7...,CLICK_CLICK_CLICK_CLICK_CLICK_CLICK_CLICK_CLIC...,0_12_4_6_4_14_6_6_4_611_58095_1524_481_20_363_...
4,2793914964365783663289,19711065_83045333_8307571_20337381_11912759_53...,1.0_1.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0....,25499566_24358647_606776896025741971768678_3...,68701_66196939_45800470_18419519_10090448_8073...,ORD_ORD_ORD_ORD_ORD_ORD_ORD_ORD_ORD_ORD_ORD_OR...,0_0_2015453_958482_0_79443_854291_13678_119386...


In [None]:
# Preview the first five product-metadata rows.
product_df.head(n=5)

Unnamed: 0,wid,name,brand_id,brand_name,cate_id_1,cate_name_1,cate_id_2,cate_name_2,cate_id_3,cate_name_3,cate_id_4,cate_name_4,shop_id
0,84303384,56009272602574199515329858899060608706386...,81493954,56009272,22259905,44386912,80083037,2787702149701337,13599099,1902766526921156,50485737,4746075788309887,86521100
1,58965086,27823200249212549809550743221450974479927...,15444334,27823200417877058264293359865766,68949654,7722037220300932,58682510,20300932,37322654,9194357497447992,99504987,9194357497447992,1462363
2,33811859,68082682602574199445877660257419536255837...,39479430,68082682,22259905,44386912,80083037,2787702149701337,93252909,20484919753274826921156,42385237,20484912257881342477615,86521100
3,90290292,89270446755918046025741917635585029871926...,75019956,8927044675591804,75739187,176355875210311,9682321,176355863043105,56674307,50298719,65420542,50298719,9644527
4,28171378,87851606417877054043879959056538602574195...,55412149,87851606417877054043879959865766,44687275,47087603,38428440,7678748775210311,56166561,5803341578344560,60897751,5803341578344560,60136421


## Helper functions

In [None]:
def parse_candidate_list(candidate_str):
    """Split an underscore-delimited string of candidate product IDs.

    Args:
        candidate_str: Product IDs joined by "_", e.g. "36098226_17729172".

    Returns:
        list[str]: The IDs in their original order, kept as strings.
        An empty string (or None) yields an empty list instead of [""],
        so that no blank ID ends up in downstream ID sets.
    """
    if not candidate_str:
        return []
    return candidate_str.split('_')

def parse_label_list(label_str):
    """Parse an underscore-delimited string of interaction labels into floats.

    Args:
        label_str: Labels joined by "_", e.g. "1.0_0.0_0.0".

    Returns:
        list[float]: One float per label, in original order. An empty
        string (or None) yields an empty list; previously "" raised
        ValueError because float("") is invalid.

    Raises:
        ValueError: If a non-empty token cannot be converted to float.
    """
    if not label_str:
        return []
    return [float(token) for token in label_str.split('_')]

def parse_history_queries(query_str):
    """Split the underscore-delimited history query list into its entries.

    No entry is interpreted or filtered: a "-1" placeholder (used for
    interactions that had no query, per the original note on this helper)
    comes back unchanged as the string "-1".
    """
    queries = query_str.split("_")
    return queries

def parse_history_wids(wid_str):
    """Split the underscore-delimited history product-ID list.

    IDs are returned as strings in their original order.
    """
    wids = wid_str.split("_")
    return wids

def parse_history_types(type_str):
    """Split the underscore-delimited interaction-type list.

    Each entry is a type code such as ORD, CLICK or CART, returned as-is.
    """
    interaction_types = type_str.split("_")
    return interaction_types

def parse_time_list(time_str):
    """Parse an underscore-delimited string of time intervals into integers.

    Args:
        time_str: Intervals joined by "_", e.g. "0_27_49660".

    Returns:
        list[int]: One int per interval, in original order. An empty
        string (or None) yields an empty list; previously "" raised
        ValueError because int("") is invalid.

    Raises:
        ValueError: If a non-empty token is not a valid integer.
    """
    if not time_str:
        return []
    return [int(token) for token in time_str.split('_')]

## Sampling #1: Remove products that appear in neither the candidate list nor the history wid list

In [None]:

# 1) Build the needed ID set (strings)
# Every product ID referenced by any session, either as a ranking candidate
# or as a past interaction; rows with a missing list are skipped via dropna().
cand_ids = {wid for s in user_df["candidate_wid_list"].dropna().astype(str)
            for wid in parse_candidate_list(s)}
hist_ids = {wid for s in user_df["history_wid_list"].dropna().astype(str)
            for wid in parse_history_wids(s)}
need_ids = cand_ids | hist_ids
print("unique needed wids:", len(need_ids))

# 2) Stream product chunks, filter, and write out
in_dir  = "/content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data/product_chunks/*.parquet"
out_dir = Path("/content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data/product_filtered")
out_dir.mkdir(parents=True, exist_ok=True)

# Output files are numbered by input position and only written when a chunk
# has matches, so a re-run does not necessarily overwrite every old file.
# Clear previous outputs so the next stage (which globs this folder) never
# reads stale rows.
for stale in out_dir.glob("product_filtered_*.parquet"):
    stale.unlink()

keep_cols = ["wid","name","brand_id","brand_name",
             "cate_id_1","cate_name_1","cate_id_2","cate_name_2",
             "cate_id_3","cate_name_3","cate_id_4","cate_name_4","shop_id"]

for i, f in enumerate(sorted(glob.glob(in_dir))):
    part = pd.read_parquet(f, columns=keep_cols)
    m = part["wid"].astype(str).isin(need_ids)
    n_kept = int(m.sum())  # count once; reused for the guard and the log line
    if n_kept:
        part.loc[m].to_parquet(out_dir / f"product_filtered_{i:05d}.parquet", index=False)
        print(f"✔ filtered -> {out_dir}/product_filtered_{i:05d}.parquet ({n_kept} rows)")
    del part  # release the chunk before the next one is read
print("Done (no big dataframe in RAM).")

unique needed wids: 12319978
✔ filtered -> /content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data/product_filtered/product_filtered_00000.parquet (190822 rows)
✔ filtered -> /content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data/product_filtered/product_filtered_00001.parquet (190830 rows)
✔ filtered -> /content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data/product_filtered/product_filtered_00002.parquet (190853 rows)
✔ filtered -> /content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data/product_filtered/product_filtered_00003.parquet (190822 rows)
✔ filtered -> /content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data/product_filtered/product_filtered_00004.parquet (190870 rows)
✔ filtered -> /content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data/product_filtered/product_filtered_00005.parquet (190947 rows)
✔ filtered -> /content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data/product_filtered/product_filtered_00006.parquet (190711 rows

## Sampling #2: Negative sampling
For each query session, we retain all positive samples (label > 0) and randomly select a subset of the negative samples (label = 0, i.e. no interaction) at a fixed 1:4 positive-to-negative ratio — up to four negatives per positive, or all negatives if the session has fewer — and discard the remaining negatives. Labels are read from `candidate_label_list`.

Product metadata is joined only for products that survive this sampling; products without metadata are excluded.

In [None]:
# 1) Negative sampling per query (keep all positives + 1:4 sampled negatives)
rng = np.random.default_rng(42)  # reproducible
NEG_PER_POS = 4  # negatives retained per positive (the "1:4" ratio)

kept_wids = set()      # every product ID that survives sampling, across all sessions
sampled_rows = []      # one (wid_list_str, label_list_str) tuple per user_df row

for wid_s, lab_s in user_df[["candidate_wid_list","candidate_label_list"]].itertuples(index=False):
    if not isinstance(wid_s, str) or not isinstance(lab_s, str):
        # Missing candidate data (e.g. NaN). A placeholder must still be
        # emitted: sampled_rows is later aligned with user_df.index, and
        # skipping the row would make that assignment fail on length.
        sampled_rows.append(("", ""))
        continue
    wids = parse_candidate_list(wid_s)
    labs = parse_label_list(lab_s)

    # Split candidates into positives (label > 0) and negatives (label == 0);
    # any other label value falls into neither bucket and is dropped.
    pos, neg = [], []
    for pair in zip(wids, labs):
        if pair[1] > 0.0:
            pos.append(pair)
        elif pair[1] == 0.0:
            neg.append(pair)

    # Cap at the negatives actually available; a session with no positives
    # therefore keeps no negatives either.
    k = min(NEG_PER_POS * len(pos), len(neg))
    neg_sample = rng.choice(len(neg), size=k, replace=False).tolist() if k > 0 else []
    neg_keep = [neg[i] for i in neg_sample]

    kept = pos + neg_keep
    kept_wids.update(w for w,_ in kept)

    # build a compact sampled candidate list/labels row (as strings)
    if kept:
        sampled_rows.append((
            "_".join(w for w,_ in kept),
            "_".join(f"{l:g}" for _,l in kept)
        ))
    else:
        sampled_rows.append(("", ""))

print("unique kept product IDs after sampling:", len(kept_wids))

# Guard the one-row-per-session invariant the assignment below relies on.
assert len(sampled_rows) == len(user_df), "sampled_rows must align 1:1 with user_df"

user_df_sampled = user_df.copy()
user_df_sampled[["candidate_wid_list","candidate_label_list"]] = pd.DataFrame(
    sampled_rows,
    index=user_df.index,
    columns=["candidate_wid_list","candidate_label_list"],
)

unique kept product IDs after sampling: 1269706


In [None]:
# Write the sampled sessions to CSV in the project's Drive data folder.
OUT_DIR = Path("/content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data")
OUT_DIR.mkdir(parents=True, exist_ok=True)

csv_path = OUT_DIR.joinpath("user_behavior_data_sampled.csv")
# index=False: the positional row index carries no information worth storing.
user_df_sampled.to_csv(csv_path, index=False)
print("saved:", csv_path)


saved: /content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data/user_behavior_data_sampled.csv


In [None]:
# 2) Stream-filter product parquet chunks to only the sampled products
in_glob  = "/content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data/product_filtered/*.parquet"
out_dir = Path("/content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data/product_filtered2")
out_dir.mkdir(parents=True, exist_ok=True)

# The loading step reads this whole directory, and files are only written for
# chunks that have matches. Remove outputs of earlier runs so rows from a
# previous sampling cannot leak into the final product table.
for stale in out_dir.glob("product_sampled_*.parquet"):
    stale.unlink()

keep_cols = ["wid","name","brand_id","brand_name",
             "cate_id_1","cate_name_1","cate_id_2","cate_name_2",
             "cate_id_3","cate_name_3","cate_id_4","cate_name_4","shop_id"]

total = 0
for i, f in enumerate(sorted(glob.glob(in_glob))):
    part = pd.read_parquet(f, columns=keep_cols)
    m = part["wid"].astype(str).isin(kept_wids)
    n_kept = int(m.sum())  # count once; reused for the guard, the total and the log
    if n_kept:
        out_f = out_dir / f"product_sampled_{i:05d}.parquet"
        part.loc[m].to_parquet(out_f, index=False)
        total += n_kept
        print(f"✔ {n_kept:,} kept -> {out_f.name}")
    del part  # release the chunk before the next one is read
print("total sampled product rows written:", total)


✔ 20,755 kept -> product_sampled_00000.parquet
✔ 20,795 kept -> product_sampled_00001.parquet
✔ 20,884 kept -> product_sampled_00002.parquet
✔ 20,731 kept -> product_sampled_00003.parquet
✔ 20,758 kept -> product_sampled_00004.parquet
✔ 20,827 kept -> product_sampled_00005.parquet
✔ 20,907 kept -> product_sampled_00006.parquet
✔ 20,581 kept -> product_sampled_00007.parquet
✔ 21,020 kept -> product_sampled_00008.parquet
✔ 20,942 kept -> product_sampled_00009.parquet
✔ 20,903 kept -> product_sampled_00010.parquet
✔ 20,705 kept -> product_sampled_00011.parquet
✔ 20,787 kept -> product_sampled_00012.parquet
✔ 20,626 kept -> product_sampled_00013.parquet
✔ 20,676 kept -> product_sampled_00014.parquet
✔ 20,727 kept -> product_sampled_00015.parquet
✔ 20,809 kept -> product_sampled_00016.parquet
✔ 20,783 kept -> product_sampled_00017.parquet
✔ 20,802 kept -> product_sampled_00018.parquet
✔ 20,736 kept -> product_sampled_00019.parquet
✔ 20,946 kept -> product_sampled_00020.parquet
✔ 20,879 kept

## Load into df and conduct simple data analysis

In [None]:
# Folder holding the per-chunk parquet files produced by the sampling step.
PROD_DIR = Path("/content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data/product_filtered2")

# Passing the directory reads every parquet file in it as one table;
# `keep_cols` comes from the filtering cell above.
product_df = pd.read_parquet(
    PROD_DIR,
    columns=keep_cols,
    dtype_backend="pyarrow",
)

n_rows, n_cols = product_df.shape
print((n_rows, n_cols))
product_df.head()


(1263429, 13)


Unnamed: 0,wid,name,brand_id,brand_name,cate_id_1,cate_name_1,cate_id_2,cate_name_2,cate_id_3,cate_name_3,cate_id_4,cate_name_4,shop_id
0,90290292,89270446755918046025741917635585029871926...,75019956,8927044675591804,75739187,176355875210311,9682321,176355863043105,56674307,50298719,65420542,50298719,9644527
1,84294794,35314470278253305644862259056538602574191...,90358683,56448622,920973,413810773160222581402210,72011611,4138107765056023,77105805,2883009146686422,14729718,2883009146686422,51346695
2,43985934,99002012107475224750389807345717281391786...,99906610,99002012,24675518,2053319081620728,45178979,976474541190979,41488568,10747522807345713385795688352935,8059148,10747522807345713385795688352935,71737804
3,58357343,37269793566964962025051838249885733255404...,67566476,37269793417877055669649659865766,98432384,49629523385795642329532,78943759,42329532,84880674,25517595,16697032,25517595,45062963
4,71803457,53536632602574194190735260257419201104367...,78161242,53536632417877055360263159865766,6867541,50134995,47765755,290298641345793,66118678,41907352,70549743,41907352,24505854


In [None]:
# No. of unique products
# Computed outside the f-string: reusing double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12.
n_unique_products = product_df["wid"].nunique()
print(f"No. of unique products: {n_unique_products}")

# Top categories
print(product_df["cate_name_1"].value_counts().head(10))

# Missingness overview
# Fraction of missing values per column, worst first (last expression → cell output).
product_df.isna().mean().sort_values(ascending=False).head(10)


No. of unique products: 1263429
cate_name_1
413810773160222581402210    101657
7722037220300932              90590
223100                         89302
3153887681590330              85364
4338732011476273              78669
9483389095217261              60256
5905495980470873              59222
80488822                       56203
163221                         46355
5889945488456828              42290
Name: count, dtype: int64[pyarrow]


Unnamed: 0,0
brand_name,0.062578
wid,0.0
name,0.0
brand_id,0.0
cate_id_1,0.0
cate_name_1,0.0
cate_id_2,0.0
cate_name_2,0.0
cate_id_3,0.0
cate_name_3,0.0
