In [1]:
import numpy as np
import pandas as pd
from datasets import Dataset
import sys
import json
import jsonlines
print(sys.executable)
# python version 3.11.9

# --- Input / output locations (all relative to the notebook directory) ---
# Raw product collection, read below as a headerless TSV with the columns
# id, title, description.
COLLECTION_PATH = '../data/collection.tsv'
# Cleaned collection written at the end of the notebook (full frame).
COLLECTION_OUTPUT_PATH = '../data/p_collection.tsv'
# First 10000 rows of the cleaned collection.
COLLECTION_OUTPUT_PATH_SMALL = '../data/p_collection_small.tsv'
# Relevance judgements, read as headerless TSV: qid, 0, docid, relevance_score.
QREL_TRAIN_PATH = '../data/QREL/train.qrels'
QREL_DEV_PATH = '../data/QREL/dev.qrels'
# Training passages (one JSON object per line) and their filtered copy that
# only keeps passages whose docid survived collection cleaning.
PASSAGES_PATH='../data/train.jsonl'
PASSAGES_OUTPUT_PATH='../data/p_train.jsonl'
# Output TSV holding the cleaned query/product pairs.
QUERY_PRODUCT_PATH='../data/query_product.tsv'
# Query id -> query text mapping, read as headerless TSV: qid, text.
QUERY_PATH = '../data/qid2query.tsv'

  from .autonotebook import tqdm as notebook_tqdm


/Users/lukasburtscher/Desktop/tuwien/msc/enhanced-product-search-llm/venv/bin/python


In [2]:
# The raw collection file has no header row, so the column names are supplied
# explicitly.
collection_columns = ['id', 'title', 'description']
collection = pd.read_csv(
    COLLECTION_PATH,
    sep='\t',
    header=None,
    names=collection_columns,
)
collection.head()

Unnamed: 0,id,title,description
0,1,FYY Leather Case with Mirror for Samsung Galax...,Product Description Premium PU Leather Top qua...
1,2,"Playtex Women's 18 Hour Easy On, Easy Off Fron...",Product Description Introducing Playtex 18 hou...
2,4,YUEPIN U-Tube Clamp 304 Stainless Steel Hose P...,Product Description Specification: Material: 3...
3,5,Bruce's Big Storm (Mother Bruce Series),
4,6,DJI Shoulder Neck Strap Belt Sling Lanyard Nec...,Product Description Specifications: Item Condi...


In [3]:
# Boilerplate strings that carry no product information. A description or
# title that consists solely of one of these is treated as missing.
# The matching is exact, so trailing whitespace inside the literals matters.
useless_d = ['From the manufacturer Previous page Next page Previous page Next page Previous page Next page  ',
            'Product Description Read more ', 'Product Description ', 'From the manufacturer Read more Read more Read more Read more  ',
             'Product Description 1 Product Description 1', 'Product Description Read more Read more ', ''
            ]

def clean_collection_data(df, d = 'description', t = 'title', dedup_subset = None):
    """Return a cleaned copy of a product frame with an added ``product_text`` column.

    Steps, in order:
      1. print the shape and null counts of the input,
      2. drop duplicate rows (by ``dedup_subset``),
      3. blank out descriptions / titles that exactly match an entry of
         ``useless_d``,
      4. drop rows where both description and title are missing,
      5. remove the literal substrings 'Read more' and 'Product Description'
         from the description,
      6. build ``product_text`` as ``title + ' ' + description`` (missing
         parts become empty strings),
      7. print the new shape and null counts.

    The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns named by ``d`` and ``t``.
    d : str
        Name of the description column.
    t : str
        Name of the title column.
    dedup_subset : list of str, optional
        Columns that identify a duplicate row. Defaults to ``[d, t]``, which
        is the original behaviour. Pass e.g. the query and product id columns
        when the frame holds query/product pairs, otherwise the same product
        matched by different queries collapses into one row.

    Returns
    -------
    pandas.DataFrame
        An independent, cleaned copy of ``df`` including ``product_text``.
    """
    print('Original Collection')
    print(df.shape)
    print(df.isnull().sum(), '\n')

    dedup_keys = [d, t] if dedup_subset is None else list(dedup_subset)
    # Explicit copy: the assignments below must target an independent frame,
    # not a potential view of the caller's data (avoids SettingWithCopyWarning).
    cleaned = df.drop_duplicates(subset=dedup_keys).copy()

    cleaned.loc[cleaned[d].isin(useless_d), d] = None
    cleaned.loc[cleaned[t].isin(useless_d), t] = None
    cleaned = cleaned.dropna(subset=[d, t], how='all').copy()

    # regex=False: these are literal substrings; the default of `regex`
    # differs between pandas versions, so state it explicitly.
    cleaned[d] = cleaned[d].str.replace('Read more', '', regex=False)
    cleaned[d] = cleaned[d].str.replace('Product Description', '', regex=False)
    cleaned['product_text'] = cleaned[t].fillna('') + ' ' + cleaned[d].fillna('')

    print(f"New shape after cleaning product descriptions': {cleaned.shape}")
    print(cleaned.isnull().sum())
    return cleaned
    

### Preprocess Collection frame

In [5]:

# Clean the raw collection with the default column names ('description' and
# 'title'); the result is a new frame that additionally carries 'product_text'.
col_cleaned = clean_collection_data(collection)

Original Collection
(1118640, 3)
id                  0
title           38556
description    222913
dtype: int64 

New shape after cleaning product descriptions': (980974, 4)
id                   0
title               69
description     184780
product_text         0
dtype: int64


### Preprocess train.jsonl
- Generate a query-to-product-description data frame containing the matching (positive) query-product pairs

In [6]:
# Ids of the products that survived collection cleaning; used below to drop
# passages that refer to removed products.
valid_docids = set(col_cleaned['id'])

def filter_passages(passages, valid_docids):
    """Keep only the passages whose 'docid' is contained in ``valid_docids``.

    The relative order of the passages is preserved and a new list is returned.
    """
    kept = []
    for candidate in passages:
        if candidate['docid'] in valid_docids:
            kept.append(candidate)
    return kept


# Rewrite the training file so that it only references products that are
# still present in the cleaned collection.
# The encoding is given explicitly: JSON lines files are UTF-8, and relying on
# the platform default would make the cell behave differently across systems.
with open(PASSAGES_PATH, 'r', encoding='utf-8') as infile, \
        open(PASSAGES_OUTPUT_PATH, 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Tolerate blank lines (e.g. a trailing newline at the end of the
        # file) instead of failing inside json.loads.
        if not line.strip():
            continue
        data = json.loads(line)
        data['positive_passages'] = filter_passages(data.get('positive_passages', []), valid_docids)
        data['negative_passages'] = filter_passages(data.get('negative_passages', []), valid_docids)

        # Only write the line if there are still positive passages left
        if data['positive_passages']:
            outfile.write(json.dumps(data) + '\n')

print("Filtering complete.")


Filtering complete.


In [7]:
# Generate the query / product-description set: one row per (query, positive
# passage) pair taken from the filtered training file.
with jsonlines.open(PASSAGES_OUTPUT_PATH) as reader:
    data = [obj for obj in reader]

train_data = [
    {
        "query_id": entry["query_id"],
        "query": entry["query"],
        "product_description": pos_passage["text"],
        "product_id": pos_passage["docid"],
    }
    for entry in data
    for pos_passage in entry["positive_passages"]
]

# Attach the cleaned title of every product; the join key 'id' duplicates
# 'product_id' and is dropped again afterwards.
product_titles = col_cleaned[['id', 'title']]
df_q_p = (
    pd.DataFrame(train_data)
    .merge(product_titles, left_on='product_id', right_on='id', how='left')
    .drop(columns='id')
)
print(df_q_p.shape)
df_q_p.head()

(342964, 5)


Unnamed: 0,query_id,query,product_description,product_id,title
0,1,!awnmower tires without rims,Product Description Read more Read more All-pu...,1049092,2 Pack 10-Inch Tires and Wheels 4.10/3.50-4 Re...
1,1,!awnmower tires without rims,From the brand Previous page MaxAuto is a thri...,314519,MaxAuto 13x5.00-6 Lawn Mower Tires with Rim 13...
2,1,!awnmower tires without rims,Product Description 20601A Neiko Tire Spoons R...,689593,"NEIKO 20601A 14.5” Steel Tire Spoons Tool Set,..."
3,1,!awnmower tires without rims,,717649,2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...
4,1,!awnmower tires without rims,Product Description 15x6.00-6 Husqvarna Replac...,1397156,(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...


### Preprocess The Query - Product Description Frame

In [8]:
# Apply the same cleaning to the query/product pairs, using the passage text
# as the description column.
# NOTE(review): the duplicate removal inside clean_collection_data keys on
# (product_description, title) only, so a product that is a positive passage
# for several different queries is reduced to a single row here — confirm that
# dropping those extra query/product pairs is intended.
df_q_p_clean = clean_collection_data(df_q_p, d = 'product_description', t = 'title')

Original Collection
(342964, 5)
query_id                0
query                   0
product_description     0
product_id              0
title                  15
dtype: int64 

New shape after cleaning product descriptions': (285646, 6)
query_id                   0
query                      0
product_description    48312
product_id                 0
title                     10
product_text               0
dtype: int64


#### Remove less relevant query-product pairs from the frame

In [9]:
# Attach the training relevance judgements to every query/product pair and
# keep only the pairs judged with a score of at least 2.
qrel_columns = ['qid', '0', 'docid', 'relevance_score']
qrel = pd.read_csv(QREL_TRAIN_PATH, sep='\t', header=None, names=qrel_columns)

judgements = qrel[['qid', 'docid', 'relevance_score']]
new_df = df_q_p_clean.merge(
    judgements,
    left_on=['query_id', 'product_id'],
    right_on=['qid', 'docid'],
    how='left',
)

# Pairs without a judgement carry NaN after the left join and are removed by
# the comparison as well.
is_relevant = new_df['relevance_score'] >= 2
test = new_df[is_relevant]
print(test.shape)

df_q_p_clean = test.copy()
test.head()

(226438, 9)


Unnamed: 0,query_id,query,product_description,product_id,title,product_text,qid,docid,relevance_score
1,1,!awnmower tires without rims,From the brand Previous page MaxAuto is a thri...,314519,MaxAuto 13x5.00-6 Lawn Mower Tires with Rim 13...,MaxAuto 13x5.00-6 Lawn Mower Tires with Rim 13...,1,314519,3
3,1,!awnmower tires without rims,,717649,2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...,2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...,1,717649,2
4,1,!awnmower tires without rims,15x6.00-6 Husqvarna Replacement Assemblies Re...,1397156,(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...,(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...,1,1397156,3
5,1,!awnmower tires without rims,From the brand Previous page MaxAuto is a thri...,1095149,MaxAuto 2 Pcs 16x6.50-8 Lawn Mower Tire for Ga...,MaxAuto 2 Pcs 16x6.50-8 Lawn Mower Tire for Ga...,1,1095149,3
7,1,!awnmower tires without rims,Marastar's 2 Pack 15x6 00-6 air filled (pneum...,911618,"MARASTAR 21446-2PK 15x6.00-6"" Front Tire Assem...","MARASTAR 21446-2PK 15x6.00-6"" Front Tire Assem...",1,911618,3


In [6]:
# Create a validation QREL and query set for hyperparameter tuning.
# NOTE: this rebinds `qrel`, which an earlier cell used for the training
# judgements.
qrel = pd.read_csv(QREL_DEV_PATH, sep='\t', names=['qid', '0', 'docid', 'relevance_score'], header=None)
query = pd.read_csv(QUERY_PATH, sep='\t', names=['qid', 'text'], header=None)
# Filter out queries not present in both dataframes
common_qids = set(qrel['qid']).intersection(set(query['qid']))
qrel = qrel[qrel['qid'].isin(common_qids)]
query = query[query['qid'].isin(common_qids)]
print(qrel.shape)
print(query.shape)

# Create a dictionary of qid to query text
query_dict = pd.Series(query.text.values, index=query.qid).to_dict()

# Prepare the evaluation dataset: (query text, docid, relevance score) tuples.
# The judgements are grouped once instead of re-filtering the whole qrel frame
# for every query id and walking it with iterrows. The resulting order is the
# same as before: query ids in the iteration order of `common_qids`, and the
# judgements of one query in file order. Values are plain Python scalars.
judgements_by_qid = {
    qid: list(zip(group['docid'].tolist(), group['relevance_score'].tolist()))
    for qid, group in qrel.groupby('qid', sort=False)
}
evaluation_data = [
    (query_dict[qid], docid, relevance_score)
    for qid in common_qids
    for docid, relevance_score in judgements_by_qid.get(qid, ())
]


(169952, 4)
(8954, 2)


In [10]:
# Persist the cleaned frames as tab-separated files without the index column:
# the full collection, its first 10000 rows, and the query/product pairs.
tsv_exports = [
    (col_cleaned, COLLECTION_OUTPUT_PATH),
    (col_cleaned.head(10000), COLLECTION_OUTPUT_PATH_SMALL),
    (df_q_p_clean, QUERY_PRODUCT_PATH),
]
for frame, destination in tsv_exports:
    frame.to_csv(destination, sep='\t', index=False)

In [8]:
# Read the exported collection back from disk; this rebinds `col_cleaned` to
# whatever is currently stored at COLLECTION_OUTPUT_PATH.
# NOTE(review): the saved output of this cell has no 'product_text' column and
# still shows 'Product Description' prefixes, which suggests it was produced
# by an earlier version of the cleaning code — re-run the notebook top to
# bottom to confirm.
col_cleaned = pd.read_csv(COLLECTION_OUTPUT_PATH, sep='\t')
col_cleaned.head()

Unnamed: 0,id,title,description
0,1,FYY Leather Case with Mirror for Samsung Galax...,Product Description Premium PU Leather Top qua...
1,2,"Playtex Women's 18 Hour Easy On, Easy Off Fron...",Product Description Introducing Playtex 18 hou...
2,4,YUEPIN U-Tube Clamp 304 Stainless Steel Hose P...,Product Description Specification: Material: 3...
3,6,DJI Shoulder Neck Strap Belt Sling Lanyard Nec...,Product Description Specifications: Item Condi...
4,7,Crocs Jibbitz 5-Pack Alien Shoe Charms | Jibbi...,From the brand Previous page Shop Crocs Collec...
