In [5]:
import numpy as np
import pandas as pd
import sys
import json
# Show which interpreter (and therefore which virtual environment) backs this
# kernel. The original author recorded the Python version as 3.11.9.
print(sys.executable)

# All inputs and outputs of this notebook live below the training-data folder.
TRAIN_DIR = '../data/train'
QREL_DIR = f'{TRAIN_DIR}/QREL'

# Product collection: raw input plus the cleaned full-size and small outputs.
COLLECTION_PATH = f'{TRAIN_DIR}/collection.tsv'
COLLECTION_OUTPUT_PATH = f'{TRAIN_DIR}/p_collection.tsv'
COLLECTION_OUTPUT_PATH_SMALL = f'{TRAIN_DIR}/p_collection_small.tsv'

# Relevance judgements for the train and dev splits.
QREL_TRAIN_PATH = f'{QREL_DIR}/train.qrels'
QREL_DEV_PATH = f'{QREL_DIR}/dev.qrels'

# Training passages (JSON lines): raw input and filtered output.
PASSAGES_PATH = f'{TRAIN_DIR}/train.jsonl'
PASSAGES_OUTPUT_PATH = f'{TRAIN_DIR}/p_train.jsonl'

# Mapping from query id to query text.
QUERY_PATH = f'{TRAIN_DIR}/qid2query.tsv'

/Users/lukasburtscher/Desktop/tuwien/msc/enhanced-product-search-llm/venv/bin/python


In [6]:
# The raw collection is a header-less, tab-separated file, so the three
# column names are supplied explicitly.
collection = pd.read_table(
    COLLECTION_PATH,
    header=None,
    names=['id', 'title', 'description'],
)
collection.head()

Unnamed: 0,id,title,description
0,1,FYY Leather Case with Mirror for Samsung Galax...,Product Description Premium PU Leather Top qua...
1,2,"Playtex Women's 18 Hour Easy On, Easy Off Fron...",Product Description Introducing Playtex 18 hou...
2,4,YUEPIN U-Tube Clamp 304 Stainless Steel Hose P...,Product Description Specification: Material: 3...
3,5,Bruce's Big Storm (Mother Bruce Series),
4,6,DJI Shoulder Neck Strap Belt Sling Lanyard Nec...,Product Description Specifications: Item Condi...


In [4]:
# Rows without a description are dropped; rows that only lack a title are kept.
col_cleaned = collection.dropna(subset=['description'])

# Report the shape and the per-column null counts before and after cleaning.
reports = [('Original Collection', collection), ('Cleaned Collection', col_cleaned)]
for position, (heading, frame) in enumerate(reports):
    if position:
        # Visual separator between the two reports.
        print('\n')
    print(heading)
    print(frame.shape)
    print(frame.isnull().sum())

Original Collection
(1118640, 3)
id                  0
title           38556
description    222913
dtype: int64


Cleaned Collection
(895727, 3)
id              0
title          69
description     0
dtype: int64


In [4]:
# Ids of the documents that survived cleaning, collected in a set so that the
# membership checks during passage filtering are cheap.
# NOTE(review): these ids come straight from the pandas `id` column — confirm
# that the `docid` values in the JSONL file use the same type (int vs. str),
# otherwise no passage will ever match.
valid_docids = {doc_id for doc_id in col_cleaned['id']}

def filter_passages(passages, valid_docids):
    """Return the passages whose ``docid`` is contained in ``valid_docids``.

    Args:
        passages: Iterable of mappings; each one must provide a ``'docid'`` key.
        valid_docids: Container of accepted document ids.

    Returns:
        A new list with the accepted passages in their original order.

    Raises:
        KeyError: If a passage has no ``'docid'`` key.
    """
    kept = []
    for entry in passages:
        if entry['docid'] in valid_docids:
            kept.append(entry)
    return kept


# Rewrite the training JSONL so that every record only references passages
# whose docid is in `valid_docids`.
# The encoding is pinned to UTF-8 so that reading does not depend on the
# platform's default locale encoding.
with open(PASSAGES_PATH, 'r', encoding='utf-8') as infile, \
        open(PASSAGES_OUTPUT_PATH, 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Skip blank lines (e.g. a trailing empty line) instead of letting
        # json.loads fail on them.
        if not line.strip():
            continue

        data = json.loads(line)
        # `or []` also covers records where a passage list is explicitly null.
        data['positive_passages'] = filter_passages(data.get('positive_passages') or [], valid_docids)
        data['negative_passages'] = filter_passages(data.get('negative_passages') or [], valid_docids)

        # Only write the record if there are still positive or negative passages left.
        # NOTE(review): a record that keeps only negatives (or only positives)
        # is still written — confirm that this is what the training code expects.
        if data['positive_passages'] or data['negative_passages']:
            outfile.write(json.dumps(data) + '\n')

print("Filtering complete.")


Filtering complete.


In [11]:
# Create a validation QREL and query set for hyperparameter tuning.
qrel = pd.read_csv(QREL_DEV_PATH, sep='\t', names=['qid', '0', 'docid', 'relevance_score'], header=None)
query = pd.read_csv(QUERY_PATH, sep='\t', names=['qid', 'text'], header=None)

# Keep only the queries that are present in both dataframes.
common_qids = set(qrel['qid']).intersection(set(query['qid']))
qrel = qrel[qrel['qid'].isin(common_qids)]
query = query[query['qid'].isin(common_qids)]
print(qrel.shape)
print(query.shape)

# Map every qid to its query text.
query_dict = pd.Series(query.text.values, index=query.qid).to_dict()

# Prepare the evaluation dataset as (query_text, docid, relevance_score) tuples.
# The text is looked up for all judgements at once instead of re-scanning the
# whole qrel frame for every single query, which was quadratic in practice.
# A stable sort groups the judgements by qid in a deterministic order while
# keeping the file order of the judgements within each query.
qrel_by_query = qrel.sort_values('qid', kind='stable')
evaluation_data = list(zip(
    qrel_by_query['qid'].map(query_dict),
    qrel_by_query['docid'],
    qrel_by_query['relevance_score'],
))


   qid  0    docid  relevance_score
0    2  0   529863                0
1    2  0   686755                0
2    2  0    20963                3
3    2  0  1603474                2
4    2  0  1061163                0
(169952, 4)
(8954, 2)


Unnamed: 0,qid,text
20888,2,!qscreen fence without holes
20889,12,#1 black natural hair dye without ammonia or p...
20890,13,#1 rated resveratrol supplement without tea le...
20891,18,#10 envelopes without security tint
20892,19,#10 standard no tint no window not self seal


In [5]:
# Persist the cleaned collection twice: once in full and once truncated to
# its first 10000 rows. Both files are tab-separated, include a header row
# and omit the index.
export_targets = (
    (COLLECTION_OUTPUT_PATH, col_cleaned),
    (COLLECTION_OUTPUT_PATH_SMALL, col_cleaned.iloc[:10000]),
)
for destination, frame in export_targets:
    frame.to_csv(destination, sep='\t', index=False)

In [6]:
# Reload the cleaned collection from the exported TSV; its first line is the
# header row, so the column names are taken from the file.
col_cleaned = pd.read_table(COLLECTION_OUTPUT_PATH)
col_cleaned.head()

Unnamed: 0,id,title,description
0,1,FYY Leather Case with Mirror for Samsung Galax...,Product Description Premium PU Leather Top qua...
1,2,"Playtex Women's 18 Hour Easy On, Easy Off Fron...",Product Description Introducing Playtex 18 hou...
2,4,YUEPIN U-Tube Clamp 304 Stainless Steel Hose P...,Product Description Specification: Material: 3...
3,6,DJI Shoulder Neck Strap Belt Sling Lanyard Nec...,Product Description Specifications: Item Condi...
4,7,Crocs Jibbitz 5-Pack Alien Shoe Charms | Jibbi...,From the brand Previous page Shop Crocs Collec...
