In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

In [None]:
# Read the two product catalogues from the local ./datasets directory.
daraz_df = pd.read_csv("./datasets/daraz.csv")
startech_df = pd.read_csv("./datasets/startech.csv")

# Quick schema check: column index of the Daraz frame, then of the Startech frame.
print(daraz_df.columns, startech_df.columns)

In [None]:
# Keep only Startech rows that carry a usable price:
#   1. drop rows whose price is missing,
#   2. drop rows whose price is the literal placeholder "To be announce",
#   3. keep the three columns used by the following cells.
# The index is reset once, AFTER all row filtering, so the resulting frame has a
# contiguous 0..n-1 index (resetting before the filter left gaps behind).
priced_startech_df = startech_df.dropna(subset=["price"])
priced_startech_df = priced_startech_df[priced_startech_df["price"] != "To be announce"]
startech_df = priced_startech_df[
    ["product_name", "price", "product_specifications"]
].reset_index(drop=True)

In [None]:
# Remove every comma from string prices, then rename the Startech columns to
# the generic names `title` / `description`.
# Only commas are stripped here; the price column is not cast to a numeric dtype.
comma_free_price = startech_df["price"].replace(",", "", regex=True)
column_renames = {"product_name": "title", "product_specifications": "description"}
startech_df = startech_df.assign(price=comma_free_price).rename(columns=column_renames)

print(startech_df.shape)
startech_df.head(5)

In [None]:
# Draw a seeded 5000-row sample of the cleaned Startech products and save it.
# NOTE(review): a later cell writes `final_product_df` to this same path, so
# the file produced here is overwritten when the notebook is run top to bottom.
startech_sample = startech_df.sample(n=5000, random_state=42)
final_5000_products = startech_sample.reset_index(drop=True)
final_5000_products.to_csv("./final_5000_products.csv", index=False)

In [None]:
# Characters that mark a text as Bengali; the compiled pattern is a character
# class that matches any single one of them.
bengali_alphabets = "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহয়ড়ঢ়ংঃঁ"
bengali_pattern = re.compile(f"[{bengali_alphabets}]")

# Keep Daraz rows whose title AND description both contain at least one of the
# characters above. `str.contains(..., na=False)` counts a missing value as
# "no match"; calling `bengali_pattern.search` directly on a NaN cell raises
# TypeError instead.
title_has_bengali = daraz_df["title"].str.contains(
    bengali_pattern.pattern, regex=True, na=False
)
description_has_bengali = daraz_df["description"].str.contains(
    bengali_pattern.pattern, regex=True, na=False
)
bengali_daraz_df = daraz_df[title_has_bengali & description_has_bengali].reset_index(
    drop=True
)

In [None]:
# How many Startech rows are needed on top of the Bengali Daraz rows to reach
# a combined total of 5000 products.
len_bengali_df = len(bengali_daraz_df)
need = 5000 - len_bengali_df

In [None]:
# Seeded draw of `need` Startech rows without replacement, re-indexed from 0.
sampled_startech_df = startech_df.sample(n=need, replace=False, random_state=42)
sampled_startech_df = sampled_startech_df.reset_index(drop=True)

In [None]:
# Stack the sampled Startech rows on top of the Bengali Daraz rows.
# Both inputs are indexed from 0, so `ignore_index=True` is needed to give the
# combined frame one contiguous index instead of duplicated labels.
final_product_df = pd.concat(
    [sampled_startech_df, bengali_daraz_df], axis=0, ignore_index=True
)
print(final_product_df.shape)
final_product_df.head(5)

In [None]:
# Persist the combined product set without the index column.
# NOTE(review): the product-loading cell further down reads
# "./datasets/final_5000_products.csv" and shows an `id` column that is not
# created here - presumably an intermediate step lives outside this notebook.
final_product_df.to_csv(path_or_buf="./final_5000_products.csv", index=False)

## Join generated queries dataframe with main dataset


In [6]:
def format_product_details(name, price, description):
    """Build the plain-text summary of one product.

    Args:
        name: Product title; interpolated as-is after "Name: ".
        price: Product price; interpolated as-is and followed by " taka".
        description: Optional scalar description text. ``None``, NaN / ``pd.NA``
            (what pandas yields for an empty CSV cell) and blank strings are
            all treated as "no description".

    Returns:
        A string with a "Name: ..." line and a "Price: ... taka" line, followed
        by the description on its own line(s) when one is present.
    """
    product_details = f"Name: {name}\nPrice: {price} taka"

    if isinstance(description, str):
        # A whitespace-only description would only add an empty trailing line.
        has_description = bool(description.strip())
    else:
        # A bare `is not None` check lets float NaN through and renders "nan".
        has_description = not pd.isna(description)

    if has_description:
        product_details += f"\n{description}"

    return product_details

In [7]:
# The generated queries are stored in two CSV parts; load both and stack them
# row-wise into a single frame.
query_part_paths = (
    "./datasets/queries_0_2500.csv",
    "./datasets/queries_2500_5000.csv",
)
query_df_1, query_df_2 = (pd.read_csv(part_path) for part_path in query_part_paths)
query_df = pd.concat([query_df_1, query_df_2], axis=0)

print(query_df.shape)
query_df.head(5)

(24434, 3)


Unnamed: 0,product_id,relevant_query,irrelevant_query
0,009b7e66-ef69-49fc-87c8-9d40d53e0e33,Casio DJ-120D Plus basic calculator,Best gift for boyfriend
1,009b7e66-ef69-49fc-87c8-9d40d53e0e33,Casio DJ-120D Plus check and recheck calculator,Calculator reviews
2,009b7e66-ef69-49fc-87c8-9d40d53e0e33,Basic calculator with check and recheck feature,How to use a Casio calculator
3,009b7e66-ef69-49fc-87c8-9d40d53e0e33,Casio DJ-120D Plus price in Bangladesh,Casio calculator accessories
4,009b7e66-ef69-49fc-87c8-9d40d53e0e33,Casio DJ-120D Plus features,What is the best calculator brand


In [8]:
# Load the product table that the generated queries are joined against and
# preview its first five rows.
product_df = pd.read_csv(filepath_or_buffer="./datasets/final_5000_products.csv")
product_df.head(n=5)

Unnamed: 0,title,price,description,id
0,Casio DJ-120D Plus Check & Recheck Basic Calcu...,1305,,009b7e66-ef69-49fc-87c8-9d40d53e0e33
1,Colorful CN600 PRO 1TB M.2 NVMe SSD,7300,Capacity: 1TB\nFlash Type: 3D NAND\nInterface:...,7bd5da56-89e9-4b68-92e2-cd31f0578bcb
2,Anker Soundcore Space One Foldable Over-Ear Bl...,7990,Frequency Range: 20Hz-20KHz\nInput Jack: AUX C...,3c7d8f65-a7b7-47cd-b808-d6e8c445ca69
3,"Smart SEL-50V24K 50"" 4K Voice Control Android ...",51900,Display Type: LED\nScreen Size: 50 Inch\nResol...,212bc014-cec5-4bc6-ad82-2591098ab808
4,EZVIZ H3c 3MP Wi-Fi Smart Home Outdoor Securit...,4324,Image Sensor: 1/2.7”Progressive Scan CMOS\nEff...,617e0e00-cfd2-4465-b46f-9537476327a4


In [9]:
# Attach every query pair to its product (inner join on the product id), so
# each row of the joined frame is one product with one relevant/irrelevant pair.
joined_df = product_df.merge(query_df, left_on="id", right_on="product_id")

# Render each product row into the text block used as the triplet anchor.
anchor_text = joined_df.apply(
    lambda product: format_product_details(
        name=product["title"],
        price=product["price"],
        description=product["description"],
    ),
    axis=1,
)

# Keep only the triplet columns: anchor text, relevant query (positive) and
# irrelevant query (negative).
merged_df = (
    joined_df.assign(anchor=anchor_text)
    .rename(columns={"relevant_query": "positive", "irrelevant_query": "negative"})
    .loc[:, ["anchor", "positive", "negative"]]
    .reset_index(drop=True)
)
print(merged_df.shape)
merged_df.tail(5)

(24434, 3)


Unnamed: 0,anchor,positive,negative
24429,Name: Redragon Storm Pro M808-KS RGB USB 2.4G ...,Redragon Storm Pro M808-KS RGB gaming mouse,Best laptops under 50000 taka
24430,Name: Redragon Storm Pro M808-KS RGB USB 2.4G ...,7 programmable buttons wireless gaming mouse,Wireless earbuds with long battery life
24431,Name: Redragon Storm Pro M808-KS RGB USB 2.4G ...,5 DPI indicator optical sensor,Lightweight backpacks for school
24432,Name: Redragon Storm Pro M808-KS RGB USB 2.4G ...,40G acceleration gaming mouse,Top-rated smartphones in India
24433,Name: Redragon Storm Pro M808-KS RGB USB 2.4G ...,Black color options,Discounted deals on kitchen appliances


## Dataset split


In [10]:
# Two-stage seeded split: first hold out 10% of all rows for validation, then
# hold out 20% of the remainder for testing; what is left is the training set.
# NOTE(review): the split is row-wise, so rows sharing the same anchor
# (product) can land in different splits - confirm this is intended.
train_test_pool, val_df = train_test_split(
    merged_df, test_size=0.10, random_state=42, shuffle=True
)
train_df, test_df = train_test_split(
    train_test_pool, test_size=0.20, random_state=42, shuffle=True
)

# Give each split its own contiguous 0..n-1 index.
train_df, test_df, val_df = (
    split_frame.reset_index(drop=True) for split_frame in (train_df, test_df, val_df)
)

train_df.shape, val_df.shape, test_df.shape

((17592, 3), (2444, 3), (4398, 3))

In [11]:
# Write each split to its own CSV (index column omitted), in the order
# train, test, val.
split_outputs = {
    "./datasets/train.csv": train_df,
    "./datasets/test.csv": test_df,
    "./datasets/val.csv": val_df,
}
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, index=False)