In [8]:
cd "H:\My Drive\projects\table-qa-in-ecommerce"

H:\My Drive\projects\table-qa-in-ecommerce


In [3]:
!pip install rank_bm25 -q

In [6]:
!pip install nltk -q

In [12]:
# Download NLTK's 'punkt' package (tokenizer data) into the local nltk_data
# directory. Presumably this is the resource word_tokenize needs at runtime —
# confirm against the installed NLTK version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NamFam\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [19]:
import json
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

# Function to load data from a JSON Lines file
def load_data(file_path):
    """Load records from a JSON Lines file (one JSON document per line).

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of being handed to the JSON parser.

    Args:
        file_path: Path of the UTF-8 encoded JSON Lines file.

    Returns:
        list: The decoded objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # `line.strip()` is empty for blank lines, which json.loads would reject.
        return [json.loads(line) for line in file if line.strip()]

# Function to preprocess and tokenize text
def tokenize(text):
    """Lower-case ``text`` and split it into tokens with ``word_tokenize``."""
    lowered = text.lower()
    return word_tokenize(lowered)

# Create the BM25 index from the product data
def create_index(data):
    """Build a BM25 index over the searchable text of each product.

    The indexed text of a product is the space-joined concatenation of its
    'product_id', its 'tên sản phẩm', the 'Chất liệu' entry of
    'thông tin chung', and ``str()`` of the 'Màn hình' entry of
    'thông số kỹ thuật' (``'{}'`` when that entry is absent).

    Null or non-string values in the first three fields, and sections that are
    null or not a dict, no longer raise: null becomes an empty string, other
    values are converted with ``str()``, and a bad section is treated as empty.
    The text produced for well-formed records is unchanged.

    Args:
        data: Iterable of product dicts; each must contain 'product_id'.

    Returns:
        tuple: ``(bm25, product_ids)`` where ``bm25`` is a ``BM25Okapi`` built
        from the tokenized texts and ``product_ids[i]`` is the id of the
        i-th indexed document.

    Raises:
        KeyError: If a product has no 'product_id' key.
    """
    def _section(product, key):
        # A section that is missing, null, or not a mapping is treated as empty.
        value = product.get(key)
        return value if isinstance(value, dict) else {}

    def _text(value):
        # str.join only accepts strings: map null to '' and coerce the rest.
        return '' if value is None else str(value)

    corpus = []
    product_ids = []
    for product in data:
        general_info = _section(product, 'thông tin chung')
        tech_specs = _section(product, 'thông số kỹ thuật')
        # Concatenate all textual content that should be searchable
        content = ' '.join([_text(product.get('product_id', '')),
                            _text(product.get('tên sản phẩm', '')),
                            _text(general_info.get('Chất liệu', '')),
                            str(tech_specs.get('Màn hình', {}))])
        corpus.append(tokenize(content))
        product_ids.append(product['product_id'])
    bm25 = BM25Okapi(corpus)
    return bm25, product_ids

# Query the index and retrieve products
def query_index(bm25, product_ids, query, top_n=10):
    """Return the ids of the ``top_n`` best-scoring products for ``query``.

    The query is tokenized with ``tokenize`` and scored against every indexed
    document via ``bm25.get_scores``; ids are returned from highest to lowest
    score.
    """
    query_tokens = tokenize(query)
    scores = bm25.get_scores(query_tokens)
    # Pair every document position with its score and order by score, descending.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    return [product_ids[position] for position, _ in ranked[:top_n]]

# Load the product records. The path is relative, so it is resolved against
# the current working directory.
# NOTE(review): the file has a .json extension but load_data parses it line by
# line as JSON Lines — confirm the file really holds one JSON object per line.
data = load_data('data/tgdd_data/products/phone_products_20240406_203410_cleaned.json')

# Build the BM25 index and the parallel list of product ids
bm25, product_ids = create_index(data)




In [31]:
# Example query
# NOTE(review): create_index only indexes the product id, name, material and
# screen text, so this query is ranked purely by shared words. The result is
# presumably not a real filter on rating values — confirm before relying on it.
query = "Điện thoại nào có điểm đánh giá trung bình lớn hơn 3.6"
results = query_index(bm25, product_ids, query)
print(len(results))
print("Top matching products for your query:", results)

10
Top matching products for your query: ['phone-nokia-8210-4g', 'phone-itel-it9210', 'phone-mobell-m331', 'phone-itel-it9010', 'phone-mobell-m239', 'phone-nokia-110-4g-pro', 'phone-nokia-105-4g-pro', 'phone-nokia-105-4g', 'phone-mobell-m539', 'phone-mobell-f209']


In [3]:
import math

class BM25:
    """Minimal Okapi BM25 scorer over a pre-tokenized corpus.

    Args:
        corpus: Sized collection of documents, each a list of tokens.
        k1: Multiplier on the length-normalization term in the denominator of
            the score (default 1.5).
        b: Weight of the document-length ratio ``doc_len / avgdl`` inside that
            term (default 0.75).

    Attributes are assigned in ``__init__``: ``corpus``, ``k1``, ``b``,
    ``avgdl`` (average document length, 0.0 for an empty corpus) and ``idf``
    (token -> inverse document frequency).
    """

    def __init__(self, corpus, k1=1.5, b=0.75):
        self.corpus = corpus
        self.k1 = k1
        self.b = b
        doc_count = len(corpus)
        # An empty corpus has no average length; use 0.0 rather than dividing by zero.
        self.avgdl = sum(len(doc) for doc in corpus) / doc_count if doc_count else 0.0
        self.idf = {}
        self.initialize()

    def initialize(self):
        """Recompute ``self.idf`` for every distinct token in the corpus.

        Document frequencies are counted in one pass over the corpus, then
        turned into ``log((N - df + 0.5) / (df + 0.5) + 1)``, where ``N`` is
        the number of documents and ``df`` is the number of documents that
        contain the token.
        """
        doc_count = len(self.corpus)
        doc_freq = {}
        for doc in self.corpus:
            # set() so that a token counts once per document.
            for word in set(doc):
                doc_freq[word] = doc_freq.get(word, 0) + 1
        self.idf = {
            word: math.log((doc_count - freq + 0.5) / (freq + 0.5) + 1)
            for word, freq in doc_freq.items()
        }

    def score(self, query, document):
        """Return the BM25 score of ``document`` for ``query``.

        Args:
            query: Iterable of query tokens; tokens absent from the corpus
                vocabulary are ignored.
            document: List of tokens to score.

        Returns:
            The summed per-token score (0 when no query token is known).
        """
        score = 0
        doc_len = len(document)
        for word in query:
            if word not in self.idf:
                continue
            # The division by avgdl is only reached for known tokens, and idf
            # is empty whenever avgdl is 0, so it cannot divide by zero here.
            tf = document.count(word)
            score += (self.idf[word] * tf * (self.k1 + 1)) / (tf + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl))
        return score

# Example usage:
# NOTE(review): this cell rebinds the notebook-level names `corpus`, `bm25`
# and `query`. After it runs, `bm25` is an instance of the hand-written BM25
# class, which has no `get_scores` method, so the rank_bm25 index must be
# rebuilt before calling query_index again.
corpus = [
    ["hello", "world", "hello", "bm25"],
    ["bm25", "algorithm", "example"],
    ["implementation", "of", "bm25"],
]

bm25 = BM25(corpus)

# Score every toy document against the same two-token query.
query = ["bm25", "algorithm"]
for i, doc in enumerate(corpus):
    print(f"Score for document {i+1}: {bm25.score(query, doc)}")


Score for document 1: 0.12250586479313998
Score for document 2: 1.1668697860065433
Score for document 3: 0.13982344777436917
