In [1]:
import pandas as pd
from urllib.parse import unquote_plus
from sentence_transformers import SentenceTransformer, util
from collections import defaultdict
import torch, string, spacy, pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
def process_text (s):
    s = s.translate(str.maketrans('', '', string.punctuation))
    doc = nlp(s)
    lm_txt = ' '.join([token.lemma_ for token in doc]).lower()
    return lm_txt

In [4]:
atome_merchants = pd.read_json('../data/ecommerce/atome/merchants_info.json', lines=True)
atome_merchants['merchant_name'] = atome_merchants['atome_page'].apply(lambda x: unquote_plus(x.split('/')[-1])).str.lower()
atome_merchants = atome_merchants.drop_duplicates(subset='merchant_name').reset_index(drop=True)
atome_merchants['description'] = atome_merchants.description.apply(lambda s: process_text(s))

atome_merchants.head(3)

Unnamed: 0,atome_page,merchant_site,description,merchant_name
0,https://www.atome.sg/paylater-merchants/ZARA,,fashion retail chain zara have come a long way...,zara
1,https://www.atome.sg/paylater-merchants/1gravity,,1gravity offer bespoke hair care for modern ma...,1gravity
2,https://www.atome.sg/paylater-merchants/3arts+...,,3art pottery be a pottery studio base in singa...,3arts pottery


In [5]:
atome_merchants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 823 entries, 0 to 822
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   atome_page     823 non-null    object
 1   merchant_site  477 non-null    object
 2   description    823 non-null    object
 3   merchant_name  823 non-null    object
dtypes: object(4)
memory usage: 25.8+ KB


In [6]:
def ngrams(tokens, MIN_N, MAX_N):
    n_tokens = len(tokens)
    for i in range(n_tokens):
        for j in range(i+MIN_N, min(n_tokens, i+MAX_N)+1):
            yield tokens[i:j]

In [11]:
corpus = atome_merchants.description.tolist()
inverted_index = defaultdict(set)

for docid, c in enumerate(corpus):
    for word in c.split():
        inverted_index[word].add(docid)

In [12]:
# with open('../data/atome_merchants_inverted_index.pkl', 'wb') as handle:
#     pickle.dump(inverted_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('../data/atome_merchants_inverted_index.pkl', 'rb') as handle:
    inverted_index = pickle.load(handle)

In [14]:
def process_and_search(query):
    matched_documents = set()
    query = process_text(query)
    for word in query.split():
        matches = inverted_index.get(word)
        if matches:
            matched_documents |= matches
    return matched_documents

In [15]:
descrips = atome_merchants.description.tolist()
embedder = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
descrip_embeddings = embedder.encode(descrips, convert_to_tensor=True)
docs, dim = descrip_embeddings.shape
print("Number of Document vectors: {l}, Dimension: {d}".format(l=docs, d=dim))

Number of Document vectors: 823, Dimension: 768


In [18]:
# torch.save(descrip_embeddings, '../data/atome_descrip_embeddings.pt')

# descrip_embeddings = torch.load('../data/atome_descrip_embeddings.pt')

In [22]:
query="travel"

In [23]:
merchant_lookup = {i:merchant for i, merchant in enumerate(atome_merchants.merchant_name.tolist())}

In [25]:
# with open('../data/atome_merchant_lookup.pkl', 'wb') as handle:
#     pickle.dump(merchant_lookup, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('../data/atome_merchant_lookup.pkl', 'rb') as handle:
#     merchant_lookup = pickle.load(handle)

In [26]:
match_indx = list(process_and_search(query))
query_embedding = embedder.encode(query, convert_to_tensor=True)
search_embeddings = descrip_embeddings[match_indx]
if len(match_indx) > 0:
    cos_scores = util.pytorch_cos_sim(search_embeddings, descrip_embeddings)[0]
    idx_scores = [(i,s+1) if i in match_indx else (i,s) for i,s in enumerate(cos_scores)]
else: 
    cos_scores = util.pytorch_cos_sim(query_embedding, search_embeddings)[0]
    idx_scores = [(i,s) for i,s in zip(match_indx, cos_scores)]
idx_scores = sorted(idx_scores, key=lambda tup: tup[1], reverse=True)

print("\n\n======================================================================\n\n")
print("Query:", query)
print("\nMerchants:\n")

for idx, score in idx_scores:
    if score > idx_scores[0][1]/3:
        print('\t', merchant_lookup[idx], "(Score: {:.4f})".format(score))
print("======================================================================")





Query: travel

Merchants:

	 furama riverfront (Score: 2.0000)
	 agoda (Score: 1.4845)
	 getit by changi recommends (Score: 1.4533)
	 klook travel (Score: 1.3896)
	 easybook (Score: 1.3658)
	 samsonite (Score: 1.3350)
	 american tourister (Score: 1.3005)
	 the novus lab (Score: 1.2950)
	 coldwear (Score: 1.2871)
	 houze (Score: 1.2734)
	 crocodile (Score: 1.2622)
	 lipault (Score: 1.2410)
	 boheme (Score: 1.2003)


In [49]:
atome_merchants[atome_merchants.merchant_name == 'agoda'].iloc[18][]

Unnamed: 0,atome_page,merchant_site,description,merchant_name
18,https://www.atome.sg/paylater-merchants/agoda,https://www.agoda.com,headquarterd in singapore agoda be one of the ...,agoda
