## Building Indexes

In [95]:
import document_preprocessor
import indexing 
import os
import importlib
import ranker
import json

In [96]:
importlib.reload(indexing) 

<module 'indexing' from '/Users/newuser/Documents/Nandana/UofM/SI650/si650-courseproject/indexing.py'>

In [97]:
DATA_PATH = 'data/' 
CACHE_PATH = '__cache__/' 

STOPWORD_PATH = DATA_PATH + 'stopwords.txt'
PRODUCT_DATASET_PATH = DATA_PATH + 'inci_products_new.jsonl'
FUNCTION_DATASET_PATH = DATA_PATH + 'inci_ingredient_functions.jsonl'

In [98]:
# Read in stopwords
print('Loading stopwords...')
stopwords = set()
with open(STOPWORD_PATH, 'r') as f:
    for line in f:
        stopwords.add(line.strip())
print('Stopwords loaded!')

Loading stopwords...
Stopwords loaded!


In [100]:
print('Loading indexes...')
preprocessor = document_preprocessor.RegexTokenizer('\w+')

# Create cache directory if it doesn't exist
if not os.path.exists(CACHE_PATH):
    os.mkdir(CACHE_PATH)

# Check if the inverted index exists
index_exist = False
for filename in os.listdir(CACHE_PATH):
    if filename.endswith(".json"):
        index_exist = True
        break

if index_exist:
    product_index = indexing.BasicInvertedIndex()
    function_index = indexing.BasicInvertedIndex()
    product_index.load("product", CACHE_PATH)
    function_index.load("function", CACHE_PATH)
else:
    product_index, function_index = indexing.Indexer.create_index(
        indexing.IndexType.BasicInvertedIndex,PRODUCT_DATASET_PATH, FUNCTION_DATASET_PATH, preprocessor,
        stopwords, 0, max_docs=-1
    )
    product_index.save("product", CACHE_PATH)
    function_index.save("function", CACHE_PATH)

print("Statistics of Product Index")
print(product_index.get_statistics())

Loading indexes...
Statistics of Product Index
{'vocab': Counter({'null': 39182, 'skin': 4420, 'hair': 1395, 'cream': 1071, 'oil': 1031, 'serum': 918, 'mask': 795, 's': 610, 'face': 593, 'gel': 580, 'acne': 579, 'dry': 550, 'shampoo': 548, 'natural': 547, 'body': 534, 'cleanser': 532, 'helps': 528, 'acid': 500, 'formula': 492, 'free': 477, 'treatment': 473, 'moisture': 459, 'lip': 404, 'vitamin': 403, 'sensitive': 393, 'smooth': 386, 'water': 374, 'pores': 342, 'conditioner': 333, 'ingredients': 332, 'cleansing': 332, 'gentle': 325, 'extract': 318, 'care': 308, 'formulated': 302, 'lotion': 298, 'sunscreen': 294, 'wash': 291, 'butter': 289, 'appearance': 285, 'anti': 285, 'toner': 284, 'balm': 283, 'soft': 283, 'hydration': 277, 'moisturizing': 277, 'hydrating': 268, 'c': 268, 'eye': 265, 'moisturizer': 257, 'glow': 257, 'texture': 253, 'facial': 248, 'rich': 246, 'aloe': 238, 'barrier': 236, 'color': 233, 'spray': 225, 'tone': 220, 'makeup': 218, 'organic': 217, 'nourishing': 217, 'agi

## Validating Indexes with BM25

In [101]:
raw_text_dict = {}
with open(PRODUCT_DATASET_PATH, "r") as f:
    # go through each json line
    for line in f:
        eachdoc = json.loads(line)
        raw_text_dict[eachdoc['docid']] = eachdoc['product_url']

In [102]:
bm25_scorer = ranker.BM25(product_index)
bm25_ranker = ranker.Ranker(
    product_index, preprocessor, stopwords,
    bm25_scorer, raw_text_dict=raw_text_dict)

In [106]:
query_str = "soothing retinol gel"
results = bm25_ranker.query(query_str)

if len(results) >= 10:
    results = results[:10]

print(f"For Query Term : {query_str}, Results are:")
for docid,score in results:
    print(raw_text_dict[docid])

For Query Term : soothing retinol gel, Results are:
https://incidecoder.com/products/no7-pure-retinol-post-retinol-soother
https://incidecoder.com/products/farmstay-snail-soothing-gel
https://incidecoder.com/products/twelve-beauty-plant-perfection-gel-serum
https://incidecoder.com/products/aloe-pura-organic-aloe-vera-gel
https://incidecoder.com/products/face-republic-cica-soothing-gel-face-body
https://incidecoder.com/products/dr-bio-eco-soothing-cream
https://incidecoder.com/products/ocean-potion-skincare-aloe-gel
https://incidecoder.com/products/neutrogena-adapalene-gel
https://incidecoder.com/products/boots-no7-pure-retinol-post-retinol-soother
https://incidecoder.com/products/babaria-retinol
