In [None]:
# This Jupyter notebook preprocesses the product data and generates vector embeddings for it.
# I ran this notebook on a Google Colab GPU runtime.
# The run can also be done on a CPU,
# but I highly recommend a GPU machine for faster computation.
# The .npy file generated by this notebook was later stored in the current location for use by other files.

In [None]:
# Necessary installations
!pip install qdrant-client
!pip install sentence-transformers

In [None]:
# Necessary download and imports
from sentence_transformers import SentenceTransformer
import numpy as np
import re
import pandas as pd
from tqdm.notebook import tqdm
import nltk
import ssl

# Workaround for certificate errors while fetching the NLTK resources: HTTPS
# certificate verification is switched off, but only for the duration of the
# two downloads. The previous default context factory is restored afterwards
# so that the rest of the notebook does not run with TLS verification disabled.
_default_https_context = ssl._create_default_https_context
try:
    ssl._create_default_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build has no unverified-context factory: keep the default.
    pass

try:
    nltk.download('punkt')
    nltk.download('stopwords')
finally:
    # Always re-enable the original (verifying) HTTPS context.
    ssl._create_default_https_context = _default_https_context

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# sentence-transformer model initialization
# No explicit device is passed: SentenceTransformer then uses a GPU when one is
# available and falls back to the CPU otherwise. Hard-coding device="cuda"
# makes this cell fail on CPU-only machines.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# dataset read
# Loads the product catalogue from a CSV file located in the current working
# directory and previews the first rows of the resulting frame.
df = pd.read_csv('bigBasketProducts.csv')
df.head()

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


In [None]:
# dataset preprocess
# Missing ratings become 0, every other missing value becomes the placeholder
# string "NA", and finally all columns are converted to strings.
df = (
    df
    .fillna({'rating': 0})
    .fillna("NA")
    .astype(str)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         27555 non-null  object
 1   product       27555 non-null  object
 2   category      27555 non-null  object
 3   sub_category  27555 non-null  object
 4   brand         27555 non-null  object
 5   sale_price    27555 non-null  object
 6   market_price  27555 non-null  object
 7   type          27555 non-null  object
 8   rating        27555 non-null  object
 9   description   27555 non-null  object
dtypes: object(10)
memory usage: 2.1+ MB


In [None]:
# function to remove special characters
def clean_text(text):
    """Return ``text`` lowercased and reduced to ASCII letters and whitespace.

    The argument is converted with ``str()`` first. Every character that is
    neither an ASCII letter nor whitespace (digits, punctuation, symbols) is
    deleted rather than replaced, so words separated only by such characters
    end up joined together.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', str(text))
    return letters_and_spaces.lower()

# function to remove stopwords
# Cache for the English stop-word set; filled on first use so the NLTK corpus
# is read once instead of once per processed row.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words dropped and the rest joined by single spaces.

    Args:
        text: String to filter; it is split on whitespace.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, NLTK's English stop-word list is used (loaded once and
            cached for later calls).

    Returns:
        The remaining words, in their original order and casing, joined by a
        single space. An input without any remaining words yields ``""``.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    # Membership is checked case-insensitively; kept words are not modified.
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

# Normalise the text columns with clean_text. The five columns below are
# overwritten in place; the raw 'description' column is kept and its cleaned
# form is stored separately in 'clean_description'.
for column_name in ('product', 'category', 'sub_category', 'brand', 'type'):
    df[column_name] = df[column_name].apply(clean_text)
df['clean_description'] = df['description'].apply(clean_text)
df.head()


Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description,clean_description
0,1,garlic oil vegetarian capsule mg,beauty hygiene,hair care,sri sri ayurveda,220.0,220.0,hair oil serum,4.1,This Product contains Garlic Oil that is known...,this product contains garlic oil that is known...
1,2,water bottle orange,kitchen garden pets,storage accessories,mastercook,180.0,180.0,water fridge bottles,2.3,"Each product is microwave safe (without lid), ...",each product is microwave safe without lid ref...
2,3,brass angle deep plain no,cleaning household,pooja needs,trm,119.0,250.0,lamp lamp oil,3.4,"A perfect gift for all occasions, be it your m...",a perfect gift for all occasions be it your mo...
3,4,cereal flip lid containerstorage jar assorted...,cleaning household,bins bathroom ware,nakoda,149.0,176.0,laundry storage baskets,3.7,Multipurpose container with an attractive desi...,multipurpose container with an attractive desi...
4,5,creme soft soap for hands body,beauty hygiene,bath hand wash,nivea,162.0,162.0,bathing bars soaps,4.4,Nivea Creme Soft Soap gives your skin the best...,nivea creme soft soap gives your skin the best...


In [None]:
# Column / dtype summary of the frame.
# NOTE(review): the output recorded for this cell already lists
# 'my_description' and 'description_sentiment', which are only created by the
# cell that follows, so this cell was last executed out of order. On a fresh
# top-to-bottom run it reports 11 columns -- re-run the notebook, or move this
# cell after the next one.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  27555 non-null  object 
 1   product                27555 non-null  object 
 2   category               27555 non-null  object 
 3   sub_category           27555 non-null  object 
 4   brand                  27555 non-null  object 
 5   sale_price             27555 non-null  object 
 6   market_price           27555 non-null  object 
 7   type                   27555 non-null  object 
 8   rating                 27555 non-null  object 
 9   description            27555 non-null  object 
 10  clean_description      27555 non-null  object 
 11  my_description         27555 non-null  object 
 12  description_sentiment  27555 non-null  float64
dtypes: float64(1), object(12)
memory usage: 2.7+ MB


In [None]:
# Derive two columns from the cleaned description: a stop-word-free variant
# and the polarity score reported by TextBlob's sentiment analysis.
def _polarity(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    return TextBlob(text).sentiment.polarity


cleaned_descriptions = df['clean_description']
df['my_description'] = cleaned_descriptions.map(remove_stopwords)
df['description_sentiment'] = cleaned_descriptions.map(_polarity)
df.head()

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description,clean_description,my_description,description_sentiment
0,1,garlic oil vegetarian capsule mg,beauty hygiene,hair care,sri sri ayurveda,220.0,220.0,hair oil serum,4.1,This Product contains Garlic Oil that is known...,this product contains garlic oil that is known...,product contains garlic oil known help proper ...,0.166667
1,2,water bottle orange,kitchen garden pets,storage accessories,mastercook,180.0,180.0,water fridge bottles,2.3,"Each product is microwave safe (without lid), ...",each product is microwave safe without lid ref...,product microwave safe without lid refrigerato...,0.45
2,3,brass angle deep plain no,cleaning household,pooja needs,trm,119.0,250.0,lamp lamp oil,3.4,"A perfect gift for all occasions, be it your m...",a perfect gift for all occasions be it your mo...,perfect gift occasions mother sister inlaws bo...,0.522619
3,4,cereal flip lid containerstorage jar assorted...,cleaning household,bins bathroom ware,nakoda,149.0,176.0,laundry storage baskets,3.7,Multipurpose container with an attractive desi...,multipurpose container with an attractive desi...,multipurpose container attractive design made ...,0.285776
4,5,creme soft soap for hands body,beauty hygiene,bath hand wash,nivea,162.0,162.0,bathing bars soaps,4.4,Nivea Creme Soft Soap gives your skin the best...,nivea creme soft soap gives your skin the best...,nivea creme soft soap gives skin best care mus...,0.383333


In [None]:
# preprocessed data file
# Writes the processed frame, including the derived columns, to CSV without
# the row index.
# NOTE(review): missing values were replaced by the literal string "NA"
# earlier in this notebook, and cleaned text fields can be empty strings.
# pandas.read_csv treats both "NA" and empty fields as missing by default, so
# a consumer reloading this file may get NaN back -- confirm, or read it with
# keep_default_na=False.
df.to_csv("preprocessed_bb_products.csv",index = False)

In [None]:
# Build one sentence per product by joining its text fields with single
# spaces, then embed all sentences with the sentence-transformer model.
embedding_fields = ['product', 'category', 'sub_category', 'type', 'brand', 'my_description']
sentences = [
    " ".join(str(value) for value in field_values)
    for field_values in zip(*(df[field] for field in embedding_fields))
]
vectors = model.encode(sentences, show_progress_bar=True)

In [None]:
# Inspect the embedding matrix; expected shape is
# (number of encoded sentences, embedding size).
vectors.shape

(27555, 384)

In [None]:
# saving our generated vector file
# The embeddings are written in NumPy's .npy format. With allow_pickle=False,
# np.save raises instead of pickling if `vectors` were an object array.
np.save('bb_chaabi_vectors.npy', vectors, allow_pickle=False)