In [1]:
import pandas as pd
import json
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [9]:
# Load the JSON data
with open('collected_data.json', 'r') as file:
    data = json.load(file)

In [3]:
# Download NLTK resources (if not already downloaded)
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alokkumar.das\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alokkumar.das\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alokkumar.das\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# data

In [10]:
# Extract the relevant data part
data_part = data

# Flatten the nested JSON structure using json_normalize
data = pd.json_normalize(data_part, record_path=['searchInfo'], meta=['customerEmailId'])

# Rename columns as needed
data.rename(columns={'customerEmailId': 'customerEmailId'}, inplace=True)

In [11]:
data.head()

Unnamed: 0,product_name,sku,product_id,description,searchDate,customerEmailId
0,Jupiter All-Weather Trainer,MJ06,398,<p>Inclement climate be damned. With your brea...,"Thursday, September 14, 2023 at 7:20:03 AM",rytest@gmail.com
1,Troy Yoga Short,MSH09,989,"<p>The versatile, all-purpose Troy Yoga Short ...","Thursday, September 14, 2023 at 7:20:40 AM",rytest@gmail.com
2,Aether Gym Pant,MP11,867,<p>The Aether Gym Pant is built for the studio...,"Thursday, September 14, 2023 at 7:48:10 PM",jh@gmail.con
3,Clamber Watch,24-WG03,43,<p>Keep track of time on the treadmill or trai...,"Thursday, September 14, 2023 at 7:48:40 PM",jh@gmail.con
4,Orion Two-Tone Fitted Jacket,MJ07,318,"<p>While you're getting fit, you need a fitted...","Thursday, September 14, 2023 at 7:49:32 PM",jh@gmail.con


In [12]:
data['logs']=data['product_name']+data['sku']+data['description']

In [13]:
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)
    # Remove punctuation and convert to lowercase
    text = re.sub(r'[^\w\s]', ' ', text).lower()
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [14]:
# Apply preprocessing to the 'overview' column
data['logs'] = data['logs'].apply(preprocess_text)

In [15]:
data

Unnamed: 0,product_name,sku,product_id,description,searchDate,customerEmailId,logs
0,Jupiter All-Weather Trainer,MJ06,398,<p>Inclement climate be damned. With your brea...,"Thursday, September 14, 2023 at 7:20:03 AM",rytest@gmail.com,jupiter weather trainer mj06 inclement climate...
1,Troy Yoga Short,MSH09,989,"<p>The versatile, all-purpose Troy Yoga Short ...","Thursday, September 14, 2023 at 7:20:40 AM",rytest@gmail.com,troy yoga shortmsh09 versatile purpose troy yo...
2,Aether Gym Pant,MP11,867,<p>The Aether Gym Pant is built for the studio...,"Thursday, September 14, 2023 at 7:48:10 PM",jh@gmail.con,aether gym pant mp11 aether gym pant built stu...
3,Clamber Watch,24-WG03,43,<p>Keep track of time on the treadmill or trai...,"Thursday, September 14, 2023 at 7:48:40 PM",jh@gmail.con,clamber watch24 wg03 keep track time treadmill...
4,Orion Two-Tone Fitted Jacket,MJ07,318,"<p>While you're getting fit, you need a fitted...","Thursday, September 14, 2023 at 7:49:32 PM",jh@gmail.con,orion two tone fitted jacketmj07 getting fit n...
...,...,...,...,...,...,...,...
143,Proteus Fitness Jackshirt,MJ12,430,"<p>Part jacket, part shirt, the Proteus Fitnes...","Tuesday, September 19, 2023 at 10:40:33 PM",alokkumar123@gmail.com,proteus fitness jackshirtmj12 part jacket part...
144,Montana Wind Jacket,MJ03,414,<p>Light-as-a-feather wind protection for runn...,"Tuesday, September 19, 2023 at 10:53:56 PM",bakir@gmail.com,montana wind jacketmj03 light feather wind pro...
145,Mithra Warmup Pant,MP06,802,<p>When you're not sure you're up to the weath...,"Tuesday, September 19, 2023 at 10:55:43 PM",bakir@gmail.com,mithra warmup pantmp06 sure weather mithra war...
146,Sprite Yoga Companion Kit,24-WG080,45,<p>A well-rounded yoga workout takes more than...,"Wednesday, September 20, 2023 at 4:23:38 AM",armaan123@gmail.com,sprite yoga companion kit24 wg080 well rounded...


In [16]:
# Create a purchase history dataframe
purchase_history = data.groupby(['customerEmailId', 'product_name']).size().reset_index(name='purchase_count')

In [17]:
purchase_history

Unnamed: 0,customerEmailId,product_name,purchase_count
0,adeshtest@rysun.com,Geo Insulated Jogging Pant,1
1,adeshtest@rysun.com,Lando Gym Jacket,1
2,adeshtest@rysun.com,Pierce Gym Short,1
3,adeshtest@rysun.com,Strive Shoulder Pack,1
4,adeshtest@rysun.com,Tristan Endurance Tank,1
...,...,...,...
142,vinay@gmail.com,Sinbad Fitness Tank,1
143,vishnutest1@rysun.com,Hero Hoodie,1
144,vishrutitest27@rysun.com,Affirm Water Bottle,1
145,vishrutitest27@rysun.com,Circe Hooded Ice Fleece,1


In [18]:
# Define a TF-IDF vectorizer for product descriptions (assuming you have product descriptions in your data)
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the product descriptions
product_descriptions = data['logs'].fillna('')
tfidf_vectorizer.fit(product_descriptions)

In [23]:
# Save tfidf_matrix to a pickle file
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

# Save purchase_history to a pickle file
with open('purchase_history.pkl', 'wb') as file:
    pickle.dump(purchase_history, file)