In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
df = pd.read_csv("Data/SkinCare-product.csv")

In [3]:
df = df[~df['title'].isnull()]

In [4]:
df_drugs = pd.read_csv("Data/drugs.csv")

In [5]:
def tokenize_text(text):
    return word_tokenize(text)

def lowercase_text(text):
    return text.lower()

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

def handle_special_characters(text):
    text = re.sub(r'http\S+|www\S+', 'URL', text)
    text = re.sub(r'#\w+', 'HASHTAG', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

def lemmatize_text(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

def preprocess_text(column):
    column = column.apply(lowercase_text)
    column = column.apply(handle_special_characters)
    column = column.apply(remove_punctuation)
    column = column.apply(tokenize_text)
    column = column.apply(remove_stopwords)
    column = column.apply(lemmatize_text)
    return column


In [6]:
df_processed = pd.DataFrame(columns=df.columns)

In [7]:
df_processed.drop(columns=df_processed.columns[0], inplace=True)

In [8]:
df.columns

Index(['_id', 'category', 'link', 'image_url', 'title', 'number_ratings',
       'rating_element', 'description', 'price', 'qty', 'highlights'],
      dtype='object')

In [9]:
df["highlights"]=df["highlights"].fillna("")

In [10]:
df["description"]=df["description"].fillna("")

In [11]:
df_processed = df.copy()

In [12]:
df_processed["title"] = preprocess_text(df["title"])

In [13]:
df_processed['title']

1                                       [acnemoist, cream]
4        [foxtale, combo, pack, hydrating, cleanser, 10...
8                                          [acnestar, gel]
9                                          [acnestar, gel]
10                                         [ga, 12, cream]
                               ...                        
11348                          [balu, herbal, rose, water]
11349    [earthy, essential, combo, pack, neem, tea, tr...
11350                     [aromamusk, witch, hazel, toner]
11351    [parampara, ayurved, rose, water, premium, 290ml]
11352      [bio, beauty, astringent, aloe, vera, cucumber]
Name: title, Length: 11061, dtype: object

In [14]:
df_processed["highlights"] = preprocess_text(df["highlights"])

In [15]:
df_processed['highlights']

1                                                       []
4        [combo, pack, useful, hydrating, moisturising,...
8        [help, clear, extra, sebum, impuritiesassists,...
9        [help, clear, extra, sebum, impuritiesassists,...
10       [gentle, formula, clean, skin, remove, excess,...
                               ...                        
11348    [maintain, ph, balance, nourishes, skin, deepl...
11349    [sulphate, freecertified, ifra, perfumecruelty...
11350    [beneficial, skinchemical, paraben, alcoholfre...
11351    [refreshes, skinmakes, skin, softfree, preserv...
11352    [promotes, naturally, fresh, skinsuitable, ski...
Name: highlights, Length: 11061, dtype: object

In [16]:
df_processed["description"] = preprocess_text(df["description"])

In [17]:
df_processed["description"]

1        [acnemoist, cream, specially, formulated, oil,...
4        [foxtale, combo, pack, hydrating, cleanser, 10...
8        [acnestar, gel, useful, various, reason, serum...
9        [acnestar, gel, useful, various, reason, serum...
10       [ga, 12, cream, specialized, oil, free, formul...
                               ...                        
11348    [balu, herbal, rose, water, excellent, skin, c...
11349    [earthy, essential, neem, tea, tree, face, was...
11350    [aromamusk, witch, hazel, tonerthis, face, ton...
11351    [parampara, ayurved, rose, water, premium, 290...
11352    [bio, beauty, astringent, designed, promote, c...
Name: description, Length: 11061, dtype: object

In [18]:
df_processed = df_processed[~df_processed['title'].isnull()]

In [19]:
df_processed.head()

Unnamed: 0,_id,category,link,image_url,title,number_ratings,rating_element,description,price,qty,highlights
1,64ca8452a945cf043782f942,acne-pimples,https://www.1mg.com/otc/acnemoist-cream-otc340541,"https://onemg.gumlet.io/l_watermark_346,w_690,...","[acnemoist, cream]",484 Ratings & 110 Reviews,4.4,"[acnemoist, cream, specially, formulated, oil,...",₹319,60 gm Cream,[]
4,64ca8452a945cf043782f945,acne-pimples,https://www.1mg.com/otc/foxtale-combo-pack-of-...,"https://onemg.gumlet.io/l_watermark_346,w_690,...","[foxtale, combo, pack, hydrating, cleanser, 10...",,,"[foxtale, combo, pack, hydrating, cleanser, 10...",₹994,3 bottles,"[combo, pack, useful, hydrating, moisturising,..."
8,64ca8452a945cf043782f949,acne-pimples,https://www.1mg.com/otc/acnestar-gel-otc358988,"https://onemg.gumlet.io/l_watermark_346,w_690,...","[acnestar, gel]",270 Ratings & 25 Reviews,4.3,"[acnestar, gel, useful, various, reason, serum...",₹98,22 gm Gel,"[help, clear, extra, sebum, impuritiesassists,..."
9,64ca8452a945cf043782f94a,acne-pimples,https://www.1mg.com/otc/acnestar-gel-otc358988,"https://onemg.gumlet.io/l_watermark_346,w_690,...","[acnestar, gel]",270 Ratings & 25 Reviews,4.3,"[acnestar, gel, useful, various, reason, serum...",₹98,22 gm Gel,"[help, clear, extra, sebum, impuritiesassists,..."
10,64ca8452a945cf043782f94b,acne-pimples,https://www.1mg.com/otc/ga-12-cream-otc415193,"https://onemg.gumlet.io/l_watermark_346,w_690,...","[ga, 12, cream]",141 Ratings & 19 Reviews,4.3,"[ga, 12, cream, specialized, oil, free, formul...",₹164,30 gm Cream,"[gentle, formula, clean, skin, remove, excess,..."


In [20]:
df_processed["category"] = df.category

In [21]:
df_processed['highlights'] = df_processed['highlights'].apply(lambda x: ' '.join(x))
df_processed['description'] = df_processed['description'].apply(lambda x: ' '.join(x))
df_processed['title'] = df_processed['title'].apply(lambda x: ' '.join(x))

In [22]:
df_processed.to_csv("pre-processed-data-mongodb.csv")

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df_processed['text'] = df_processed['description'] + df_processed['highlights'] + df_processed['title']

vectorizer = TfidfVectorizer()
description_vectors = vectorizer.fit_transform(df_processed['text'])

user_description = "What products offer both sun protection and anti-aging benefits for maintaining youthful skin?"

user_description_vector = vectorizer.transform([user_description])

similarity_scores = cosine_similarity(user_description_vector, description_vectors).flatten()

df_processed['similarity_score'] = similarity_scores
df_sorted = df_processed.sort_values(by='similarity_score', ascending=False)

recommendations = df_sorted[['_id','description']].head(5)

print(recommendations._id)

3638     64ca8454a945cf0437830777
3776     64ca8454a945cf0437830801
3145     64ca8454a945cf043783058a
11028    64ca8459a945cf0437832455
6888     64ca8456a945cf0437831429
Name: _id, dtype: object


In [27]:
for i in df.loc[[5464, 5590, 4406, 4249, 3537]]['link']:
    print(i)

https://www.1mg.com/otc/acwis-gs-face-wash-otc517232
https://www.1mg.com/otc/beardo-activated-charcoal-face-wash-otc689062
https://www.1mg.com/otc/jovees-papaya-face-wash-otc732469
https://www.1mg.com/otc/nivea-men-dark-spot-reduction-face-wash-otc323138
https://www.1mg.com/otc/kazima-anti-aging-facial-serum-otc567162


In [32]:
ground_truth = {
    "Which skincare products are effective in treating and preventing acne breakouts?": [
        "64ca8459a945cf04378320a2",
        "64ca8457a945cf0437831566",
        "64ca8457a945cf043783154c",
        "64ca8458a945cf0437831f79",
        "64ca8455a945cf0437830af2"
    ],
    "What are the best moisturizing products to hydrate and nourish dry skin?": [
        "64ca8455a945cf0437830ff0",
        "64ca8455a945cf04378309d8",
        "64ca8455a945cf04378308fc",
        "64ca8455a945cf0437830af2",
        "64ca8455a945cf0437830b65"
    ],
    "Which skincare products are gentle and suitable for sensitive skin types?": [
        "64ca8453a945cf043782fdc8",
        "64ca8455a945cf0437830fe8",
        "64ca8453a945cf043782fde3",
        "64ca8454a945cf04378307bb",
        "64ca8459a945cf04378320a2"
    ],
    "What are the recommended products to address hyperpigmentation and achieve a more even complexion?": [
        "64ca8454a945cf0437830779",
        "64ca8455a945cf04378308fc",
        "64ca8456a945cf04378313fc",
        "64ca8455a945cf0437830a00",
        "64ca8455a945cf0437830af2"
    ],
    "Which skincare products are ideal for balancing combination skin and addressing both oily and dry areas?": [
        "64ca8459a945cf0437832519",
        "64ca8455a945cf0437830fa1",
        "64ca8455a945cf0437830990",
        "64ca8452a945cf043782fcac",
        "64ca8459a945cf0437832583"
    ]
}

In [37]:
predicted_values = {
    "Which skincare products are effective in treating and preventing acne breakouts?": [
        "64ca8459a945cf04378320a2",
        "64ca8457a945cf0437831566",
        "64ca8457a945cf043783154c",
        "64ca8458a945cf0437831f79",
        "64ca8458a945cf0437831fc6"
    ],
    "What are the best moisturizing products to hydrate and nourish dry skin?": [
        "64ca8455a945cf0437830ff0",
        "64ca8455a945cf04378309d8",
        "64ca8455a945cf0437830a00",
        "64ca8455a945cf0437830af2",
        "64ca8455a945cf0437830b65"
    ],
    "Which skincare products are gentle and suitable for sensitive skin types?": [
        "64ca8452a945cf043782fce7",
        "64ca8453a945cf043782fdc8",
        "64ca8455a945cf0437830fe8",
        "64ca8453a945cf043782fde3",
        "64ca8454a945cf04378307bb"
    ],
    "What are the recommended products to address hyperpigmentation and achieve a more even complexion?": [
        "64ca8455a945cf0437830fdb",
        "64ca8455a945cf0437830a60",
        "64ca8454a945cf0437830779",
        "64ca8455a945cf04378308fc",
        "64ca8456a945cf04378313fc"
    ],
    "Which skincare products are ideal for balancing combination skin and addressing both oily and dry areas?": [
        "64ca8459a945cf0437832519",
        "64ca8455a945cf0437830fa1",
        "64ca8455a945cf0437830990",
        "64ca8452a945cf043782fcac",
        "64ca8459a945cf0437832583"
    ]
}


In [34]:
def precision_at_k(true_values, predicted_values, k):
    assert k <= len(predicted_values)
    top_k_predictions = predicted_values[:k]
    relevant_items = set(true_values)
    num_relevant_in_top_k = sum([1 for item in top_k_predictions if item in relevant_items])
    
    return num_relevant_in_top_k / k

In [35]:
def recall_at_k(true_values, predicted_values, k):
    assert k <= len(predicted_values)
    top_k_predictions = predicted_values[:k]
    relevant_items = set(true_values)
    num_relevant_in_top_k = sum([1 for item in top_k_predictions if item in relevant_items])
    
    return num_relevant_in_top_k / len(relevant_items)

In [45]:
precision_values = [precision_at_k(ground_truth[question], predicted_values[question], 5) for question in ground_truth]
recall_values = [recall_at_k(ground_truth[question], predicted_values[question], 5) for question in ground_truth]

sum(precision_values)/5, sum(recall_values)/5

(0.8, 0.8)