In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pavankumarkotapally/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pavankumarkotapally/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
df=pd.read_csv("products.csv")
df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [6]:
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Lemmatize text
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    
    return text

# Apply the cleaning function to the product_name column
df['cleaned_product_name'] = df['product_name'].apply(clean_text)


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
def find_similar_products(input_product_name, df, top_n=10):
    # Clean the input product name
    cleaned_input = clean_text(input_product_name)  # Use your defined clean_text function

    # Create a DataFrame for the cleaned input to concatenate with the original df
    input_df = pd.DataFrame({'cleaned_product_name': [cleaned_input]})

    # Use pd.concat to combine the original df with the new input_df
    temp_df = pd.concat([df, input_df], ignore_index=True)

    # Proceed with your TF-IDF vectorization and cosine similarity calculation
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(temp_df['cleaned_product_name'])
    
    # Calculate cosine similarity with the last item (input product)
    cosine_sim = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])
    
    # Get the indices of the top N similar products
    indices = cosine_sim.argsort()[0][-top_n:][::-1]
    
    # Return the top N similar products' names and their similarity scores
    similar_products = df.iloc[indices]
    similar_products_scores = cosine_sim[0][indices]
    
    return similar_products['product_name'], similar_products_scores

# Example usage
input_product_name = "Ice Cream"
similar_products, scores = find_similar_products(input_product_name, df, 10)
for product, score in zip(similar_products, scores):
    print(f"{product}: {score}")

Ice Cream: 1.0
Ice Cream Cake Ice Cream: 0.920664999193916
Ice Cream, Chocolate: 0.8700409168179906
Chocolate Ice Cream: 0.8700409168179906
Ice Cream Chocolate: 0.8700409168179906
Sweet Cream Ice Cream: 0.8542200890467642
Ice Cream Bars: 0.8441625691704493
Cookies N Cream Ice Cream: 0.8425304712725254
Cookies 'N Cream Ice Cream: 0.8425304712725254
Ice Cream, Cookies & Cream: 0.8425304712725254


In [9]:
pip install python-Levenshtein


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [21]:
from fuzzywuzzy import process
import pandas as pd
def find_similar_products(input_product_name, df, top_n=10):
    # Extract product names to a list
    product_names = df['product_name'].tolist()
    
    # Use fuzzywuzzy's process to find matches
    results = process.extract(input_product_name, product_names, limit=top_n)
    
    # Convert results to DataFrame for nicer output and potentially further processing
    similar_products_df = pd.DataFrame(results, columns=['Product Name', 'Similarity Score'])
    
    return similar_products_df


In [22]:
input_product_name = "basmati rice"
similar_products_df = find_similar_products(input_product_name, df, 10)
print(similar_products_df)


                                        Product Name  Similarity Score
0                                       Basmati Rice               100
1                                  Aged Basmati Rice                95
2           Chicken Curry with Seasoned Basmati Rice                90
3  World Cuisine Certified Halal Chicken Tikka Ma...                90
4                         Organic White Basmati Rice                90
5  Chicken Tikka Masala with Cumin Infused Basmat...                90
6                 Chicken Vindaloo with Basmati Rice                90
7                                Smoked Basmati Rice                90
8            Peas Pulav Basmati Rice With Green Peas                90
9              Organic California White Basmati Rice                90


In [37]:
import os

print(os.environ['OPENAPIKEY'])

KeyError: 'OPENAPIKEY'