<a href="https://colab.research.google.com/github/moghanapriya-27/Datascience/blob/main/ex_4a_231801102.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

!pip install nltk scikit-learn

import pandas as pd
import numpy as np
import nltk
import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity




In [None]:
# Fetch the NLTK resources used below: the English stopword list and the
# Punkt tokenizer models that back word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from the 'punkt_tab' resource rather
# than the pickled 'punkt' models; without it word_tokenize raises LookupError
# on a fresh runtime. On older releases this extra download is harmless.
nltk.download('punkt_tab')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Upload "Reviews.csv" to /content/ using the Colab file browser before running this cell.
# engine='python' together with on_bad_lines='skip' drops rows the parser cannot
# read instead of aborting the whole load.
df = pd.read_csv("/content/Reviews.csv", engine='python', on_bad_lines='skip')

# Select the review text column and discard missing entries.
# Kept under its own name so re-running this cell always samples from the full set.
reviews_all = df['Text'].dropna()

# Limit to at most 10,000 reviews for faster processing. The size is capped at the
# number of available rows because Series.sample raises ValueError when asked for
# more rows than exist (without replacement).
sample_size = min(10000, len(reviews_all))
reviews = reviews_all.sample(sample_size, random_state=42).reset_index(drop=True)

reviews.head()

Unnamed: 0,Text
0,I enjoyed the popcorn at our minor league ball...
1,"If you drink coconut water, this is the best c..."
2,This gum tastes good at first but it has an 'i...
3,UNLIKE THE PHONEY LOW CARB AND NO CARB(ACTUALL...
4,"Before I got this drawer, boxes of pods were t..."


In [None]:
# Build the English stopword lookup once; a set gives constant-time membership
# tests when filtering tokens.
stop_words = {*stopwords.words('english')}
print(f"Loaded {len(stop_words)} stopwords")


Loaded 198 stopwords


In [None]:
def preprocess_text(text):
    """Normalise a piece of review text for TF-IDF vectorisation.

    Steps: lowercase, replace every character that is not an ASCII letter or
    whitespace with a space, tokenize with NLTK's ``word_tokenize``, drop
    tokens found in the module-level ``stop_words`` set, and re-join the
    remaining tokens with single spaces.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or NaN) are
            treated as empty input.

    Returns:
        str: The cleaned, space-separated tokens ("" if nothing remains).
    """
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()
    # Replace punctuation, digits and special characters with a space rather
    # than deleting them, so words separated only by punctuation
    # ("faster!Great", "CARB(ACTUALLY") are not fused into a single token.
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Tokenize. The tokenizer data is only fetched if the real call fails,
    # instead of probing with a throw-away tokenization on every invocation.
    try:
        tokens = word_tokenize(text)
    except LookupError:
        print("Downloading punkt tokenizer...")
        nltk.download('punkt')
        # Newer NLTK releases look for 'punkt_tab' instead of 'punkt'.
        nltk.download('punkt_tab')
        tokens = word_tokenize(text)

    # Remove stopwords and join back
    return " ".join(word for word in tokens if word not in stop_words)

In [None]:
# Run the cleaning pipeline element-wise over the sampled reviews; the result
# keeps the same index as `reviews`.
reviews_cleaned = reviews.map(preprocess_text)
reviews_cleaned.head()


Unnamed: 0,Text
0,enjoyed popcorn minor league ballpark much cra...
1,drink coconut water best commercial one doubt ...
2,gum tastes good first interesting tomato after...
3,unlike phoney low carb carbactually loaded car...
4,got drawer boxes pods taking cabinet like scif...


In [None]:
# Learn the vocabulary and IDF weights from the cleaned reviews and encode them
# in one pass. `vectorizer` is kept so that search queries can later be
# transformed with the same fitted vocabulary.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(reviews_cleaned)

# Report the dimensions of the resulting matrix as a quick sanity check.
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (10000, 25813)


In [None]:
def search_reviews(query, top_k=5):
    """Return the reviews most similar to ``query`` by TF-IDF cosine similarity.

    The query is cleaned with ``preprocess_text``, projected into the fitted
    ``vectorizer`` space and compared against every row of ``tfidf_matrix``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to return. Values <= 0 give an empty
            list; values larger than the corpus return every review.

    Returns:
        list[dict]: One dict per hit with the keys ``"original_review"``,
        ``"cleaned_review"`` and ``"similarity_score"``, ordered from most to
        least similar.
    """
    if top_k <= 0:
        # Guard: the slice [-0:] would otherwise select the whole corpus.
        return []

    # Preprocess query and convert it to a vector in the fitted vocabulary
    cleaned_query = preprocess_text(query)
    query_vector = vectorizer.transform([cleaned_query])

    # Compute cosine similarity against every review
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Sort descending. A stable sort on the negated scores makes the order of
    # tied scores deterministic (earlier rows first).
    top_indices = (-similarities).argsort(kind="stable")[:top_k]

    # Matrix rows are positional, so look reviews up by position (.iloc) rather
    # than by index label. Assumes `reviews` and `reviews_cleaned` are in the
    # same order as the rows of `tfidf_matrix`.
    return [
        {
            "original_review": reviews.iloc[idx],
            "cleaned_review": reviews_cleaned.iloc[idx],
            "similarity_score": similarities[idx],
        }
        for idx in top_indices
    ]


In [None]:
# Example searches used to demonstrate the retrieval function.
queries = ["great product with fast shipping", "disappointed"]

for q in queries:
    # Header: the query text followed by a 50-character rule.
    print(f"\nQuery: {q}\n{'='*50}")
    results = search_reviews(q, top_k=3)
    for res in results:
        # One print per hit; the trailing "\n" plus print's own newline
        # leaves a blank line between hits.
        print(
            f"Score: {res['similarity_score']:.4f}\n"
            f"Original: {res['original_review']}\n"
            f"Cleaned: {res['cleaned_review']}\n"
        )



Query: great product with fast shipping
Score: 0.4704
Original: I was pleased with the products I had recieved from them.  Very fast shipping!
Cleaned: pleased products recieved fast shipping

Score: 0.4424
Original: This item came very fast! And my French Bulldog ate this stuff even faster!Great product, great price, fast shipping. I recommend this to all dog lovers for a healthy dog treat!
Cleaned: item came fast french bulldog ate stuff even fastergreat product great price fast shipping recommend dog lovers healthy dog treat

Score: 0.4320
Original: After no longer being able to find this product in our local stores. We are so happy that we found a place to buy the chicken helper cheesy chicken enchilda. This product is no only delicious but a fast easy meal. I use can chicken and that makes this a super fast meal. The shipping was really fast too. Thanks
Cleaned: longer able find product local stores happy found place buy chicken helper cheesy chicken enchilda product delicious fa