<a href="https://colab.research.google.com/github/moghanapriya-27/Datascience/blob/main/EX_4_SPACY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install spacy scikit-learn pandas

# Download English language model for spaCy
!python -m spacy download en_core_web_sm

# Import libraries
import pandas as pd
import numpy as np
import spacy
import re

# For vectorization and similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m94.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Load the dataset (make sure Reviews.csv is uploaded in /content/).
# Malformed rows are skipped rather than aborting the whole read.
df = pd.read_csv("/content/Reviews.csv", sep=',', on_bad_lines='skip', engine='python')

# Drop missing/null entries from 'Text' column
reviews = df['Text'].dropna()

# Limit dataset for faster processing (at most 1000 reviews).
# Capping the sample size at the number of available rows keeps sample()
# from raising when the file holds fewer than 1000 non-null reviews.
# The index is reset so row i of later matrices lines up with reviews position i.
reviews = reviews.sample(min(1000, len(reviews)), random_state=42).reset_index(drop=True)

reviews.head()

Unnamed: 0,Text
0,Despite this being 100% Arabica it had a sharp...
1,"I discovered Maldon salt about 10 years ago, e..."
2,These figs are firm on the outside and somewha...
3,"Absolutely outstanding, if you like the aroma ..."
4,"Well, I wasn't sure about this toy, what with ..."


In [None]:
# Load spaCy English model
# `nlp` is the shared pipeline object that preprocess_spacy() calls on every text.
# NOTE: the install output in the setup cell warns that the runtime may need a
# restart before the freshly downloaded en_core_web_sm package can be loaded.
nlp = spacy.load("en_core_web_sm")


In [None]:
# Matches HTML-style tags such as "<br />" or "</p>". A letter must follow the
# opening bracket so plain comparisons like "3 < 5" are left untouched.
_HTML_TAG_RE = re.compile(r"</?[A-Za-z][^<>]*>")


def preprocess_spacy(text):
    """Normalise raw review or query text into a space-separated lemma string.

    Steps: replace HTML tags with a space, lowercase, run the spaCy pipeline
    (`nlp`), keep only alphabetic non-stopword tokens, and join their lemmas.

    Args:
        text: Raw text string.

    Returns:
        str: Lemmas of the kept tokens joined by single spaces; an empty
        string when no token survives the filter.
    """
    # Reviews embed markup such as "<br />". Left in place, a tag can end up
    # fused with a neighbouring word during tokenization (e.g. "usa!<br"),
    # so that word fails is_alpha and is silently dropped; a bare "br" can
    # also leak into the vocabulary. Replacing tags with a space avoids both.
    text = _HTML_TAG_RE.sub(" ", text)
    # Convert to lowercase
    text = text.lower()
    # Process text with spaCy
    doc = nlp(text)
    # Keep lemmas of alphabetic, non-stopword tokens; the is_alpha check also
    # rejects punctuation and numbers, so no separate punctuation test is needed.
    return " ".join(
        token.lemma_ for token in doc
        if token.is_alpha and not token.is_stop
    )


In [None]:
# Run every sampled review through the spaCy cleaning function, element by
# element, keeping the same index as `reviews`.
reviews_cleaned = reviews.map(preprocess_spacy)
reviews_cleaned.head()


Unnamed: 0,Text
0,despite arabica sharpness flavor find blend in...
1,discover maldon salt year ago kitchen man cook...
2,fig firm outside somewhat soft inside feel hea...
3,absolutely outstanding like aroma bergamot des...
4,sure toy mixed review wilson beagle toy terrib...


In [None]:
# Fit a TF-IDF vectorizer (default settings) on the cleaned reviews and build
# the document-term matrix in the same call. `vectorizer` is kept so that
# search queries can later be transformed with the same fitted vocabulary.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(reviews_cleaned)

# Sanity check: (number of reviews, number of vocabulary terms).
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (1000, 4671)


In [None]:
def search_reviews_spacy(query, top_k=5):
    """Return the reviews most similar to `query` under TF-IDF cosine similarity.

    The query is cleaned with preprocess_spacy(), projected into the fitted
    TF-IDF space with the module-level `vectorizer`, and compared against
    every row of `tfidf_matrix`.

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to return. Values <= 0 yield an
            empty list.

    Returns:
        list[dict]: Up to `top_k` dicts ordered by descending similarity, each
        with keys "original_review", "cleaned_review" and "similarity_score".
        A query that shares no vocabulary with the corpus still returns
        `top_k` rows, all with a score of 0.
    """
    # Guard first: with top_k == 0 the slice [-0:] would select *every* row,
    # and a negative top_k would select an arbitrary tail of the ranking.
    if top_k <= 0:
        return []

    # Preprocess query using spaCy
    cleaned_query = preprocess_spacy(query)
    # Convert to TF-IDF vector
    query_vector = vectorizer.transform([cleaned_query])
    # Compute cosine similarity against every review (1-D array, one score per row)
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    # Indices of the top_k highest scores, best first
    top_indices = similarities.argsort()[::-1][:top_k]

    # Matrix rows are positional, so look reviews up by position (.iloc)
    # rather than by index label; this stays correct even if the Series
    # index is not a clean 0..n-1 range.
    return [
        {
            "original_review": reviews.iloc[idx],
            "cleaned_review": reviews_cleaned.iloc[idx],
            "similarity_score": similarities[idx],
        }
        for idx in top_indices
    ]


In [None]:
# Example searches: show the three best-matching reviews for each query.
queries = ["great product with fast shipping", "disappointed"]

for q in queries:
    # Header with the query text and a separator rule
    print(f"\nQuery: {q}\n{'='*50}")
    results = search_reviews_spacy(q, top_k=3)
    for res in results:
        # One print per hit; sep="\n" places each field on its own line and
        # the trailing "\n" in the last field leaves a blank line between hits.
        print(
            f"Score: {res['similarity_score']:.4f}",
            f"Original: {res['original_review']}",
            f"Cleaned: {res['cleaned_review']}\n",
            sep="\n",
        )



Query: great product with fast shipping
Score: 0.5545
Original: I'm giving this treat to my dogs for 3 years and they love it!  Very healthy and made in USA!<br />Very fast shipping too!
Cleaned: give treat dog year love healthy fast shipping

Score: 0.3918
Original: These are k-cups, great price and great flavor came really fast it was a good product. These are the best deal online for k-cups
Cleaned: k cup great price great flavor come fast good product good deal online k cup

Score: 0.3489
Original: The price was awesome, the shipping was incredibly fast.  I like this coffee better that most of the K-cup brands.  Just wish they able to be recycled.  But I would buy again from this company.
Cleaned: price awesome shipping incredibly fast like coffee well k cup brand wish able recycle buy company


Query: disappointed
Score: 0.3049
Original: you can never be disappointed by Lindt. Especially their Hazelnut ones.<br /><br />this one has full hazelnuts in it. crazily delicious. got to 