  NLP with spaCy

Upload, Extract and Read Data

In [14]:
from google.colab import files
import bz2
import pandas as pd

# Upload the compressed review file from the local machine (Colab file picker).
uploaded = files.upload()  # upload your .bz2 file here

# Fail with a clear message if the upload dialog was dismissed without a file.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a .bz2 file.")

# Use the first key of the uploaded dict as the filename.
filename = next(iter(uploaded))

# Number of leading lines of the file to scan for the demo.
MAX_LINES = 200

# Each usable line looks like "__label__<n> <review text>"; split it into
# the label token and the review body.
reviews = []
labels = []

# errors='ignore' silently drops bytes that are not valid UTF-8.
with bz2.open(filename, 'rt', encoding='utf-8', errors='ignore') as f:
    for i, line in enumerate(f):
        if i >= MAX_LINES:
            break
        # partition() never raises, unlike unpacking split(' ', 1), when a
        # line holds only a label and no review text.
        label, _, review = line.strip().partition(' ')
        if label.startswith('__label__') and review:
            reviews.append(review)
            labels.append(label)

# Create DataFrame with one row per parsed review
df = pd.DataFrame({'label': labels, 'review': reviews})
df.head()


Saving test.ft.txt.bz2 to test.ft.txt (1).bz2


Unnamed: 0,label,review
0,__label__2,Great CD: My lovely Pat has one of the GREAT v...
1,__label__2,One of the best game music soundtracks - for a...
2,__label__1,Batteries died within a year ...: I bought thi...
3,__label__2,"works fine, but Maha Energy is better: Check o..."
4,__label__2,Great for the non-audiophile: Reviewed quite a...


Install and Load NLP Tools


In [None]:
!pip install spacy nltk
!python -m spacy download en_core_web_sm

import spacy
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Load models
nlp = spacy.load("en_core_web_sm")
nltk.download("vader_lexicon")
sid = SentimentIntensityAnalyzer()


Perform NER + Sentiment Analysis

In [17]:
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from spacy.matcher import PhraseMatcher

# Sentiment side: download the VADER lexicon, then construct the analyzer.
nltk.download("vader_lexicon")
sid = SentimentIntensityAnalyzer()

# Linguistic side: the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# Optional: hand-curated brand / product names to look for in the reviews.
known_brands_products = [
    "Maha Energy",
    "Powerex MH-C204F",
    "Chrono Trigger",
    "Sony",
    "Logitech",
    "Dell",
    "Apple",
    "Beats",
    "Pat",
]

# Phrase matcher keyed on the LOWER token attribute, registered under a
# single "BRAND_PRODUCT" rule with one tokenized pattern per known name.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
brand_patterns = list(map(nlp.make_doc, known_brands_products))
matcher.add("BRAND_PRODUCT", brand_patterns)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [18]:
def _sentiment_label(compound, threshold=0.05):
    """Map a VADER compound score to a sentiment label.

    Args:
        compound: The 'compound' value from ``polarity_scores``.
        threshold: Half-width of the neutral band around zero. 0.05 is the
            cut-off suggested in the VADER documentation.

    Returns:
        "Positive" if compound >= threshold, "Negative" if
        compound <= -threshold, otherwise "Neutral".
    """
    if compound >= threshold:
        return "Positive"
    if compound <= -threshold:
        return "Negative"
    return "Neutral"


# Analyze the first 5 reviews
for review_no, review_text in enumerate(df['review'].head(5), start=1):
    print(f"\nReview {review_no}: {review_text}")

    # Apply spaCy NLP pipeline
    doc = nlp(review_text)

    # Keep only the named entities labelled PRODUCT or ORG (brands/products)
    ner_entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ("PRODUCT", "ORG")]

    # Custom phrase match for known brands/products; de-duplicate and sort so
    # the printed list is the same on every run.
    matched_terms = sorted({doc[start:end].text for _, start, end in matcher(doc)})

    # Sentiment analysis: a zero or near-zero compound score is reported as
    # Neutral instead of being lumped in with Negative.
    score = sid.polarity_scores(review_text)
    sentiment = _sentiment_label(score['compound'])

    # Output
    print("Named Entities (PRODUCT/ORG):", ner_entities)
    print("Custom Matched Brands/Products:", matched_terms)
    print("Sentiment:", sentiment)



Review 1: Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I'm in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life's hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"
Named Entities (PRODUCT/ORG): [('GREAT', 'ORG'), ('EVERYBODY', 'ORG')]
Custom Matched Brands/Products: ['Pat']
Sentiment: Positive

Review 2: One of the best game music soundtracks - for a game I didn't really play: Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is a