Amazon Review Opinion Search Engine

This notebook implements a complete Opinion Search Engine for Amazon reviews with multiple retrieval methods.

Project Overview

This implementation includes:

Baseline: Boolean search with aspect and opinion term matching

Method 1: Boolean + Rating search filtering

Method 2: ML Classification Filtering (Naive Bayes)

Method 3: Grammar/Sentence-Based Opinion Filtering

Method 4: TF-IDF Vector Similarity Ranking

Dataset

File: content/reviews_segment.csv

Fields Used: review_id, review_text, customer_review_rating

Size: ~210K reviews

## PART 1: Setup + Imports


In [30]:
# Core libraries, mainly for data handling and numerial operations within the colab
import pandas as pd
import numpy as np
import re
import os
from collections import defaultdict
from pathlib import Path

# Importing ML libraries for splitting and training data set
# Tfidf Vectorizer is included to convert text into feature vectors
# MultinomiaNB for Naive Bayes classifier for text classification for one of my methods
# cosine similarity to compute which TF-IDF vectors are similar
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity

# Importing core NLP tools I need for preprocessing the text
# word_tokenize and sent_tokenize help break the reviews into words and sentences
# stopwords gives me a list of common words to remove during cleaning
# WordNetLemmatizer helps reduce words to their base form for consistency
# pos_tag assigns part-of-speech tags to each token for more detailed preprocessing
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

# Downloading all required NLTK resources so the NLP steps run without errors
# This includes tokenizers, stopword lists, lemmatizer dictionaries, and POS tagging models
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

True

PART 2: Load + Preprocess Data

In [31]:
# Defining file paths in a dynamic way so nothing is hard-coded
# base_dir points to the current working directory, and csv_path builds the path to my review file
base_dir = Path.cwd()
csv_path = base_dir / "reviews_segment.csv"

# Loading a manageable portion of the dataset into a DataFrame so the notebook runs fast in Colab
# I’m only pulling the columns I actually need and limiting rows to 10,000 for quicker processing
# (This subset is enough to test all retrieval methods without crashing runtime)
print(f"Loading dataset from: {csv_path}")
df = pd.read_csv(
    csv_path,
    usecols=["review_id", "review_text", "customer_review_rating"],
    encoding="latin-1",
    dtype={"review_id": str},
    nrows=10000
)

# Printing out basic info to verify the file loaded correctly and the structure looks right
print(f"Loaded {len(df):,} reviews")
print(f"Columns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
df.head()

Loading dataset from: /content/reviews_segment.csv
Loaded 10,000 reviews
Columns: ['review_id', 'review_text', 'customer_review_rating']

First few rows:


Unnamed: 0,review_id,review_text,customer_review_rating
0,'R10019MUX6F9A','I\'ve had this product for about a month and ...,4
1,'R1002I943QCT20','I have been using the Macintosh OSX version o...,2
2,'R1003RILN06MX1','The Partition Manager is a great product. It\...,5
3,'R100523NBIQIEV','If you plan on getting this program go to htt...,3
4,'R1006KJEGKGV0O','I researched for months (on-line and in store...,5


In [32]:
# Cleaning up the review_id field by removing stray quotes and trimming whitespace
# (Some CSV exports include weird formatting, so this keeps IDs consistent)
df["review_id"] = df["review_id"].str.replace("'", "").str.strip()

# Quick check to see how many reviews are missing text or ratings before preprocessing
print(f"Missing review_text: {df['review_text'].isna().sum()}")
print(f"Missing ratings: {df['customer_review_rating'].isna().sum()}")

# Dropping any rows that don't have review text since they can't be used by any retrieval method
# Resetting the index afterwards so the DataFrame stays clean and sequential
df = df[df['review_text'].notna()].copy()
df = df.reset_index(drop=True)

# Showing the final count after basic cleanup
print(f"After cleaning: {len(df):,} reviews")


Missing review_text: 0
Missing ratings: 0
After cleaning: 10,000 reviews


In [33]:
# Initialize stopwords and lemmatizer
# These are the core components I'll use to clean and normalize the review text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Extended stopwords list (including common words), got this from the link to the github in the res_proj.pdf
# This removes very common pronouns, helper verbs, and filler words that don’t add meaning
extended_stopwords = stop_words.union({
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
    "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers",
    "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are",
    "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
    "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
    "while", "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from", "up", "down",
    "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here",
    "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
    "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
    "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
})

# Convert POS tag to wordnet format for lemmatization
# This makes lemmatization more accurate because it knows the correct part of speech
def get_wordnet_pos(tag):

    if tag.startswith('J'):
        return 'a'  # adjective
    elif tag.startswith('V'):
        return 'v'  # verb
    elif tag.startswith('N'):
        return 'n'  # noun
    elif tag.startswith('R'):
        return 'r'  # adverb
    else:
        return 'n'  # default to noun

"""
    Comprehensive preprocessing function below for:
    - Lowercase
    - Remove punctuation
    - Tokenize
    - Remove stopwords
    - Lemmatize
    """

def preprocess_text(text):
    if pd.isna(text) or text == "":
        return ""

    # Convert to lowercase
    text = str(text).lower()

    # Remove punctuation and special characters (keep alphanumeric and spaces)
    # this counts for smileys too which is mentioned to take care of while preprocessing
    text = re.sub(r"[^a-z0-9\s]", " ", text)

    # Splitting the cleaned text into individual word tokens
    tokens = word_tokenize(text)

    # POS tagging for better lemmatization
    pos_tags = pos_tag(tokens)

    # Lemmatizing tokens based on POS tags and filtering out custom stopwords
    lemmatized_words = []
    for word, pos_tag_val in pos_tags:
        if word.isalnum() and word not in extended_stopwords and len(word) > 1:
            wordnet_pos = get_wordnet_pos(pos_tag_val)
            lemmatized_word = lemmatizer.lemmatize(word, wordnet_pos)
            lemmatized_words.append(lemmatized_word)

    return " ".join(lemmatized_words)

print("Preprocessing function defined successfully!")


Preprocessing function defined successfully!


In [34]:
# Applying my preprocessing function to every review in the dataset
# This step cleans, tokenizes, removes stopwords, and lemmatizes the text
# (It can take a bit depending on how many rows I'm working with)
print("Preprocessing reviews... This may take a few minutes for large datasets...")
df["clean_text"] = df["review_text"].apply(preprocess_text)

# Dropping any rows where the cleaned text ended up empty after preprocessing
# This keeps the dataset meaningful and avoids running methods on blank reviews
df = df[df["clean_text"].str.strip() != ""].copy()
df = df.reset_index(drop=True)

# Showing how many valid reviews remain after the full cleaning pipeline
# Previewing some of the original vs cleaned text to verify everything looks correct
print(f"After preprocessing: {len(df):,} reviews with valid text")
print("\nSample preprocessed text:")
df[["review_id", "review_text", "clean_text", "customer_review_rating"]].head()


Preprocessing reviews... This may take a few minutes for large datasets...
After preprocessing: 10,000 reviews with valid text

Sample preprocessed text:


Unnamed: 0,review_id,review_text,clean_text,customer_review_rating
0,R10019MUX6F9A,'I\'ve had this product for about a month and ...,product month half instal device get 35 40 tel...,4
1,R1002I943QCT20,'I have been using the Macintosh OSX version o...,use macintosh osx version bridge baron 16 year...,2
2,R1003RILN06MX1,'The Partition Manager is a great product. It\...,partition manager great product easy use price...,5
3,R100523NBIQIEV,'If you plan on getting this program go to htt...,plan get program go http www bloodshed net get...,3
4,R1006KJEGKGV0O,'I researched for months (on-line and in store...,research month line store decide car seat many...,5


PART 3: Boolean Baseline Engine

Baseline Requirements:

Aspect term matching (OR logic): Match any aspect term

Aspect AND opinion matching: Must contain at least one aspect AND at least one opinion

Aspect OR opinion matching: Must contain at least aspect OR at least one opinion

I am building an inverted index (postings list) and use set operations for Boolean retrieval.

In [35]:
# Build inverted index (postings list) for efficient retrieval
# Key: word, Value: set of review_ids containing that word
postings = defaultdict(set)

print("Building inverted index...")
for row in df.itertuples():
    rid = row.review_id
    clean_text = row.clean_text

    # Making sure the clean_text isn't empty before processing
    if pd.notna(clean_text) and clean_text.strip():
        # Splitting the cleaned text into individual tokens and adding them to the index
        tokens = clean_text.split()
        for token in tokens:
            postings[token].add(rid)

# Quick summary of the inverted index to confirm it was built correctly
print(f"Inverted index built! Unique terms: {len(postings):,}")
print(f"Sample terms: {list(postings.keys())[:10]}")


Building inverted index...
Inverted index built! Unique terms: 33,549
Sample terms: ['product', 'month', 'half', 'instal', 'device', 'get', '35', '40', 'telemarketing', 'call']


In [36]:
# Preprocesses the query terms so they match the same cleaning/lemmatization used on the reviews
# This keeps the search consistent and avoids mismatches between user queries and the dataset
def preprocess_query_terms(terms):
    processed_terms = []
    for term in terms:
        # Preprocess each term
        processed = preprocess_text(term)
        if processed.strip():
            # Get all tokens from processed term
            tokens = processed.split()
            processed_terms.extend(tokens)
    return processed_terms

# Test 1: Aspect OR Aspect
# Returns any review that contains at least one of the aspect terms (logical OR)
def boolean_search_aspect_or(aspect_terms):
    aspect_tokens = preprocess_query_terms(aspect_terms)
    result = set()
    for token in aspect_tokens:
        if token in postings:
            result |= postings[token]
    return result

# Test 2: (Aspect OR Aspect) AND (Opinion OR Opinion)
# This is the baseline method described in the project: aspect must appear AND opinion must appear
def boolean_search_aspect_and_opinion(aspect_terms, opinion_terms):
    aspect_tokens = preprocess_query_terms(aspect_terms)
    opinion_tokens = preprocess_query_terms(opinion_terms)

    # Get all reviews with aspect terms (OR)
    aspect_reviews = set()
    for token in aspect_tokens:
        if token in postings:
            aspect_reviews |= postings[token]

    # Get all reviews with opinion terms (OR)
    opinion_reviews = set()
    for token in opinion_tokens:
        if token in postings:
            opinion_reviews |= postings[token]

    # Return intersection (AND)
    return aspect_reviews & opinion_reviews

# Test 3: Aspect OR Opinion
# Returns reviews containing at least one aspect term OR at least one opinion term
def boolean_search_aspect_or_opinion(aspect_terms, opinion_terms):
    aspect_tokens = preprocess_query_terms(aspect_terms)
    opinion_tokens = preprocess_query_terms(opinion_terms)

    result = set()
    all_tokens = aspect_tokens + opinion_tokens
    for token in all_tokens:
        if token in postings:
            result |= postings[token]
    return result

# Baseline wrapper for convenience (just calls Test 2)
# This is the core method used for evaluation in the project
def baseline(aspect_terms, opinion_terms):
    return boolean_search_aspect_and_opinion(aspect_terms, opinion_terms)

print("Boolean search functions defined!")


Boolean search functions defined!


PART 4: Advanced Methods

Method 1: Boolean + Rating Search

This method extends the baseline by filtering results based on star rating:

Positive opinion: Only retrieve reviews with rating > 3

Negative opinion: Only retrieve reviews with rating <= 3

The star rating serves as a polarity indicator.

In [37]:
# Define opinion polarity mapping
# Positive opinions: strong, useful, sharp
# Negative opinions: poor, problem, click (in negative context)
polarity = {
    "poor": "neg",
    "strong": "pos",
    "click": "neg",
    "problem": "neg",
    "useful": "pos",
    "sharp": "pos"
}

# Determines whether the opinion terms correspond to a positive or negative sentiment
# This feeds into the rating filter step in Method 1
def determine_opinion_polarity(opinion_terms):
    opinion_tokens = preprocess_query_terms(opinion_terms)
    if opinion_tokens:
        # Check each token against polarity dictionary
        for token in opinion_tokens:
            if token in polarity:
                return polarity[token]
        # Default: check original terms
        for term in opinion_terms:
            term_lower = term.lower()
            if term_lower in polarity:
                return polarity[term_lower]
    # Default to negative if unknown
    return "neg"

# Method 1: Boolean + Rating Filter
# Uses baseline retrieval first, then narrows results based on the review rating
def method1_rating_filter(aspect_terms, opinion_terms):
    # Get baseline results
    base_results = baseline(aspect_terms, opinion_terms)

    if not base_results:
        return set()

    # Determine opinion polarity
    opinion_polarity = determine_opinion_polarity(opinion_terms)

    # Create rating lookup dictionary for efficiency
    rating_dict = dict(zip(df["review_id"], df["customer_review_rating"]))

    filtered_results = set()
    for rid in base_results:
        if rid in rating_dict:
            rating = rating_dict[rid]
            # Handle non-numeric ratings
            try:
                rating = float(rating)
                if opinion_polarity == "pos":
                    # Positive opinion: rating > 3
                    if rating > 3:
                        filtered_results.add(rid)
                else:
                    # Negative opinion: rating <= 3
                    if rating <= 3:
                        filtered_results.add(rid)
            except (ValueError, TypeError):
                continue

    return filtered_results

print("Method 1 (Boolean + Rating) implemented!")


Method 1 (Boolean + Rating) implemented!


Method 2: ML Classification Filtering

Train a classifier (Naive Bayes) to classify reviews as positive/negative using star rating labels:

Label: Positive (rating > 3) or Negative (rating <= 3)

Features: TF-IDF vectorized text

Classifier: Multinomial Naive Bayes

Filtering: Apply classifier predictions to filter retrieved reviews

In [38]:
# Create labels based on star rating
df["label"] = (df["customer_review_rating"] > 3).astype(int)
# 1 = positive (rating > 3), 0 = negative (rating <= 3)

print("Labels created:")
print(f"Positive reviews (label=1): {df['label'].sum():,}")
print(f"Negative reviews (label=0): {(df['label'] == 0).sum():,}")


Labels created:
Positive reviews (label=1): 7,046
Negative reviews (label=0): 2,954


In [39]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"],
    df["label"],
    test_size=0.2,
    random_state=42
)

print(f"Training set: {len(X_train):,} reviews")
print(f"Test set: {len(X_test):,} reviews")


Training set: 8,000 reviews
Test set: 2,000 reviews


In [40]:
# TF-IDF Vectorization
print("Training TF-IDF vectorizer...")
vectorizer = TfidfVectorizer(max_features=5000, min_df=2, max_df=0.95)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print(f"TF-IDF features: {X_train_vec.shape[1]:,}")


Training TF-IDF vectorizer...
TF-IDF features: 5,000


In [41]:
# Train Naive Bayes Classifier
print("Training Naive Bayes classifier...")
clf = MultinomialNB(alpha=1.0)
clf.fit(X_train_vec, y_train)

# Evaluate on test set
train_score = clf.score(X_train_vec, y_train)
test_score = clf.score(X_test_vec, y_test)

print(f"Training accuracy: {train_score:.4f}")
print(f"Test accuracy: {test_score:.4f}")


Training Naive Bayes classifier...
Training accuracy: 0.7921
Test accuracy: 0.7565


In [42]:
# Predict labels for entire dataset (for filtering in Method 2)
print("Predicting labels for entire dataset...")
X_all_vec = vectorizer.transform(df["clean_text"])
df["pred_label"] = clf.predict(X_all_vec)

print(f"Predictions complete!")
print(f"Predicted positive: {df['pred_label'].sum():,}")
print(f"Predicted negative: {(df['pred_label'] == 0).sum():,}")


Predicting labels for entire dataset...
Predictions complete!
Predicted positive: 9,080
Predicted negative: 920


In [43]:
def method2_ml_classifier(aspect_terms, opinion_terms):
    # Method 2: ML Classification Filtering
    # - First retrieve reviews containing aspect terms (OR logic)
    # - Then filter by predicted sentiment from classifier
    # Get reviews with aspect terms
    aspect_tokens = preprocess_query_terms(aspect_terms)
    aspect_reviews = set()
    for token in aspect_tokens:
        if token in postings:
            aspect_reviews |= postings[token]

    if not aspect_reviews:
        return set()

    # Determine desired polarity based on opinion
    opinion_polarity = determine_opinion_polarity(opinion_terms)
    want_label = 1 if opinion_polarity == "pos" else 0

    # Create prediction lookup dictionary
    pred_dict = dict(zip(df["review_id"], df["pred_label"]))

    # Filter by predicted label
    filtered_results = set()
    for rid in aspect_reviews:
        if rid in pred_dict:
            if pred_dict[rid] == want_label:
                filtered_results.add(rid)

    return filtered_results

print("Method 2 (ML Classification) implemented!")


Method 2 (ML Classification) implemented!



Method 3: Grammar/Sentence-Based Opinion Filtering

Parse each review into sentences and only keep reviews where aspect and opinion appear:

Same sentence: Aspect and opinion in the same sentence

Window-based: Within N words of each other

Implementation: Use sentence tokenization and word window checking

In [44]:
def method3_sentence_filtering(aspect_terms, opinion_terms, window_size=10):
    # Method 3: Grammar/Sentence-Based Opinion Filtering

    # Only keep reviews where aspect and opinion terms appear:
    # - In the same sentence, OR
    # - Within a window of N words

    # Args:
    #     aspect_terms: List of aspect terms
    #     opinion_terms: List of opinion terms
    #     window_size: Maximum word distance between aspect and opinion (default: 10)

    # Returns:
    #     Set of review_ids that meet the criteria
    aspect_tokens = preprocess_query_terms(aspect_terms)
    opinion_tokens = preprocess_query_terms(opinion_terms)

    if not aspect_tokens or not opinion_tokens:
        return set()

    matching_reviews = set()

    # Process each review
    for row in df.itertuples():
        review_id = row.review_id
        review_text = row.review_text

        if pd.isna(review_text):
            continue

        # Split review into sentences
        sentences = sent_tokenize(str(review_text))

        # Check each sentence
        for sentence in sentences:
            sentence_lower = sentence.lower()

            # Preprocess sentence
            sentence_clean = preprocess_text(sentence)
            sentence_tokens = sentence_clean.split() if sentence_clean.strip() else []

            # Check if sentence contains any aspect token
            aspect_positions = []
            for i, token in enumerate(sentence_tokens):
                if token in aspect_tokens:
                    aspect_positions.append(i)

            # Check if sentence contains any opinion token
            opinion_positions = []
            for i, token in enumerate(sentence_tokens):
                if token in opinion_tokens:
                    opinion_positions.append(i)

            # If both aspect and opinion found in sentence
            if aspect_positions and opinion_positions:
                # Check if they are within window_size
                for a_pos in aspect_positions:
                    for o_pos in opinion_positions:
                        if abs(a_pos - o_pos) <= window_size:
                            matching_reviews.add(review_id)
                            break
                    if review_id in matching_reviews:
                        break
                if review_id in matching_reviews:
                    break

        # Also check across sentences using word window in full review
        if review_id not in matching_reviews:
            review_clean = preprocess_text(review_text)
            review_tokens = review_clean.split() if review_clean.strip() else []

            # Find all positions of aspect and opinion tokens
            aspect_positions = []
            opinion_positions = []
            for i, token in enumerate(review_tokens):
                if token in aspect_tokens:
                    aspect_positions.append(i)
                if token in opinion_tokens:
                    opinion_positions.append(i)

            # Check if any aspect and opinion are within window
            for a_pos in aspect_positions:
                for o_pos in opinion_positions:
                    if abs(a_pos - o_pos) <= window_size:
                        matching_reviews.add(review_id)
                        break
                if review_id in matching_reviews:
                    break

    return matching_reviews

print("Method 3 (Sentence-Based Filtering) implemented!")


Method 3 (Sentence-Based Filtering) implemented!


Method 4: TF-IDF Cosine Similarity Ranking

In [45]:
# Create TF-IDF vectorizer for Method 4 (separate from classifier vectorizer)
# This will be used for query-document similarity
print("Creating TF-IDF vectorizer for similarity ranking...")
method4_vectorizer = TfidfVectorizer(max_features=5000, min_df=2, max_df=0.95)
all_reviews_tfidf = method4_vectorizer.fit_transform(df["clean_text"])

print(f"TF-IDF matrix shape: {all_reviews_tfidf.shape}")
print("Vectorizer ready for similarity computation!")


Creating TF-IDF vectorizer for similarity ranking...
TF-IDF matrix shape: (10000, 5000)
Vectorizer ready for similarity computation!


In [46]:
def method4_tfidf_ranking(aspect_terms, opinion_terms, similarity_threshold=0.1):
    # Method 4: TF-IDF Vector Similarity Ranking

    # - Get baseline results (Aspect AND Opinion)
    # - Create query vector from aspect and opinion terms
    # - Compute cosine similarity with all reviews
    # - Rank and filter by similarity threshold

    # Args:
    #     aspect_terms: List of aspect terms
    #     opinion_terms: List of opinion terms
    #     similarity_threshold: Minimum similarity score to include (default: 0.1)

    # Returns:
    #     Set of review_ids ranked and filtered by similarity
    # Get baseline results first
    base_results = baseline(aspect_terms, opinion_terms)

    if not base_results:
        return set()

    # Create query text from aspect and opinion terms
    query_text = " ".join(aspect_terms + opinion_terms)
    query_clean = preprocess_text(query_text)

    if not query_clean.strip():
        return base_results

    # Vectorize query
    query_vector = method4_vectorizer.transform([query_clean])

    # Create index mapping for review_ids
    review_id_to_index = {rid: idx for idx, rid in enumerate(df["review_id"])}

    # Filter base results to those that exist in our dataset
    valid_results = [rid for rid in base_results if rid in review_id_to_index]

    if not valid_results:
        return set()

    # Compute similarities only for base results (more efficient)
    result_indices = [review_id_to_index[rid] for rid in valid_results]
    result_vectors = all_reviews_tfidf[result_indices]

    # Compute cosine similarity
    similarities = cosine_similarity(query_vector, result_vectors)[0]

    # Create mapping of review_id to similarity score
    similarity_scores = {}
    for idx, rid in enumerate(valid_results):
        similarity_scores[rid] = similarities[idx]

    # Filter by threshold and return
    filtered_results = {
        rid for rid, score in similarity_scores.items()
        if score >= similarity_threshold
    }

    return filtered_results

print("Method 4 (TF-IDF Ranking) implemented!")


Method 4 (TF-IDF Ranking) implemented!


PART 5: Evaluation Framework

Evaluation Metrics:

Precision = (# Relevant Retrieved) / (# Retrieved)

For manual evaluation, it provides retrieval counts and support code

Test Queries:

audio quality : poor

wifi signal : strong

mouse button : click problem

gps map : useful

image quality : sharp

In [47]:
# Defining test queries
test_queries = [
    (["audio", "quality"], ["poor"], "audio_quality"),
    (["wifi", "signal"], ["strong"], "wifi_signal"),
    (["mouse", "button"], ["click", "problem"], "mouse_button"),
    (["gps", "map"], ["useful"], "gps_map"),
    (["image", "quality"], ["sharp"], "image_quality")
]

print("Test queries defined:")
for aspect, opinion, name in test_queries:
    print(f"  - {name}: {aspect} : {opinion}")


Test queries defined:
  - audio_quality: ['audio', 'quality'] : ['poor']
  - wifi_signal: ['wifi', 'signal'] : ['strong']
  - mouse_button: ['mouse', 'button'] : ['click', 'problem']
  - gps_map: ['gps', 'map'] : ['useful']
  - image_quality: ['image', 'quality'] : ['sharp']


In [48]:
def evaluate_results(retrieved_set, relevant_set=None):
    """
    Evaluate retrieval results.

    Args:
        retrieved_set: Set of retrieved review_ids
        relevant_set: Set of relevant review_ids (optional, for manual evaluation)

    Returns:
        Dictionary with evaluation metrics
    """
    num_retrieved = len(retrieved_set)

    if relevant_set is not None:
        num_relevant = len(relevant_set)
        num_relevant_retrieved = len(retrieved_set & relevant_set)
        precision = num_relevant_retrieved / num_retrieved if num_retrieved > 0 else 0.0
    else:
        num_relevant = None
        num_relevant_retrieved = None
        precision = None

    return {
        "num_retrieved": num_retrieved,
        "num_relevant": num_relevant,
        "num_relevant_retrieved": num_relevant_retrieved,
        "precision": precision,
        "retrieved_ids": retrieved_set
    }

print("Evaluation function defined!")


Evaluation function defined!


PART 6: Run All Tests and Generate Results

Run all test queries through all methods and generate comparison table.

In [29]:
# Initialize results storage
results_data = []

print("Running all test queries through all methods...")
print("-" * 80)

for aspect_terms, opinion_terms, query_name in test_queries:
    print(f"\nQuery: {' '.join(aspect_terms)} : {' '.join(opinion_terms)}")
    print("-" * 80)

    # Baseline (boolean)
    baseline_results = baseline(aspect_terms, opinion_terms)
    baseline_eval = evaluate_results(baseline_results)
    print(f"Baseline: {baseline_eval['num_retrieved']} retrieved")

    # Method 1: Rating Filter
    method1_results = method1_rating_filter(aspect_terms, opinion_terms)
    method1_eval = evaluate_results(method1_results)
    print(f"Method 1 (Rating): {method1_eval['num_retrieved']} retrieved")

    # Method 2: ML Classifier
    method2_results = method2_ml_classifier(aspect_terms, opinion_terms)
    method2_eval = evaluate_results(method2_results)
    print(f"Method 2 (ML Classifier): {method2_eval['num_retrieved']} retrieved")

    # Method 3: Sentence Filtering
    method3_results = method3_sentence_filtering(aspect_terms, opinion_terms)
    method3_eval = evaluate_results(method3_results)
    print(f"Method 3 (Sentence): {method3_eval['num_retrieved']} retrieved")

    # Method 4: TF-IDF Ranking
    method4_results = method4_tfidf_ranking(aspect_terms, opinion_terms)
    method4_eval = evaluate_results(method4_results)
    print(f"Method 4 (TF-IDF): {method4_eval['num_retrieved']} retrieved")

    # Store results
    # Format Query as: ['aspect', 'term'] : ['opinion']
    query_str = f"{aspect_terms} : {opinion_terms}"

    # Initialize with placeholder values (matching reference.ipynb approach)
    # Rel = Ret initially, Prec = 1.0 initially (to be filled manually later)
    results_data.append({
        "Query": query_str,  # Will display as ['audio', 'quality'] : ['poor']
        "Query_Name": query_name,
        "Baseline_Ret": baseline_eval['num_retrieved'],
        "Baseline_Rel": baseline_eval['num_retrieved'],  # Placeholder: you fill manually in report
        "Baseline_Prec": 1.0,  # Placeholder: will be recalculated when Rel is updated
        "Method1_Ret": method1_eval['num_retrieved'],
        "Method1_Rel": method1_eval['num_retrieved'],  # Placeholder
        "Method1_Prec": 1.0,  # Placeholder
        "Method2_Ret": method2_eval['num_retrieved'],
        "Method2_Rel": method2_eval['num_retrieved'],  # Placeholder
        "Method2_Prec": 1.0,  # Placeholder
        "Method3_Ret": method3_eval['num_retrieved'],
        "Method3_Rel": method3_eval['num_retrieved'],  # Placeholder
        "Method3_Prec": 1.0,  # Placeholder
        "Method4_Ret": method4_eval['num_retrieved'],
        "Method4_Rel": method4_eval['num_retrieved'],  # Placeholder
        "Method4_Prec": 1.0,  # Placeholder
        # Store result sets for file generation
        "baseline_ids": baseline_results,
        "method1_ids": method1_results,
        "method2_ids": method2_results,
        "method3_ids": method3_results,
        "method4_ids": method4_results
    })

print("\n" + "-" * 80)
print("All queries completed!")


Running all test queries through all methods...
--------------------------------------------------------------------------------

Query: audio quality : poor
--------------------------------------------------------------------------------
Baseline: 68 retrieved
Method 1 (Rating): 45 retrieved
Method 2 (ML Classifier): 85 retrieved
Method 3 (Sentence): 56 retrieved
Method 4 (TF-IDF): 39 retrieved

Query: wifi signal : strong
--------------------------------------------------------------------------------
Baseline: 10 retrieved
Method 1 (Rating): 6 retrieved
Method 2 (ML Classifier): 145 retrieved
Method 3 (Sentence): 10 retrieved
Method 4 (TF-IDF): 9 retrieved

Query: mouse button : click problem
--------------------------------------------------------------------------------
Baseline: 167 retrieved
Method 1 (Rating): 57 retrieved
Method 2 (ML Classifier): 56 retrieved
Method 3 (Sentence): 80 retrieved
Method 4 (TF-IDF): 68 retrieved

Query: gps map : useful
----------------------------

In [49]:
# Create results df
results_df = pd.DataFrame(results_data)

# Display result
display_cols = ["Query", "Baseline_Ret", "Method1_Ret", "Method2_Ret", "Method3_Ret", "Method4_Ret"]
print("\nRetrieval Counts:")
print("-" * 100)
results_df[display_cols]



Retrieval Counts:
----------------------------------------------------------------------------------------------------


Unnamed: 0,Query,Baseline_Ret,Method1_Ret,Method2_Ret,Method3_Ret,Method4_Ret
0,"['audio', 'quality'] : ['poor']",68,45,85,56,39
1,"['wifi', 'signal'] : ['strong']",10,6,145,10,9
2,"['mouse', 'button'] : ['click', 'problem']",167,57,56,80,68
3,"['gps', 'map'] : ['useful']",11,8,120,3,5
4,"['image', 'quality'] : ['sharp']",46,37,1051,21,29


PART 7: Generate Output Files

Each file contains review_ids (one per line) for manual evaluation.

In [50]:
# Create output directory if it doesn't exist
output_dir = base_dir / "output"
output_dir.mkdir(exist_ok=True)

print(f"Output directory: {output_dir}")
print("\nGenerating output files...")

# Method names for file suffixes (following format: {aspect}_{test}.txt)
method_names = {
    "baseline": "test0",  # Baseline is test0
    "method1": "test1",
    "method2": "test2",
    "method3": "test3",
    "method4": "test4"
}

# Generate files for each query and method
files_generated = []

for row in results_data:
    query_name = row["Query_Name"]

    # Baseline
    baseline_file = output_dir / f"{query_name}_{method_names['baseline']}.txt"
    with open(baseline_file, 'w', encoding='utf-8') as f:
        for rid in sorted(row["baseline_ids"]):
            f.write(f"{rid}\n")
    files_generated.append(baseline_file.name)
    print(f"  Generated: {baseline_file.name} ({len(row['baseline_ids'])} reviews)")

    # Method 1
    method1_file = output_dir / f"{query_name}_{method_names['method1']}.txt"
    with open(method1_file, 'w', encoding='utf-8') as f:
        for rid in sorted(row["method1_ids"]):
            f.write(f"{rid}\n")
    files_generated.append(method1_file.name)
    print(f"  Generated: {method1_file.name} ({len(row['method1_ids'])} reviews)")

    # Method 2
    method2_file = output_dir / f"{query_name}_{method_names['method2']}.txt"
    with open(method2_file, 'w', encoding='utf-8') as f:
        for rid in sorted(row["method2_ids"]):
            f.write(f"{rid}\n")
    files_generated.append(method2_file.name)
    print(f"  Generated: {method2_file.name} ({len(row['method2_ids'])} reviews)")

    # Method 3
    method3_file = output_dir / f"{query_name}_{method_names['method3']}.txt"
    with open(method3_file, 'w', encoding='utf-8') as f:
        for rid in sorted(row["method3_ids"]):
            f.write(f"{rid}\n")
    files_generated.append(method3_file.name)
    print(f"  Generated: {method3_file.name} ({len(row['method3_ids'])} reviews)")

    # Method 4
    method4_file = output_dir / f"{query_name}_{method_names['method4']}.txt"
    with open(method4_file, 'w', encoding='utf-8') as f:
        for rid in sorted(row["method4_ids"]):
            f.write(f"{rid}\n")
    files_generated.append(method4_file.name)
    print(f"  Generated: {method4_file.name} ({len(row['method4_ids'])} reviews)")

print(f"\nTotal files generated: {len(files_generated)}")


Output directory: /content/output

Generating output files...
  Generated: audio_quality_test0.txt (68 reviews)
  Generated: audio_quality_test1.txt (45 reviews)
  Generated: audio_quality_test2.txt (85 reviews)
  Generated: audio_quality_test3.txt (56 reviews)
  Generated: audio_quality_test4.txt (39 reviews)
  Generated: wifi_signal_test0.txt (10 reviews)
  Generated: wifi_signal_test1.txt (6 reviews)
  Generated: wifi_signal_test2.txt (145 reviews)
  Generated: wifi_signal_test3.txt (10 reviews)
  Generated: wifi_signal_test4.txt (9 reviews)
  Generated: mouse_button_test0.txt (167 reviews)
  Generated: mouse_button_test1.txt (57 reviews)
  Generated: mouse_button_test2.txt (56 reviews)
  Generated: mouse_button_test3.txt (80 reviews)
  Generated: mouse_button_test4.txt (68 reviews)
  Generated: gps_map_test0.txt (11 reviews)
  Generated: gps_map_test1.txt (8 reviews)
  Generated: gps_map_test2.txt (120 reviews)
  Generated: gps_map_test3.txt (3 reviews)
  Generated: gps_map_test4.t

PART 8: Final Comparison Table

Summary table with retrieval counts for all methods across all queries.

In [61]:
# Create comprehensive comparison table
# Precision formula: Precision = (# Relevant Retrieved) / (# Retrieved)

def calc_precision(retrieved, relevant):
    """Calculate precision: (# Relevant Retrieved) / (# Retrieved)"""
    if relevant is None or retrieved is None:
        return None
    if retrieved == 0:
        return 0.0
    return round(relevant / retrieved, 1)  # Round to 1 decimal place

comparison_data = []

for row in results_data:
    comparison_data.append({
        "Query": row["Query"],
        # Baseline
        "Baseline Ret": row["Baseline_Ret"],
        "Baseline Rel": row["Baseline_Rel"],
        "Baseline Prec": calc_precision(row["Baseline_Ret"], row["Baseline_Rel"]),
        # Method 1
        "M1 Ret": row["Method1_Ret"],
        "M1 Rel": row["Method1_Rel"],
        "M1 Prec": calc_precision(row["Method1_Ret"], row["Method1_Rel"]),
        # Method 2
        "M2 Ret": row["Method2_Ret"],
        "M2 Rel": row["Method2_Rel"],
        "M2 Prec": calc_precision(row["Method2_Ret"], row["Method2_Rel"]),
        # Method 3
        "M3 Ret": row["Method3_Ret"],
        "M3 Rel": row["Method3_Rel"],
        "M3 Prec": calc_precision(row["Method3_Ret"], row["Method3_Rel"]),
        # Method 4
        "M4 Ret": row["Method4_Ret"],
        "M4 Rel": row["Method4_Rel"],
        "M4 Prec": calc_precision(row["Method4_Ret"], row["Method4_Rel"])
    })

comparison_df = pd.DataFrame(comparison_data)

print("\n" + "-" * 150)
print("COMPREHENSIVE RESULTS TABLE")
print("-" * 150)

# Display the full table with all columns in the requested format
comparison_df


------------------------------------------------------------------------------------------------------------------------------------------------------
COMPREHENSIVE RESULTS TABLE
------------------------------------------------------------------------------------------------------------------------------------------------------


Unnamed: 0,Query,Baseline Ret,Baseline Rel,Baseline Prec,M1 Ret,M1 Rel,M1 Prec,M2 Ret,M2 Rel,M2 Prec,M3 Ret,M3 Rel,M3 Prec,M4 Ret,M4 Rel,M4 Prec
0,"['audio', 'quality'] : ['poor']",68,22,0.3,45,45,1.0,85,85,1.0,56,56,1.0,39,39,1.0
1,"['wifi', 'signal'] : ['strong']",10,10,1.0,6,6,1.0,145,145,1.0,10,10,1.0,9,9,1.0
2,"['mouse', 'button'] : ['click', 'problem']",167,167,1.0,57,57,1.0,56,56,1.0,80,80,1.0,68,68,1.0
3,"['gps', 'map'] : ['useful']",11,11,1.0,8,8,1.0,120,120,1.0,3,3,1.0,5,5,1.0
4,"['image', 'quality'] : ['sharp']",46,46,1.0,37,37,1.0,1051,1051,1.0,21,21,1.0,29,29,1.0
