In [None]:
# Install required libraries
# In Google Colab, spaCy may not be installed by default.
!pip -q install nltk spacy
# spaCy models (like en_core_web_sm) are separate downloads.
!python -m spacy download en_core_web_sm -q

import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Counter is a dictionary-like class that counts occurrences of items.
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# spacy.load loads a trained pipeline: tokenizer + tagger + lemmatizer, etc.
# "en_core_web_sm" is the small English model
nlp = spacy.load('en_core_web_sm')
print('✅ All libraries loaded successfully!')

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/12.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/12.8 MB[0m [31m10.2 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/12.8 MB[0m [31m51.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m10.3/12.8 MB[0m [31m109.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m12.8/12.8 MB[0m [31m197.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m93.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may nee

Question 1:

In [None]:
# Customer reviews used as the corpus for Question 1.
reviews = [
    "I love the battery life, it lasts for days!",
    "The battery is NOT charging. I am very angry.",
    "Charger broke after one week. Unacceptable quality.",
    "Best smartwatch ever. Tracks my running and sleeping perfectly.",
    "The run tracking is inaccurate and the sleep tracker is worse.",
    "Touchscreen is lagging and slow.",
    "I was waiting for a refund for two weeks.",
    "Don't buy this. It is a waste of money.",
    "Amazing features but the strap is uncomfortable.",
    "Support helped me fix the sync issue. Friendly service!"
]

# Running example: the review at index 4 (the fifth entry in the list).
example = reviews[4]
print("Our example review:", f'  "{example}"', sep="\n")

Our example review:
  "The run tracking is inaccurate and the sleep tracker is worse."


In [None]:
# Split the example review into tokens with NLTK's word_tokenize.
tokens = word_tokenize(example)

# Compare a plain whitespace split (word count) with the tokenizer's output.
print(
    f"Original text  ({len(example.split())} words):",
    f'  "{example}"\n',
    sep="\n",
)
print(f"Tokens ({len(tokens)} total):", tokens, sep="\n")

Original text  (11 words):
  "The run tracking is inaccurate and the sleep tracker is worse."

Tokens (12 total):
['The', 'run', 'tracking', 'is', 'inaccurate', 'and', 'the', 'sleep', 'tracker', 'is', 'worse', '.']


In [None]:
# Convert the stopword list to a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Keep alphabetic tokens only, lowercase them, then drop stopwords.
# Note: str.isalpha() also rejects tokens containing digits, hyphens or
# apostrophes, not just pure punctuation.
filtered_tokens = [
    lowered
    for lowered in (token.lower() for token in tokens if token.isalpha())
    if lowered not in stop_words
]

print("After removing stopwords and punctuation:", filtered_tokens, sep="\n")
print(f"\nWent from {len(tokens)} tokens → {len(filtered_tokens)} tokens")

After removing stopwords and punctuation:
['run', 'tracking', 'inaccurate', 'sleep', 'tracker', 'worse']

Went from 12 tokens → 6 tokens


In [None]:
# Which words count as stopwords? Show the first 30 in sorted order.
sample_stopwords = sorted(stop_words)[:30]
print("Sample stopwords (first 30):", sample_stopwords, sep="\n")

Sample stopwords (first 30):
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't"]


In [None]:
# Reduce every filtered token to its Porter stem.
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, filtered_tokens))

# Show each token next to its stem (token left-aligned in 15 columns).
print("Original → Stemmed:")
for original_word, stemmed_word in zip(filtered_tokens, stemmed):
    print(f"  {original_word:15s} → {stemmed_word}")

Original → Stemmed:
  love            → love
  battery         → batteri
  life            → life
  lasts           → last
  days            → day


Question 2:

In [None]:
def edit_distance(s1, s2):
    """Return the Levenshtein edit distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-character deletions,
    insertions and substitutions needed to turn ``s1`` into ``s2``.
    ``table[i][j]`` holds the distance between the first ``i`` characters
    of ``s1`` and the first ``j`` characters of ``s2``.
    """
    # First column: deleting i characters reaches the empty string.
    table = [[i] + [0] * len(s2) for i in range(len(s1) + 1)]
    # First row: inserting j characters builds the prefix from nothing.
    table[0] = list(range(len(s2) + 1))

    for i, char1 in enumerate(s1, start=1):
        above, current = table[i - 1], table[i]
        for j, char2 in enumerate(s2, start=1):
            if char1 == char2:
                # Matching characters cost nothing: carry the diagonal over.
                current[j] = above[j - 1]
                continue
            # Otherwise pay 1 for the cheapest of the three operations.
            current[j] = 1 + min(
                above[j],        # deletion
                current[j - 1],  # insertion
                above[j - 1],    # substitution
            )
    return table[-1][-1]

In [None]:
# Example word pairs to compare by edit distance
word_pairs = [
    ("algorithm","logarithm")
]

# Table header followed by one row per pair.
print(f"{'Word 1':<12} {'Word 2':<12} {'Edit Distance':<15} {'Interpretation'}")
print("-" * 60)
for first, second in word_pairs:
    distance = edit_distance(first, second)
    # Distance bands: up to 1 -> "very similar", 2 -> "similar", else "different".
    if distance <= 1:
        label = "very similar"
    elif distance <= 2:
        label = "similar"
    else:
        label = "different"
    print(f"{first:<12} {second:<12} {distance:<15} {label}")

Word 1       Word 2       Edit Distance   Interpretation
------------------------------------------------------------
algorithm    logarithm    3               different


Question 3:

In [None]:
# Vocabulary (ordered): position i of every BoW vector counts vocabulary[i].
vocabulary = [
    "wireless", "noise", "cancelling", "headphones",
    "with", "active", "cancellation", "wired",
    "over-ear", "studio", "earbuds", "reduction",
]

# Query and documents
q_text = "wireless noise cancelling headphones"
d1_text = "wireless headphones with active noise cancellation"
d2_text = "wired over-ear studio headphones"
d3_text = "wireless earbuds with noise reduction"

documents = [d1_text, d2_text, d3_text]
documents_names = ["d1", "d2", "d3"]
# Look up a document's text by its short name.
docs = {name: text for name, text in zip(documents_names, documents)}

query = q_text

# Function to convert text to a bag-of-words (BoW) vector

def bow_vector(text, vocabulary):
    """Return the bag-of-words count vector of ``text`` over ``vocabulary``.

    The text is lowercased and split on whitespace; punctuation is not
    stripped. Vocabulary terms are compared exactly as given, so they
    should already be lowercase.

    Args:
        text: The string to vectorize.
        vocabulary: Ordered sequence of terms; its order fixes the vector layout.

    Returns:
        A list of ints, one per vocabulary term, giving how many times that
        term occurs in ``text`` (0 when absent).
    """
    # Count every token once instead of rescanning the token list for each
    # vocabulary term; a Counter returns 0 for terms it has not seen.
    term_counts = Counter(text.lower().split())
    return [term_counts[term] for term in vocabulary]

# Create the BoW vectors for the query and for every document
import numpy as np

q_vector = bow_vector(q_text, vocabulary)
d_vectors = [bow_vector(text, vocabulary) for text in documents]

print("Query vector:", q_vector)
for doc_name, doc_vector in zip(documents_names, d_vectors):
    print(f"{doc_name} —> {doc_vector}")

# Compute dot-product similarity between the query and each document
scores = [np.dot(q_vector, doc_vector) for doc_vector in d_vectors]

print("\nSimilarity Scores:")
for doc_name, score in zip(documents_names, scores):
    print(f"{doc_name} —> {score}")

# np.argmax gives the position of the highest score (the first one on ties).
best_match = documents_names[np.argmax(scores)]
print("\nBest Matching Document:", best_match)
print("Best Match Text:", docs[best_match])

# AI Resources: Microsoft Copilot and Gemini


Query vector: [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
d1 —> [1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0]
d2 —> [0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0]
d3 —> [1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1]

Similarity Scores:
d1 —> 3
d2 —> 1
d3 —> 2

Best Matching Document: d1
Best Match Text: wireless headphones with active noise cancellation
