### Load Dataset

In [4]:
from google.colab import files
uploaded = files.upload()

Saving GoogleReview_data_cleaned.csv to GoogleReview_data_cleaned.csv


In [5]:
import pandas as pd

# Load CSV file
df = pd.read_csv('GoogleReview_data_cleaned.csv')

# Display the first few rows
df.head()

Unnamed: 0,Author,Rating,Review,Restaurant,Location
0,Jia Pin Lee,4.0,Came here for the High Tea. Great service espe...,Cuisines Restaurant,Ipoh
1,Chui Yi Lum,2.0,"5 stars for the service, even though some of t...",Cuisines Restaurant,Ipoh
2,liezel wong,1.0,"Hi, thank you for your service. But! i feel so...",Cuisines Restaurant,Ipoh
3,Nazri Nor,1.0,I have the worse buffer dinner ever so far. Th...,Cuisines Restaurant,Ipoh
4,Fakru Imran's Channel,5.0,"That's are Known 5 Elmark "" 9H72 "" & KDK "" 3 K...",Cuisines Restaurant,Ipoh


---

# **1. Named Entity Recognition (NER)**

### Install spaCy and Download Model

In [2]:
!pip install -q spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m82.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


### Load Model & Run NER on Sample Reviews

In [7]:
import spacy
import pandas as pd

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Load the cleaned review dataset
df = pd.read_csv("GoogleReview_data_cleaned.csv")

# Drop empty reviews and select 5 random ones
sample_reviews = df['Review'].dropna().sample(5, random_state=42)

# Apply NER
for i, review in enumerate(sample_reviews):
    print(f"\nReview {i+1}:\n{review}")
    doc = nlp(review)
    for ent in doc.ents:
        print(f"  ➤ {ent.text} ({ent.label_})")



Review 1:
너무 너무 맛있는 한음식! Simply the best Korean restaurant I have been to, easily in Malaysia. They serve urban Korean food, therefore don’t expect to see bibimbap here. Fans of fried and crispy chicken like me will find this place remarkable. They
  ➤ Korean (NORP)
  ➤ Malaysia (GPE)
  ➤ Korean (NORP)
  ➤ n’t (GPE)

Review 2:
The food serving was fast during peak time. Good environment. Friendly and responsive service from the waiter and waitress. Food taste is good and the portion is quite big. It is a worthwhile experience.

Review 3:
Awesome place to chill out

Review 4:
Food was mediocre and small portion.Menu price is on higher side. The ambience was alright. Staff was not paying attention most of the time. need some improvements.

Review 5:
The pork ribs are good. They do have complimentary meal for kids (4 options to choose from). It wasn't crowded when I was there. Nice place to chill with friends. The staffs are friendly.
  ➤ 4 (CARDINAL)


---

# **2. Word Sense Disambiguation**

### Install Required Library

In [24]:
!pip install -q nltk pywsd

### Download NLTK Resources

In [26]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### Run WSD using Lesk Algorithm

In [31]:
from nltk.tokenize import sent_tokenize, word_tokenize
from pywsd.lesk import simple_lesk
import pandas as pd

# pick a long-enough review randomly
review = df[df['Review'].str.len() > 100]['Review'].sample(1).values[0]

print("Selected Review:\n", review)

# Apply Lesk for WSD
sentences = sent_tokenize(review)
for sent in sentences:
    print(f"\nSentence: {sent}")
    words = word_tokenize(sent)
    for word in words:
        synset = simple_lesk(sent, word)
        if synset:
            print(f"  ➤ '{word}' → {synset.definition()}")


Selected Review:
 The taste is getting milder. I still preferred the taste of olden Choon Hui’s Laksa. The taste was much stronger and more authentic Sarawak Laksa back then. Anyway, it’s still one of the best Laksa in town. Do also order toast bread and

Sentence: The taste is getting milder.
  ➤ 'taste' → a brief experience of something
  ➤ 'is' → have an existence, be extant
  ➤ 'getting' → undergo (as of injuries and illnesses)
  ➤ 'milder' → moderate in type or degree or effect or force; far from extreme

Sentence: I still preferred the taste of olden Choon Hui’s Laksa.
  ➤ 'I' → used of a single unit or thing; not two or more
  ➤ 'still' → make motionless
  ➤ 'preferred' → give preference to one creditor over another
  ➤ 'taste' → experience briefly
  ➤ 'olden' → long past
  ➤ 's' → an abundant tasteless odorless multivalent nonmetallic element; best known in yellow crystals; occurs in many sulphide and sulphate minerals and even in native form (especially in volcanic regions)

S

---
# **3. Sentence Sentiment Classification**

### Install Required Libraries

In [32]:
!pip install -q transformers torch nltk

### Load Model & Tokenizer

In [33]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Load the RoBERTa model fine-tuned for sentiment
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [38]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Run Sentiment Classification on a Review Sentence-by-Sentence

In [39]:
# Choose a sample review
review = df[df['Review'].str.len() > 100]['Review'].sample(1, random_state=123).values[0]
# print("\nSelected Review:\n", review)

# Tokenize into sentences
sentences = sent_tokenize(review)

# Predict sentiment for each sentence
labels = ['Negative', 'Neutral', 'Positive']
for sentence in sentences:
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)
        predicted = torch.argmax(probs, dim=1).item()
        print(f"\nSentence: {sentence}")
        print(f"  \n➤ Sentiment: {labels[predicted]} ({probs[0][predicted].item():.2f} confidence)")



Sentence: Its a place for those who want to taste the best briyani in calming environment with soft music.I'm loving it😍😍
  
➤ Sentiment: Positive (0.99 confidence)


---
# **4. Sentiment Lexicon Extraction**

### Setup (Install + Import)

In [42]:
!pip install -q nltk
import nltk
nltk.download('vader_lexicon')
nltk.download('punkt')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
import pandas as pd


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Load Review from Dataset & Run Lexicon Matching

In [46]:
# Select the same review used earlier (random_state=123)
review = df[df['Review'].str.len() > 100]['Review'].sample(1, random_state=123).values[0]
print("Selected Review:\n", review)

# Load VADER
sid = SentimentIntensityAnalyzer()

# Print sentiment lexicon scores for each word (if in VADER)
print("\nSentiment Lexicon Match:\n")
for word in word_tokenize(review):
    score = sid.lexicon.get(word.lower())
    if score:
        print(f"  ➤ {word}: {score:+.2f}")


Selected Review:
 Its a place for those who want to taste the best briyani in calming environment with soft music.I'm loving it😍😍

Sentiment Lexicon Match:

  ➤ want: +0.30
  ➤ best: +3.20
  ➤ calming: +1.70
  ➤ loving: +2.90


---
# **5. Polarity & Sentiment Strength**

In [49]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

# Same review as before
review = df[df['Review'].str.len() > 100]['Review'].sample(1, random_state=123).values[0]
print("Selected Review:\n", review)

# Analyze using VADER
sid = SentimentIntensityAnalyzer()
scores = sid.polarity_scores(review)

# Print all scores
print("\nVADER Sentiment Scores:")
for k, v in scores.items():
    print(f"  ➤ {k.capitalize()}: {v:.3f}")


Selected Review:
 Its a place for those who want to taste the best briyani in calming environment with soft music.I'm loving it😍😍

VADER Sentiment Scores:
  ➤ Neg: 0.000
  ➤ Neu: 0.554
  ➤ Pos: 0.446
  ➤ Compound: 0.902


### Process All Reviews

In [51]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [53]:
# Initialize VADER
sid = SentimentIntensityAnalyzer()

# Function to apply VADER to one review
def analyze_sentiment(text):
    scores = sid.polarity_scores(str(text))
    return pd.Series({
        'compound': scores['compound'],
        'pos_strength': scores['pos'],
        'neu_strength': scores['neu'],
        'neg_strength': scores['neg'],
        'label': 'Positive' if scores['compound'] > 0.05 else
                 'Negative' if scores['compound'] < -0.05 else 'Neutral'
    })

# Apply to all reviews
sentiment_df = df['Review'].apply(analyze_sentiment)

# Combine with original dataframe
df_sentiment = pd.concat([df, sentiment_df], axis=1)

# Preview result
df_sentiment.head()


Unnamed: 0,Author,Rating,Review,Restaurant,Location,compound,pos_strength,neu_strength,neg_strength,label
0,Jia Pin Lee,4.0,Came here for the High Tea. Great service espe...,Cuisines Restaurant,Ipoh,0.9313,0.422,0.578,0.0,Positive
1,Chui Yi Lum,2.0,"5 stars for the service, even though some of t...",Cuisines Restaurant,Ipoh,0.6542,0.143,0.857,0.0,Positive
2,liezel wong,1.0,"Hi, thank you for your service. But! i feel so...",Cuisines Restaurant,Ipoh,0.9283,0.285,0.674,0.041,Positive
3,Nazri Nor,1.0,I have the worse buffer dinner ever so far. Th...,Cuisines Restaurant,Ipoh,-0.8977,0.0,0.735,0.265,Negative
4,Fakru Imran's Channel,5.0,"That's are Known 5 Elmark "" 9H72 "" & KDK "" 3 K...",Cuisines Restaurant,Ipoh,0.6249,0.203,0.797,0.0,Positive
