### Load and Preview the Dataset

In [2]:
from google.colab import files
uploaded = files.upload()

Saving GoogleReview_data_cleaned.csv to GoogleReview_data_cleaned.csv


In [3]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/GoogleReview_data_cleaned.csv')

# Preview the dataset
df.head()


Unnamed: 0,Author,Rating,Review,Restaurant,Location
0,Jia Pin Lee,4.0,Came here for the High Tea. Great service espe...,Cuisines Restaurant,Ipoh
1,Chui Yi Lum,2.0,"5 stars for the service, even though some of t...",Cuisines Restaurant,Ipoh
2,liezel wong,1.0,"Hi, thank you for your service. But! i feel so...",Cuisines Restaurant,Ipoh
3,Nazri Nor,1.0,I have the worse buffer dinner ever so far. Th...,Cuisines Restaurant,Ipoh
4,Fakru Imran's Channel,5.0,"That's are Known 5 Elmark "" 9H72 "" & KDK "" 3 K...",Cuisines Restaurant,Ipoh


### **A. Dependency-Based Parsing for Sentiment Analysis**

In [4]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")


In [9]:
def extract_sentiment_phrases(text):
    doc = nlp(text)
    pairs = []
    for token in doc:
        if token.dep_ == "amod" and token.head.pos_ == "NOUN":
            pairs.append((token.text, token.head.text))
    return pairs


In [10]:
sample_df = df.head(50).copy()
sample_df['Opinion Phrases'] = sample_df['Review'].apply(lambda x: extract_sentiment_phrases(str(x)))
sample_df[['Review', 'Opinion Phrases']]

Unnamed: 0,Review,Opinion Phrases
0,Came here for the High Tea. Great service espe...,"[(Great, service), (great, service)]"
1,"5 stars for the service, even though some of t...","[(overall, experience)]"
2,"Hi, thank you for your service. But! i feel so...","[(more, love), (next, day)]"
3,I have the worse buffer dinner ever so far. Th...,"[(worse, dinner), (salty, dishes), (bbq, meat)..."
4,"That's are Known 5 Elmark "" 9H72 "" & KDK "" 3 K...","[(Known, restaurant), (nice, restaurant), (upc..."
5,I just came back from there. 2 adults and 4 yo...,"[(young, children), (exclusive, experience), (..."
6,Restaurant looks nice but taste is bad. I had ...,"[(few, variety)]"
7,"Pros: ambience is great with lake view, good a...","[(good, restaurant), (due, weather), (hot, wai..."
8,We went to this place after reviews on Tripadv...,"[(disappointing, experience)]"
9,"the restaurant is located inside the hotel, th...","[(many, dishes)]"


### **B. Unsupervised Machine Learning**

#### B.1: Text Vectorization

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use a sample of reviews (or full if you're ready)
texts = sample_df['Review'].astype(str).tolist()

vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X = vectorizer.fit_transform(texts)

#### B.2: Apply K-Means Clustering

In [12]:
from sklearn.cluster import KMeans

k = 3  # You can experiment with 2–5
kmeans = KMeans(n_clusters=k, random_state=42)
sample_df['Cluster'] = kmeans.fit_predict(X)

#### B.3: View Clustered Output

In [13]:
sample_df[['Review', 'Cluster']].head(10)

Unnamed: 0,Review,Cluster
0,Came here for the High Tea. Great service espe...,1
1,"5 stars for the service, even though some of t...",1
2,"Hi, thank you for your service. But! i feel so...",2
3,I have the worse buffer dinner ever so far. Th...,0
4,"That's are Known 5 Elmark "" 9H72 "" & KDK "" 3 K...",2
5,I just came back from there. 2 adults and 4 yo...,0
6,Restaurant looks nice but taste is bad. I had ...,2
7,"Pros: ambience is great with lake view, good a...",2
8,We went to this place after reviews on Tripadv...,1
9,"the restaurant is located inside the hotel, th...",0


In [14]:
for i in range(k):
    print(f"\nCluster {i} samples:")
    print(sample_df[sample_df['Cluster'] == i]['Review'].head(3).to_string(index=False))


Cluster 0 samples:
I have the worse buffer dinner ever so far. The...
I just came back from there. 2 adults and 4 you...
the restaurant is located inside the hotel, the...

Cluster 1 samples:
Came here for the High Tea. Great service espec...
5 stars for the service, even though some of th...
We went to this place after reviews on Tripadvi...

Cluster 2 samples:
Hi, thank you for your service. But! i feel so ...
That's are Known 5 Elmark " 9H72 " & KDK " 3 K1...
Restaurant looks nice but taste is bad. I had B...


### **C. Supervised Machine Learning**

#### C.1 Prepare Data

##### Create Sentiment Labels from Rating

In [18]:
# Remove neutral (3-star) reviews and make a copy to avoid warning
df = df[df['Rating'] != 3].copy()

# Assign sentiment
df.loc[:, 'SentimentEncoded'] = df['Rating'].apply(lambda x: 1 if x >= 4 else 0)

#### C.2 Train-Test Split and Classification

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Vectorize text
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X = vectorizer.fit_transform(df['Review'].astype(str))
y = df['SentimentEncoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["Negative", "Positive"]))

Accuracy: 0.9327673593512418

Classification Report:
               precision    recall  f1-score   support

    Negative       0.80      0.51      0.62      4273
    Positive       0.94      0.98      0.96     35187

    accuracy                           0.93     39460
   macro avg       0.87      0.75      0.79     39460
weighted avg       0.93      0.93      0.93     39460

