In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import label_binarize
import numpy as np
import pandas as pd
from transformers import pipeline
from sklearn.feature_selection import f_classif
import warnings
warnings.filterwarnings('ignore')

# Load Dataset

In [7]:
# Read the test split from disk and preview its first rows.
TEST_DATA_PATH = "final_data/test_dataset.csv"
df = pd.read_csv(TEST_DATA_PATH)
df.head()

Unnamed: 0,review_id,user_id,user_name,time,rating,review,gmap_id,business_name,latitude,longitude,...,review_length_words,review_embedding,business_desc_embedding,name_validity,review_similarity,category_similarity,avg_review_rating_by_user,std_review_rating_by_user,posts_per_year,count_significantly_different_ratings
0,2,1.06e+20,Steven DeRyck [Staff],2018-10-20 01:46:40,4.0,"As previous reviews have stated, two small pie...",0x80c8c415f0a42c77:0x55c554fdc4ad8b9c,Carnegie Deli,36.120556,-115.173611,...,39,[-3.98575701e-02 4.56756018e-02 3.53269391e-...,[-4.08036783e-02 2.61357054e-02 3.17493156e-...,True,0.0,0.0,4.155598,1.368606,37.642857,147.0
1,8,1.04e+20,Anthony V,2019-10-02 07:06:40,5.0,Good fresh food... always clean and employees ...,0x80c8be54c348aa77:0xa367ab6e87cde35f,MTO Cafe,36.150923,-115.332874,...,15,[-7.99094662e-02 5.03363349e-02 9.46498215e-...,[ 6.66201115e-02 -6.61623925e-02 -3.28047089e-...,True,0.0,0.0,4.166031,1.300504,32.75,140.0
2,21,1.06e+20,Bobbie Broberg,2020-01-26 00:53:20,5.0,The soups are delicious the desserts are fanta...,0x80c8c022b77665e3:0xa79cb8b785dbcc80,Grape Vine Cafe,36.194653,-115.258707,...,31,[-9.30541828e-02 1.16505772e-02 4.51557487e-...,[-6.29015565e-02 2.39341287e-03 -2.90478971e-...,True,0.0,0.0,4.155598,1.368606,37.642857,147.0
3,23,1.03e+20,Susan Tomlinson,2019-06-08 13:20:00,5.0,Great selection great prices. Has a teacher di...,0x80c8c51f0cfc862f:0x650feb80fd6831f5,Zia Records (Eastern Las Vegas),36.11163,-115.11967,...,8,[-1.69371173e-03 -3.60540822e-02 3.35343517e-...,[-1.70421153e-02 -5.87579655e-03 -2.32800245e-...,True,0.0,0.0,4.198556,1.275842,46.166667,155.0
4,25,1.04e+20,Ana Velasquez,2019-10-02 07:06:40,4.0,Fun!!! Fun!!! Love Laughlin it's just perfect ...,0x80ce42fe267b7c5f:0xc9760accf78f9834,Don Laughlin's Riverside Resort Hotel and Casino,35.167263,-114.571827,...,29,[ 4.49643880e-02 3.50650656e-03 1.61046479e-...,[ 2.05648784e-02 6.40424192e-02 -1.20113783e-...,False,0.0,0.0,4.166031,1.300504,32.75,140.0


# Model 1: Rule-Based

All code in this section is taken from the notebook `step6a_rule_based.ipynb`.

In [8]:
# Every review starts with the default prediction "relevant"; the rules that
# follow in this cell overwrite it with a more specific label.
df['label_rules'] = "relevant"

# Detect ads
# Compiled once so the row-wise apply does not rebuild the patterns per review.
# A URL must begin at a word boundary with an explicit scheme ("http://" or
# "https://") or a "www." prefix. Without the boundary, elongated words such
# as "Awwww" contain "www" followed by another character and were flagged.
_AD_URL_RE = re.compile(r'\b(?:https?://|www\.)\S+', re.IGNORECASE)
# Phone-like run: optional "+", a digit, at least seven digits/separators,
# and a closing digit.
_AD_PHONE_RE = re.compile(r'\+?\d[\d\-\(\) ]{7,}\d')


def detect_ads_leave_as_is(row):
    """Return "ads" when the review text contains a URL or a phone number.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['review']`` (the review text) and
        ``row['label_rules']`` (the label assigned so far).

    Returns
    -------
    str
        ``"ads"`` if a URL or phone-like number is found, otherwise the
        row's current ``label_rules`` value unchanged.
    """
    text = row['review']
    current_label = row['label_rules']

    # Missing (NaN/None) or otherwise non-text reviews cannot be searched:
    # leave the label as is instead of raising inside re.search.
    if not isinstance(text, str):
        return current_label

    if _AD_URL_RE.search(text) or _AD_PHONE_RE.search(text):
        return "ads"

    return current_label  # leave as is if not detected

# Apply function
# Row-wise pass (axis=1): each row is handed to detect_ads_leave_as_is, whose
# return value ("ads" or the row's existing label) becomes the new label.
df['label_rules'] = df.apply(detect_ads_leave_as_is, axis=1)

# Detect rants
# Trigger phrases for the rant rule. The apostrophe in contractions is
# optional and may be straight or curly, so "havent", "haven't" and "haven’t"
# are all caught (the original "havent visited" pattern missed the
# apostrophised spellings).
RANT_PATTERNS = [
    r"\bnever been\b",
    r"\bnever visited\b",
    r"\bnot been\b",
    r"\bhaven['’]?t visited\b",
    r"\bdon['’]?t know (this|the) place\b",
    r"\bdon['’]?t know what (they|this) do(es)?\b",
    r"\bheard about this place\b"
]
rant_re = re.compile("|".join(RANT_PATTERNS), re.I)

# Define exceptions (phrases that imply they *have* visited)
RANT_EXCEPTIONS = [
    "in a while",
    "again",
    "before",
    "recently",
    "so decided to check it out",
    "so i went",
    "so i tried",
    "so i checked"
]
# Exceptions are matched as whole phrases. A plain substring test let
# "against" count as "again" and thereby suppressed the rant label.
rant_exception_re = re.compile(
    r"\b(?:" + "|".join(re.escape(phrase) for phrase in RANT_EXCEPTIONS) + r")\b",
    re.I,
)


def is_rant(text: str) -> bool:
    """Return True when ``text`` contains a rant trigger and no exception phrase.

    Parameters
    ----------
    text : str
        Review text. Empty strings and non-string values (None, NaN, numbers)
        yield False.

    Returns
    -------
    bool
        True if any pattern in ``RANT_PATTERNS`` matches and none of the
        ``RANT_EXCEPTIONS`` phrases occurs; matching is case-insensitive.
    """
    # Type check first so that non-string inputs are never truth-tested.
    if not isinstance(text, str) or not text:
        return False

    # Step 1: does it contain any rant trigger?
    if not rant_re.search(text):
        return False

    # Step 2: a trigger with no exception phrase => classify as rant
    return rant_exception_re.search(text) is None

# Flag each review with the is_rant text rule.
df["is_rant"] = df["review"].apply(is_rant)

# Only rows that still carry the default "relevant" label are relabelled.
flagged_and_default = df["is_rant"] & df["label_rules"].eq("relevant")
df.loc[flagged_and_default, "label_rules"] = "rant"

# Reviews flagged by the rule whose `label` column says "relevant".
# NOTE: this expression is not the last one in the cell, so its value is
# computed and then discarded rather than displayed.
df.loc[df["is_rant"] & df["label"].eq("relevant"), "review"]

# Second way to detect rants
# Step 1: take reviews still labelled "relevant" whose sentiment_label is "Negative"
mask = (df["label_rules"] == "relevant") & (df["sentiment_label"] == "Negative")
negative_relevant = df[mask].copy()

# Step 2: Compute 50th percentile of sentiment_score for this subset
score_threshold = negative_relevant["sentiment_score"].quantile(0.5)

# Step 3: Label rows scoring strictly above that median as 'rant'
negative_relevant.loc[negative_relevant["sentiment_score"] > score_threshold, "label_rules"] = "rant"

# Step 4: Write back only the column that changed. DataFrame.update() would
# instead rewrite every column of df from the copy, although only
# label_rules differs.
df.loc[negative_relevant.index, "label_rules"] = negative_relevant["label_rules"]

# Detect spam
# Cut-off: the 90th percentile of repetitiveness_score over the loaded frame.
threshold = df['repetitiveness_score'].quantile(0.90)

# Relabel as 'spam' the rows strictly above the cut-off that still carry the
# default 'relevant' label.
highly_repetitive = df['repetitiveness_score'].gt(threshold)
not_yet_relabelled = df['label_rules'].eq('relevant')
df.loc[highly_repetitive & not_yet_relabelled, 'label_rules'] = 'spam'

# Detect irrelevant 1
# Matches strings made up exclusively of emoji / pictographic code points.
#
# The previous last range, \U000024C2-\U0001F251, ran from U+24C2 all the way
# to U+1F251 and therefore also contained CJK ideographs, kana, Hangul and
# other scripts: a review written entirely in e.g. Chinese was treated as
# "emoji only". It is replaced by the symbol code points and blocks inside
# that span, listed explicitly below.
emoji_pattern = re.compile(
    r'^['
    r'\U0001F600-\U0001F64F'  # emoticons
    r'\U0001F300-\U0001F5FF'  # symbols & pictographs
    r'\U0001F680-\U0001F6FF'  # transport & map symbols
    r'\U0001F1E0-\U0001F1FF'  # flags
    r'\U00002700-\U000027BF'  # dingbats
    r'\U0001F900-\U0001F9FF'  # supplemental symbols & pictographs
    r'\U00002600-\U000026FF'  # miscellaneous symbols
    r'\U00002B00-\U00002BFF'  # miscellaneous symbols & arrows
    r'\U0000200D'              # zero-width joiner
    r'\U0000FE0F'              # variation selector-16 (emoji presentation)
    r'\U000024C2'              # circled latin capital letter M
    r'\U000025A0-\U000025FF'  # geometric shapes
    r'\U00002934-\U00002935'  # curved arrows
    r'\U00003030\U0000303D\U00003297\U00003299'  # wavy dash, part alternation mark, circled ideographs
    r'\U0001F000-\U0001F251'  # mahjong/domino/cards, enclosed alphanumerics & ideographs
    r']+$'
)

# Function to check if review has only emojis
def is_emoji_only(text):
    """Return True when ``text``, stripped of surrounding whitespace, is only emoji.

    Parameters
    ----------
    text : object
        Review text. Anything that is not a ``str`` (None, NaN, numbers)
        returns False instead of raising on ``.strip()``.

    Returns
    -------
    bool
        True if every character of the stripped text belongs to
        ``emoji_pattern``'s character class and the text is non-empty.
    """
    if not isinstance(text, str):
        return False
    return emoji_pattern.fullmatch(text.strip()) is not None

# Relabel as 'irrelevant' the reviews that consist solely of emoji and still
# carry the default 'relevant' label.
emoji_only_rows = df['review'].apply(is_emoji_only)
default_label_rows = df['label_rules'] == 'relevant'
df.loc[emoji_only_rows & default_label_rows, 'label_rules'] = 'irrelevant'

# Detect irrelevant 2
def parse_embedding(s):
    """Convert a stringified vector such as ``"[0.1 -0.2 3e-2]"`` to a float array.

    Parameters
    ----------
    s : str or numpy.ndarray
        Whitespace-separated numbers wrapped in square brackets. A value that
        is already an ``ndarray`` is returned unchanged, so applying the
        parser a second time (e.g. re-running the cell) does not fail on
        ``.strip``.

    Returns
    -------
    numpy.ndarray
        1-D array of floats parsed from the text, or ``s`` itself when it is
        already an array.
    """
    if isinstance(s, np.ndarray):
        return s
    # Remove brackets and turn into numpy array of floats
    return np.fromstring(s.strip("[]"), sep=" ")

# Parse both stringified embedding columns, replacing each column in place.
for embedding_col in ("business_desc_embedding", "review_embedding"):
    df[embedding_col] = df[embedding_col].apply(parse_embedding)

# Cosine similarity function
def cosine_sim(a, b):
    """Return the dot product of ``a`` and ``b`` divided by the product of their norms.

    Norms are computed with ``np.linalg.norm`` using its default settings.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

# Compute cosine similarity between each row's business-description embedding
# and its review embedding
df["cosine_similarity"] = df.apply(
    lambda row: cosine_sim(row["business_desc_embedding"], row["review_embedding"]),
    axis=1
)

# Similarity cut-off below which a still-'relevant' review is relabelled.
# The earlier comment on this rule quoted 0.23, but the value the code has
# always applied is 0.05; it is named here so comment and code cannot drift.
COSINE_IRRELEVANT_THRESHOLD = 0.05

# Filter rows where label_rules is 'relevant' and cosine_similarity is below the cut-off
mask = (df["label_rules"] == "relevant") & (df["cosine_similarity"] < COSINE_IRRELEVANT_THRESHOLD)

# Replace label with 'irrelevant' for those rows
df.loc[mask, "label_rules"] = "irrelevant"

# Check changes (mask was computed before the relabel, so this selects the
# rows that were just changed)
df.loc[mask, ["label_rules", "cosine_similarity", "label"]].head()

# Detect irrelevant 3
# Reviews still labelled "relevant" whose sentiment_label is "Negative"
mask = (df["label_rules"] == "relevant") & (df["sentiment_label"] == "Negative")
negative_relevant = df[mask].copy()

# Median sentiment_score of that subset
score_threshold = negative_relevant["sentiment_score"].quantile(0.5)

# Rows scoring strictly below the median are relabelled "irrelevant"
negative_relevant.loc[negative_relevant["sentiment_score"] < score_threshold, "label_rules"] = "irrelevant"

# Write back only the column that changed. DataFrame.update() would instead
# rewrite every column of df from the copy (including the parsed embedding
# arrays), although only label_rules differs.
df.loc[negative_relevant.index, "label_rules"] = negative_relevant["label_rules"]

# Output Test Results

In [9]:
# Keep the review id, the `label` column and the rule-based label, renaming
# the latter two to true_label / pred.
final_rule_based = (
    df[['review_id', 'label', 'label_rules']]
    .copy()
    .rename(columns={'label': 'true_label', 'label_rules': 'pred'})
)

# One 0/1 indicator column per predicted class (integers rather than booleans)
one_hot = pd.get_dummies(final_rule_based['pred'], dtype=int)

# Append the indicator columns to the right of the prediction table
final_rule_based = pd.concat([final_rule_based, one_hot], axis=1)

# Model 2: DNN