## Overview
1. Randomly sample across 3-5 states (e.g., CA, TX, NY, IL, FL) to capture diverse review patterns (urban vs rural, business types, language usage). We sample about 80k-100k non-empty reviews for investigation due to time constraints.

2. Sample 500-1000 reviews manually across different states and business types. Aim to balance the noise and non-noise review data for best performance.

3. We deploy a lightweight LLM for pseudo ground-truth labelling.

4. Perform resampling on the labelled data to prepare for model training.

The Google Drive mounting step below is optional: delete it, or replace it with a step that sets the working directory to wherever your data files are stored.

In [None]:
# # Mount Gdrive
# from google.colab import drive
# drive.mount('/content/drive')
# import os

# os.chdir('/content/drive/My Drive/TikTok Hackathon/data gzip')

Mounted at /content/drive


## Data Collection

The dataset used for this project is the **Google Local Reviews dataset**. Due to its gigantic volume, we performed random sampling to select a small dataset from it.



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [None]:
# Per-state business metadata files (gzip-compressed; read below as JSON Lines).
meta_California = 'meta-California.json.gz'
meta_Illinois = 'meta-Illinois.json.gz'
# NOTE(review): the variable name is misspelled ("Louisana"); the file name is
# spelled correctly. The name is kept as-is because other cells may use it.
meta_Louisana = 'meta-Louisiana.json.gz'
meta_Texas = 'meta-Texas.json.gz'
meta_NewYork = 'meta-New_York.json.gz'

# Per-state review files (gzip-compressed; read below one JSON record per line).
# NOTE(review): the "_10" suffix presumably marks a reduced subset — confirm.
review_California = 'review-California_10.json.gz'
review_Illinois = 'review-Illinois_10.json.gz'
review_Louisana = 'review-Louisiana_10.json.gz'
review_NewYork = 'review-New_York_10.json.gz'
review_Texas = 'review-Texas_10.json.gz'

Sanity check: load and inspect the New York metadata.

In [None]:
# Load the New York metadata (gzip-compressed, one JSON record per line) into
# a DataFrame. This binds the module-level name `df`.
df = pd.read_json(meta_NewYork, compression='gzip', lines=True)

# Preview the first few rows
print(df.head())

We randomly sample 0.1% from the entire dataset.



In [None]:
import json
import random

# Input / output files
input_file = "dataset.json" #modify path
output_file = "sampled_dataset.json"

# Fraction of records to keep: 0.001 == 0.1% of the dataset.
SAMPLE_FRACTION = 0.001

# Load JSON data. The encoding is explicit so that review text containing
# non-ASCII characters is read the same way on every platform.
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Make sure data is a list
if not isinstance(data, list):
    raise ValueError("JSON root must be a list of records.")

# Keep at least one record for small datasets, but never ask for more records
# than exist: random.sample raises ValueError when the sample size exceeds the
# population, which happened for an empty list.
sample_size = min(len(data), max(1, int(SAMPLE_FRACTION * len(data))))

# Randomly sample 0.1% of the records (without replacement)
sampled_data = random.sample(data, sample_size)

# Save to new JSON file
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(sampled_data, f, indent=2)

print(f"Sampled {sample_size} records out of {len(data)} into {output_file}")


### Util: chunked gzip reader

In [None]:
import pandas as pd
import gzip
import json

def gz_to_df(input_path):
    """Load a whole gzip-compressed JSON Lines file into one DataFrame.

    Every non-blank line of the file is treated as one JSON document. The
    lines are joined into a single JSON array and parsed by pandas.

    Args:
        input_path: Path to the file; must end with ``.json.gz``.

    Returns:
        A DataFrame with one row per JSON line.

    Raises:
        ValueError: If ``input_path`` does not end with ``.json.gz``.
    """
    # Local import so this helper does not rely on an extra file-level import.
    from io import StringIO

    # Fail with a clear message instead of hitting an unbound local below;
    # this matches the check done by gz_to_df_chunks.
    if not input_path.endswith('.json.gz'):
        raise ValueError("Input file must be a .json.gz file")

    with gzip.open(input_path, 'rt', encoding='utf-8') as f:
        # Blank lines would turn into empty array elements and break parsing.
        lines = [stripped for stripped in (line.strip() for line in f) if stripped]

    data_json_str = "[" + ','.join(lines) + "]"
    # Passing a literal JSON string to read_json is deprecated in pandas;
    # wrapping it in a file-like object is the supported form.
    return pd.read_json(StringIO(data_json_str))

def gz_to_df_chunks(input_path, chunk_size=100, threshold=5000):
    """Lazily read a gzip-compressed JSON Lines file as DataFrame chunks.

    This is a generator, so nothing (including the first log line and the
    argument checks) runs until iteration starts.

    Args:
        input_path: Path to the file; must end with ``.json.gz``.
        chunk_size: Number of records per yielded DataFrame.
        threshold: Record cap. Reading stops once at least this many records
            have been yielded in full chunks, so the total may overshoot by
            less than one chunk when ``threshold`` is not a multiple of
            ``chunk_size``.

    Yields:
        DataFrames of ``chunk_size`` rows, plus one shorter final DataFrame
        when the file ends before the cap is reached.

    Raises:
        ValueError: If the path does not end with ``.json.gz`` or
            ``chunk_size`` is not positive.
    """
    print("reading json in chunks")
    if not input_path.endswith('.json.gz'):
        raise ValueError("Input file must be a .json.gz file")
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    records = []
    num_chunks = 0
    num_records = 0  # records already handed to the caller in full chunks
    with gzip.open(input_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # Count records rather than comparing a chunk count with
            # threshold / chunk_size: the old comparison let one extra chunk
            # through before stopping.
            if num_records >= threshold:
                break
            line = line.strip()
            if not line:
                # json.loads would fail on a blank line.
                continue
            records.append(json.loads(line))
            if len(records) == chunk_size:
                num_chunks += 1
                num_records += len(records)
                print(f"producing chunk {num_chunks}")
                yield pd.DataFrame(records)
                records = []
        # Yield any remaining records as the last chunk. After a cap-triggered
        # break the buffer is always empty, so this only fires at end of file.
        if records:
            print("producing final chunk")
            yield pd.DataFrame(records)

## Data cleaning and combining
Combining review data and metadata, joining on `gmap_id`

In [None]:
def clean_combine_chunk(review_df, meta_df):
    """Clean a chunk of reviews and attach business metadata to each one.

    Steps: drop reviews whose text is null or blank, keep only the rating,
    text and gmap_id columns, collapse line breaks to spaces, replace
    Google-translated reviews with their translated part, then inner-join
    with the de-duplicated metadata on ``gmap_id``.

    Args:
        review_df: Reviews with at least 'rating', 'text' and 'gmap_id'.
        meta_df: Metadata with at least 'gmap_id', 'name', 'category',
            'description' and 'avg_rating'.

    Returns:
        A new DataFrame with the review columns followed by the metadata
        columns. Reviews with no matching metadata row are dropped by the
        inner join. ``review_df`` itself is not modified.
    """
    meta_columns = ['gmap_id', 'name', 'category', 'description', 'avg_rating']
    # Keep one metadata row per business so the join cannot multiply reviews.
    cleaned_meta = meta_df[meta_columns].drop_duplicates(subset="gmap_id", keep="first")

    review_columns = ['rating', 'text', 'gmap_id']
    # Drop rows with null or empty or blank 'text' field
    has_text = review_df['text'].notnull() & (review_df['text'].str.strip() != '')
    # .copy() makes the column assignments below act on an independent frame
    # instead of a slice of the caller's data.
    cleaned_review = review_df.loc[has_text, review_columns].copy()
    # The count used to be passed as a set literal and printed as "{10}".
    print(f"number of non-empty reviews: {len(cleaned_review)}")
    print(f"cleaned review size: {len(cleaned_review)}")

    # Remove newline symbols
    cleaned_review["text"] = cleaned_review["text"].str.replace(r"[\r\n]+", " ", regex=True)
    # Extract translated text; rows that do not match keep their own text.
    pattern = r"\(Translated by Google\) (.*?) \(Original\)"
    translated = cleaned_review["text"].str.extract(pattern)[0]
    cleaned_review["text"] = translated.fillna(cleaned_review["text"])

    combined_df = pd.merge(cleaned_review, cleaned_meta, on='gmap_id', how='inner')
    print(f"combined df size: {len(combined_df)}")
    return combined_df

def clean_chunk(review_df):
    """Drop reviews without text and keep only the columns used for labelling.

    Args:
        review_df: Reviews with at least 'rating', 'text' and 'gmap_id'.

    Returns:
        A new DataFrame with columns 'rating', 'text' and 'gmap_id',
        restricted to rows whose text is neither null nor blank.
    """
    columns_to_keep = ['rating', 'text', 'gmap_id']
    # Drop rows with null or empty or blank 'text' field
    has_text = review_df['text'].notnull() & (review_df['text'].str.strip() != '')
    # .copy() returns an independent frame, so later edits by the caller do
    # not touch (or warn about) a slice of the input.
    cleaned_review = review_df.loc[has_text, columns_to_keep].copy()
    # The count used to be passed as a set literal and printed as "{51}".
    print(f"number of non-empty reviews: {len(cleaned_review)}")
    return cleaned_review

In [None]:
# Smoke test: load all New York metadata, then clean and merge the first
# chunk of New York reviews.
meta_df = gz_to_df(meta_NewYork)
# chunk_size=10 with threshold=1 makes the generator stop after its first
# 10-record chunk, so the loop body below runs once.
chunks = gz_to_df_chunks(review_NewYork, chunk_size=10, threshold=1)
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}")
    cleaned_chunk = clean_combine_chunk(chunk, meta_df)
    # Prints the whole merged DataFrame for visual inspection.
    print(f"Chunk after: {cleaned_chunk}")

  data_df = pd.read_json(data_json_str)


reading json in chunks
producing chunk 1
Processing chunk 1
number of non-empty reviews: {10}
cleaned review size: 10
combined df size: 10
Chunk after:    rating                                               text  \
0       5  I'm late to posting this but this store especi...   
1       1  Very dissatisfied I did not get my phone the p...   
2       5  Excellent very well done with professional car...   
3       5  Basing my review strictly on the service I rec...   
4       1  Bad! Disorganized. I'm being totally honest. I...   
5       1  Worse customer ever ! More then 30min to make ...   
6       5  Excellent very well done with professional car...   
7       1  Worse customer ever ! More then 30min to make ...   
8       5                                    Very good store   
9       5  Thank you, prompt and knowledgeable attention....   

                               gmap_id      name  \
0  0x89c25fc9494dce47:0x6d63c807b59a55  T-Mobile   
1  0x89c25fc9494dce47:0x6d63c807b59a55 

## Pseudo Ground-truth Labelling
Perform zero-shot-classification on batched sample data, detecting noise reviews.

In [None]:
!pip install transformers
!pip install pyarrow


In [None]:
!pip install -U transformers torch accelerate


In [None]:
import pandas as pd
from transformers import pipeline
import signal

In [None]:
# Module-level tallies of accepted predictions per category. label_chunk and
# label_chunk_naiive update them through `global` declarations.
relevant = 0
ads = 0
no_experience = 0
irrelevant = 0

def label_chunk(df_chunk, classifier, candidate_labels, threshold):
    """Zero-shot label each review in a chunk, using business context.

    For every row, a prompt is built from the business name, category,
    description and average rating plus the review text and rating, and is
    passed to ``classifier``. A prediction is kept only when the score of the
    top label is strictly greater than ``threshold``.

    Side effect: increments the module-level counters ``relevant``, ``ads``,
    ``no_experience`` and ``irrelevant`` for kept predictions.

    Args:
        df_chunk: DataFrame with columns 'name', 'category', 'description',
            'avg_rating', 'text' and 'rating'.
        classifier: Callable invoked as
            ``classifier(text, candidate_labels=...)``; its result is indexed
            as ``result["labels"][0]`` and ``result["scores"][0]`` for the
            top label and its score.
        candidate_labels: Labels offered to the classifier.
        threshold: Minimum (exclusive) top-label score for keeping a row.

    Returns:
        A list of dicts with keys 'text', 'rating', 'category', 'score' and
        'label' ('relevant' or 'irrelevant').
    """
    global relevant, ads, no_experience, irrelevant
    print("processing batch")
    predictions = []

    # Iterate the chunk that was passed in. The earlier version read the
    # global `chunk` and the global `labels`, silently ignoring its arguments.
    for _, row in df_chunk.iterrows():
        # Combine business context, text and rating for model context
        input_text = f"Business Description: {row['name']}, {row['category']}, {row['description']}, average rating = {row['avg_rating']}. User's review: {row['text']}, rating = {row['rating']}/5."
        result = classifier(input_text, candidate_labels=candidate_labels)
        top_label = result["labels"][0]
        top_score = result["scores"][0]

        # Low-confidence predictions are neither counted nor kept.
        if top_score <= threshold:
            continue

        if top_label == "relevant review":
            relevant += 1
        elif top_label == "advertisement / spam":
            ads += 1
        elif top_label == "no first-hand experience":
            no_experience += 1
        elif top_label == "irrelevant / off-topic":
            irrelevant += 1

        predictions.append({
            "text": row["text"],
            "rating": row["rating"],
            "category": top_label,
            "score": top_score,
            # Binary target: anything that is not a relevant review is noise.
            "label": "relevant" if top_label == "relevant review" else "irrelevant"
        })
    return predictions

def label_chunk_naiive(df_chunk, classifier, candidate_labels, threshold):
    """Zero-shot label each review in a chunk from its text and rating only.

    For every row, a prompt is built from the review text and rating (no
    business context) and is passed to ``classifier``. A prediction is kept
    only when the score of the top label is strictly greater than
    ``threshold``.

    Side effect: increments the module-level counters ``relevant``, ``ads``,
    ``no_experience`` and ``irrelevant`` for kept predictions.

    Args:
        df_chunk: DataFrame with columns 'text' and 'rating'.
        classifier: Callable invoked as
            ``classifier(text, candidate_labels=...)``; its result is indexed
            as ``result["labels"][0]`` and ``result["scores"][0]`` for the
            top label and its score.
        candidate_labels: Labels offered to the classifier.
        threshold: Minimum (exclusive) top-label score for keeping a row.

    Returns:
        A list of dicts with keys 'text', 'rating', 'category', 'score' and
        'label' ('relevant' or 'irrelevant').
    """
    global relevant, ads, no_experience, irrelevant
    print("processing batch")
    predictions = []

    # Iterate the chunk that was passed in. The earlier version read the
    # global `chunk` and the global `labels`, silently ignoring its arguments.
    for _, row in df_chunk.iterrows():
        # Combine text + rating for model context
        input_text = f"User's review: {row['text']}, rating = {row['rating']}/5."
        result = classifier(input_text, candidate_labels=candidate_labels)
        top_label = result["labels"][0]
        top_score = result["scores"][0]

        # Low-confidence predictions are neither counted nor kept.
        if top_score <= threshold:
            continue

        if top_label == "relevant review":
            relevant += 1
        elif top_label == "advertisement / spam":
            ads += 1
        elif top_label == "no first-hand experience":
            no_experience += 1
        elif top_label == "irrelevant / off-topic":
            irrelevant += 1

        predictions.append({
            "text": row["text"],
            "rating": row["rating"],
            "category": top_label,
            "score": top_score,
            # Binary target: anything that is not a relevant review is noise.
            "label": "relevant" if top_label == "relevant review" else "irrelevant"
        })
    return predictions


In [None]:
# Open a parquet file to store results
# Destination path for the labelled results (written later with to_parquet)
results_file = "labeled_reviews.parquet"
results_df = pd.DataFrame()

chunk_size = 100
# NOTE(review): threshold_total is not referenced anywhere in the visible
# code; the record cap actually used is gz_to_df_chunks' `threshold` default.
threshold_total = 10  # Total number of reviews to process

print("loading df")
meta_df = gz_to_df(meta_NewYork)
# Lazy generator: no review data is read until a labelling loop iterates it.
chunks = gz_to_df_chunks(review_NewYork, chunk_size=chunk_size)

print("initializing classifier")
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# Candidate labels; the labelling functions compare the top label against
# these exact strings when updating the counters.
labels = ["relevant review", "advertisement / spam", "no first-hand experience", "irrelevant / off-topic"]

# Accumulates the prediction dicts returned by the labelling functions.
all_results = []
# Minimum (exclusive) top-label score for a prediction to be kept.
threshold = 0.5
# Reset the per-category tallies before a labelling run.
relevant = 0
ads = 0
no_experience = 0
irrelevant = 0

# Cooperative stop flag: the labelling loops check it after each chunk.
stop_requested = False


def handle_interrupt(signum, frame):
    """Signal handler that requests a graceful stop.

    Prints a notice and sets the module-level ``stop_requested`` flag. Both
    arguments are supplied by the ``signal`` machinery and are not used.
    """
    global stop_requested
    notice = "\nManual stop requested. Will finish current chunk and exit gracefully."
    print(notice)
    stop_requested = True



loading df


  data_df = pd.read_json(data_json_str)


initializing classifier


Device set to use cpu


### For running cleaned combined data

In [None]:
# Route Ctrl-C / kernel interrupt to handle_interrupt so the loop stops
# between chunks instead of being killed mid-chunk.
signal.signal(signal.SIGINT, handle_interrupt)

# NOTE: `chunks` is a generator. If another cell has already consumed it,
# re-create it before running this loop.
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}")
    chunk = clean_combine_chunk(chunk, meta_df)
    print(f"Chunk size after cleaning: {len(chunk)}")
    if not chunk.empty:
        predictions = label_chunk(chunk, classifier, labels, threshold)
        all_results.extend(predictions)
    if stop_requested:
        print("Stopping after current chunk.")
        break

# Convert results to DataFrame
results_df = pd.DataFrame(all_results)
if results_df.empty:
    # With no predictions there is no 'category' column, so the summary below
    # would raise KeyError; also avoid overwriting earlier results with an
    # empty file.
    print("No predictions were collected; nothing to save.")
else:
    # Save results to parquet file
    results_df.to_parquet(results_file, index=False)
    # Print number of each label
    print(results_df['category'].value_counts())

    # Show up to ten example rows per predicted category
    for label in results_df['category'].unique():
        print(f"\nLabel: {label}")
        print(results_df[results_df['category'] == label].head(10))

### For simple labeling

In [None]:
# Route Ctrl-C / kernel interrupt to handle_interrupt so the loop stops
# between chunks instead of being killed mid-chunk.
signal.signal(signal.SIGINT, handle_interrupt)

# NOTE: `chunks` is a generator and `all_results` is shared module state. If
# another labelling cell has already run, re-create both before this loop.
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}")
    chunk = clean_chunk(chunk)
    print(f"Chunk size after cleaning: {len(chunk)}")
    if not chunk.empty:
        predictions = label_chunk_naiive(chunk, classifier, labels, threshold)
        all_results.extend(predictions)
        # Running totals across all chunks processed so far
        print(f"relevant: {relevant}, ads: {ads}, no_experience: {no_experience}, irrelevant: {irrelevant}")
    if stop_requested:
        print("Stopping after current chunk.")
        break

# Convert results to DataFrame
results_df = pd.DataFrame(all_results)
if results_df.empty:
    # With no predictions there is no 'category' column, so the summary below
    # would raise KeyError; also avoid overwriting earlier results with an
    # empty file.
    print("No predictions were collected; nothing to save.")
else:
    # Save results to parquet file
    results_df.to_parquet(results_file, index=False)
    # Print number of each label
    print(results_df['category'].value_counts())

    # Show up to ten example rows per predicted category
    for label in results_df['category'].unique():
        print(f"\nLabel: {label}")
        print(results_df[results_df['category'] == label].head(10))

reading json in chunks
producing chunk 1
Processing chunk 1
number of non-empty reviews: {51}
Chunk size after cleaning: 51
processing batch
relevant: 49, ads: 0, no_experience: 1, irrelevant: 1
producing chunk 2
Processing chunk 2
number of non-empty reviews: {75}
Chunk size after cleaning: 75
processing batch
relevant: 123, ads: 0, no_experience: 2, irrelevant: 1
producing chunk 3
Processing chunk 3
number of non-empty reviews: {59}
Chunk size after cleaning: 59
processing batch
relevant: 182, ads: 0, no_experience: 2, irrelevant: 1
producing chunk 4
Processing chunk 4
number of non-empty reviews: {56}
Chunk size after cleaning: 56
processing batch
relevant: 238, ads: 0, no_experience: 2, irrelevant: 1
producing chunk 5
Processing chunk 5
number of non-empty reviews: {62}
Chunk size after cleaning: 62
processing batch
relevant: 297, ads: 0, no_experience: 4, irrelevant: 1
producing chunk 6
Processing chunk 6
number of non-empty reviews: {55}
Chunk size after cleaning: 55
processing b

### Import synthetic data for ads/spam and irrelevant/off-topic reviews

There are very few noisy reviews in the dataset, so we generate some synthetic data to balance it.

In [None]:
import pandas as pd
import numpy as np

# Hand-written examples of advertisement / spam "reviews".
ads_spam = [
    "Get 50% off designer sunglasses today! Visit www.discountsun.com now!",
    "Earn $1000 per day working from home. Click here for details.",
    "Congratulations! You’ve been selected for a free cruise. Claim now.",
    "Buy followers instantly and grow your profile fast.",
    "Lose weight in 2 weeks with this miracle pill. Order now!",
    "Click here to claim your free iPhone today!",
    "Don’t miss this investment opportunity. Double your money fast.",
    "Sign up for unlimited movie streaming. First month free!",
    "Exclusive deal! Buy one get one free at our online store.",
    "Visit www.hotdeals.com for daily discounts you can’t miss.",
    "Win big in our lottery. Just send your email to enter.",
    "Cheap prescription meds without a doctor visit. Shop now!",
    "Guaranteed credit approval. Apply online in 2 minutes!",
    "Limited time offer: free shipping on all products.",
    "Download this app to earn rewards instantly.",
    "Claim your cash prize now. Limited spots left!",
    "Upgrade your internet speed for just $9.99/month.",
    "Get rich quick with this proven business system.",
    "Free membership to our dating site for 7 days.",
    "Save 70% on electronics. Today only!",
    "Want clear skin? Try this new cream now.",
    "Hot singles in your area are waiting to chat.",
    "Protect your computer with our free antivirus download.",
    "Join our VIP club and get instant perks.",
    "Affordable travel packages available now. Book today!",
    "Work from anywhere and earn six figures annually.",
    "Claim your gift card before it expires.",
    "Unlock premium features with our special promo code.",
    "Congratulations! You are our lucky winner.",
    "Hurry, stocks running out fast. Buy before it’s gone!"
]

# Hand-written examples of irrelevant / off-topic "reviews".
# Named `irrelevant_texts` rather than `irrelevant` so it does not overwrite
# the integer counter `irrelevant` that the labelling functions increment.
irrelevant_texts = [
    "I just bought a new vacuum cleaner and it works great.",
    "The weather here has been crazy hot lately.",
    "I’m so excited for the next Marvel movie release.",
    "This phone case I ordered online is super durable.",
    "I’ve been baking bread every weekend—so relaxing.",
    "My dog learned a new trick yesterday!",
    "I stayed up late watching soccer last night.",
    "The traffic in my city is unbearable during rush hour.",
    "I think pineapple on pizza actually tastes amazing.",
    "I need recommendations for good sci-fi books.",
    "Just finished painting my living room blue.",
    "Does anyone else still play old Nintendo games?",
    "I went hiking in the mountains last weekend.",
    "Currently obsessed with this new coffee blend.",
    "I can’t believe how expensive gas has become.",
    "Got my car washed today and it looks brand new.",
    "Thinking of adopting another cat soon.",
    "This online class I’m taking is super helpful.",
    "I want to learn how to play the guitar.",
    "Been experimenting with photography—so fun!",
    "I’m rewatching my favorite TV series from childhood.",
    "Started going to the gym again after months.",
    "My laptop battery drains way too quickly.",
    "I’ve been gardening a lot this spring.",
    "My kid just started kindergarten today.",
    "I’m craving sushi right now.",
    "Learning a new language is harder than I expected.",
    "I can’t wait for my vacation next month.",
    "Been binge-watching documentaries on space.",
    "Trying out a new skincare routine this week."
]

# Build dataset: one record per text, tagged with its noise category.
data = [{"text": text, "category": "advertisement / spam"} for text in ads_spam]
data.extend({"text": text, "category": "irrelevant / off-topic"} for text in irrelevant_texts)

synthetic_df = pd.DataFrame(data)
# One random 1-5 star rating per synthetic row. The size must come from
# synthetic_df itself; the earlier code used `df`, an unrelated DataFrame.
synthetic_df["rating"] = np.random.randint(1, 6, size=len(synthetic_df))
# Both synthetic categories are noise, so the binary label is always "irrelevant".
synthetic_df["label"] = "irrelevant"

print(synthetic_df.head())
print(f"\nTotal rows: {len(synthetic_df)}")

                                                text  category
0  Get 50% off designer sunglasses today! Visit w...  ads_spam
1  Earn $1000 per day working from home. Click he...  ads_spam
2  Congratulations! You’ve been selected for a fr...  ads_spam
3  Buy followers instantly and grow your profile ...  ads_spam
4  Lose weight in 2 weeks with this miracle pill....  ads_spam

Total rows: 60


In [None]:
# Persist the synthetic noise examples as a parquet file, without the index.
synthetic_df.to_parquet("synthetic_ads_irrelevant.parquet", index=False)