<a href="https://colab.research.google.com/github/nantmoe-theingi/airbnb-nz-deception-sentiment/blob/main/notebooks/04_predict_airbnb_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import torch
print("CUDA available:", torch.cuda.is_available())

CUDA available: True


In [7]:
GITHUB_USER = "nantmoe-theingi"
REPO_NAME   = "airbnb-nz-deception-sentiment"
BRANCH      = "main"


REPO_URL = f"https://github.com/nantmoe-theingi/airbnb-nz-deception-sentiment.git"
PROJECT_DRIVE_DIR = "/content/drive/MyDrive/Colab Notebooks/airbnb_nz_deception_sentiment"

In [8]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.makedirs(PROJECT_DRIVE_DIR, exist_ok=True)
print("Drive project folder:", PROJECT_DRIVE_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive project folder: /content/drive/MyDrive/Colab Notebooks/airbnb_nz_deception_sentiment


In [9]:
pwd

'/content'

In [10]:
cd /content/drive/MyDrive/Colab\ Notebooks/airbnb_nz_deception_sentiment

/content/drive/MyDrive/Colab Notebooks/airbnb_nz_deception_sentiment


In [11]:
pwd

'/content/drive/MyDrive/Colab Notebooks/airbnb_nz_deception_sentiment'

In [12]:
# Step 1: DATA INTAKE & SCHEMA AUDIT
import pandas as pd
import numpy as np

REVIEWS_CSV  = "data/airbnb_nz_reviews.csv"
LISTINGS_CSV = "data/airbnb_nz_listings.csv"

# 1) Load the dataset both as strings
reviews  = pd.read_csv(REVIEWS_CSV, dtype=str, low_memory=False)
listings = pd.read_csv(LISTINGS_CSV, dtype=str, low_memory=False)

print("Loaded.")
print("reviews shape:", reviews.shape)
print("listings shape:", listings.shape)

Loaded.
reviews shape: (2951093, 6)
listings shape: (46645, 85)


In [13]:
# 2) Show column sets
print("\nreviews columns:", list(reviews.columns))
print("\nlistings columns:", list(listings.columns))


reviews columns: ['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments']

listings columns: ['id', 'listing_url', 'scrape_id', 'last_searched', 'last_scraped', 'source', 'name', 'description', 'neighborhood_overview', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability'

In [14]:
# 3) Basic integrity checks on key columns
must_have_reviews = ["listing_id","id","date","comments"]
missing_cols_r = [c for c in must_have_reviews if c not in reviews.columns]
print("\nMissing essential review columns:", missing_cols_r)

must_have_listings = ["id","listing_url","price","property_type","room_type"]
missing_cols_l = [c for c in must_have_listings if c not in listings.columns]
print("Missing essential listing columns:", missing_cols_l)


Missing essential review columns: []
Missing essential listing columns: []


In [15]:
# 4) Quick data quality snapshots
# reviews
r_nonempty_text = reviews["comments"].notna().sum() if "comments" in reviews else np.nan
r_min_date = pd.to_datetime(reviews["date"], errors="coerce", dayfirst=True).min()
r_max_date = pd.to_datetime(reviews["date"], errors="coerce", dayfirst=True).max()
r_unique_listings = reviews["listing_id"].nunique() if "listing_id" in reviews else np.nan

print("\n--- Reviews snapshot ---")
print("Total reviews:", len(reviews))
print("Reviews with non-empty 'comments':", r_nonempty_text)
print("Unique listings referenced:", r_unique_listings)
print("Date range:", r_min_date, "to", r_max_date)

# listings
l_unique_ids = listings["id"].nunique() if "id" in listings else np.nan
print("\n--- Listings snapshot ---")
print("Total listings:", len(listings))
print("Unique listing ids:", l_unique_ids)


--- Reviews snapshot ---
Total reviews: 2951093
Reviews with non-empty 'comments': 2950839
Unique listings referenced: 41597
Date range: 2011-03-05 00:00:00 to 2025-12-07 00:00:00

--- Listings snapshot ---
Total listings: 46645
Unique listing ids: 46645


In [16]:
# 5) Null rates on a few critical fields
def null_rate(s):
    return float(s.isna().mean()*100)

summary_nulls = {}
for col in ["listing_id","id","date","comments","reviewer_id","reviewer_name"]:
    if col in reviews:
        summary_nulls[f"reviews.{col}"] = round(null_rate(reviews[col]), 2)
for col in ["id","price","property_type","room_type","latitude","longitude",
            "host_is_superhost","host_acceptance_rate","host_response_rate","amenities"]:
    if col in listings:
        summary_nulls[f"listings.{col}"] = round(null_rate(listings[col]), 2)

print("\n--- Null percentage (selected columns) ---")
for k,v in summary_nulls.items():
    print(f"{k}: {v}%")


--- Null percentage (selected columns) ---
reviews.listing_id: 0.0%
reviews.id: 0.0%
reviews.date: 0.0%
reviews.comments: 0.01%
reviews.reviewer_id: 0.0%
reviews.reviewer_name: 0.0%
listings.id: 0.0%
listings.price: 4.29%
listings.property_type: 0.0%
listings.room_type: 0.0%
listings.latitude: 0.0%
listings.longitude: 0.0%
listings.host_is_superhost: 2.04%
listings.host_acceptance_rate: 5.7%
listings.host_response_rate: 14.98%
listings.amenities: 0.0%


In [17]:
# 6) Preview a few rows that can cause issues
print("\nreviews.head():")
print(reviews.head(5).to_string(index=False)[:1200])
print("\nlistings.head():")
print(listings.head(5).to_string(index=False)[:1200])


reviews.head():
listing_id        id       date reviewer_id reviewer_name                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     comments
     46071 506601469 2019-08-10   124689447          Mari                                                                                                                                                                                                                                                                                                                                                       

In [18]:
# 7) Check for duplicates
dup_reviews = reviews['id'].duplicated().sum()
dup_listings = listings['id'].duplicated().sum()

print(f"Duplicate review IDs: {dup_reviews} ({dup_reviews / len(reviews) * 100:.4f}%)")
print(f"Duplicate listing IDs: {dup_listings} ({dup_listings / len(listings) * 100:.4f}%)")

Duplicate review IDs: 0 (0.0000%)
Duplicate listing IDs: 0 (0.0000%)


In [19]:
# 8) Cross-reference consistency
linked_ratio = reviews['listing_id'].isin(listings['id']).mean() * 100
print(f"Listing–review linkage consistency: {linked_ratio:.2f}%")

Listing–review linkage consistency: 100.00%


In [20]:
pip install langdetect



In [21]:
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 42

sample = reviews['comments'].dropna().sample(500, random_state=42)
non_english = sum(detect(c) != 'en' for c in sample)
print(f"Non-English rate (sampled): {non_english / len(sample) * 100:.2f}%")

Non-English rate (sampled): 5.20%


In [25]:
# 10) Schema validation
required_reviews_cols = ["listing_id", "id", "date", "reviewer_id", "reviewer_name", "comments"]
required_listings_cols = ["id", "price", "property_type", "room_type", "host_is_superhost"]

missing_reviews = [c for c in required_reviews_cols if c not in reviews.columns]
missing_listings = [c for c in required_listings_cols if c not in listings.columns]

print("Missing in reviews:", missing_reviews)
print("Missing in listings:", missing_listings)

# 11) Select key columns for analysis
reviews_df = reviews[required_reviews_cols].copy()
listings_df = listings[required_listings_cols].copy()

print("Reviews columns selected:", list(reviews_df.columns))
print("Listings columns selected:", list(reviews_df.columns))

Missing in reviews: []
Missing in listings: []
Reviews columns selected: ['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments']
Listings columns selected: ['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments']


In [23]:
# STEP 2: DATA CLEANING & FILTERING (with full English filtering)
import pandas as pd
import re
from langdetect import detect, DetectorFactory

DetectorFactory.seed = 42  # ensure reproducibility

# Helper functions
def normalize_whitespace(s):
    """Trim, collapse whitespace, and remove zero-width chars."""
    zws = r'[\u200B-\u200D\uFEFF]'
    s = s.astype(str).str.replace(zws, '', regex=True)
    return s.str.replace(r'\s+', ' ', regex=True).str.strip()

print("STEP 2: DATA CLEANING & FILTERING")

# Normalize key columns
for c in ["listing_id","id","date","reviewer_id","reviewer_name","comments"]:
    reviews_df[c] = normalize_whitespace(reviews_df[c])
print("Normalized whitespace and removed control characters from key columns.")

# Drop rows with missing essential fields
essential = ["listing_id","id","date","comments"]
before = len(reviews_df)
missing_mask = reviews_df[essential].isna().any(axis=1) | (reviews_df[essential] == "").any(axis=1)
removed_missing = missing_mask.sum()
reviews_df = reviews_df.loc[~missing_mask].copy()
print(f"[Missing Values] Removed {removed_missing:,} rows → {len(reviews_df):,} remain.")

# Remove duplicate review IDs
dup_count = reviews_df["id"].duplicated(keep="first").sum()
reviews_df = reviews_df.drop_duplicates(subset=["id"], keep="first").copy()
print(f"[Duplicates] Removed {dup_count:,} duplicate review IDs → {len(reviews_df):,} remain.")

STEP 2: DATA CLEANING & FILTERING
Normalized whitespace and removed control characters from key columns.
[Missing Values] Removed 0 rows → 2,951,093 remain.
[Duplicates] Removed 0 duplicate review IDs → 2,951,093 remain.


In [24]:
def is_symbol_only(text):
    """Return True if text has no alphabetic characters (only emojis/symbols)."""
    if not isinstance(text, str) or text.strip() == "":
        return True
    # Use a regex pattern compatible with Python's re module
    return not bool(re.search(r'[^\W\d_]', text))

# Remove blank / symbol-only comments
symbol_only_mask = reviews_df["comments"].apply(is_symbol_only)
removed_symbols = symbol_only_mask.sum()
reviews_df = reviews_df.loc[~symbol_only_mask].copy()
print(f"[Symbol-only Comments] Removed {removed_symbols:,} rows → {len(reviews_df):,} remain.")

[Symbol-only Comments] Removed 5,794 rows → 2,945,299 remain.


In [None]:
# Detect and remove non-English reviews
import math
import numpy as np
from tqdm import tqdm
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 42

def detect_language_safe(text):
    try:
        lang = detect(text)
        return lang if lang else "other"
    except Exception:
        return "other"

def detect_language_in_chunk(df_chunk):
    return df_chunk["comments"].apply(detect_language_safe)

# configure chunking
N = len(reviews_df)
chunksize = 200_000
num_chunks = math.ceil(N / chunksize)

langs = []
print(f"Running language detection in {num_chunks} chunk(s) of ~{chunksize:,} rows...")

for i in tqdm(range(num_chunks), desc="Chunks"):
    start = i * chunksize
    stop  = min((i + 1) * chunksize, N)
    chunk = reviews_df.iloc[start:stop].copy()
    langs.append(detect_language_in_chunk(chunk))

# concatenate detected languages
reviews_df["lang"] = pd.concat(langs, axis=0).reset_index(drop=True)

# sanity check alignment
assert len(reviews_df["lang"]) == len(reviews_df), "Length mismatch after chunking!"

# distribution + filter
lang_pct = reviews_df["lang"].value_counts(normalize=True).mul(100).round(2)
print("Language distribution (%):")
print(lang_pct.to_string())

non_en_removed = (reviews_df["lang"] != "en").sum()
reviews_df = reviews_df.loc[reviews_df["lang"] == "en"].drop(columns=["lang"]).copy()
print(f"[Language Filter] Removed {non_en_removed:,} non-English reviews → {len(reviews_df):,} remain.")

Running language detection in 15 chunk(s) of ~200,000 rows...


Chunks:  40%|████      | 6/15 [45:17<1:07:50, 452.22s/it]

In [3]:
# Remove duplicate comments
dup_comments = reviews_df["comments"].duplicated(keep="first").sum()
reviews_df = reviews_df.drop_duplicates(subset=["comments"], keep="first").copy()
print(f"[Duplicate Comments] Removed {dup_comments:,} identical comment texts → {len(reviews_df):,} remain.")

NameError: name 'reviews_df' is not defined

In [None]:
print("STEP 2 COMPLETED")
print(f"Final English-only reviews: {len(reviews):,}")
print(f"Final listings: {len(listings):,}")

In [None]:
import os, shutil, subprocess

# Clean old clone if re-running
if os.path.exists(f"/content/{REPO_NAME}"):
    shutil.rmtree(f"/content/{REPO_NAME}")

!git clone $REPO_URL
%cd /content/$REPO_NAME
!git checkout $BRANCH || git checkout -b $BRANCH
!git status

Cloning into 'airbnb-nz-deception-sentiment'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 22 (delta 4), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (22/22), 10.30 KiB | 10.30 MiB/s, done.
Resolving deltas: 100% (4/4), done.
/content/airbnb-nz-deception-sentiment
Already on 'main'
Your branch is up to date with 'origin/main'.
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [1]:
pwd

'/content'

In [None]:
from getpass import getpass
import os, textwrap

print("Create a GitHub Personal Access Token (classic) with 'repo' scope.")
print("GitHub → Settings → Developer settings → Personal access tokens → Tokens (classic).")
GITHUB_TOKEN = getpass("Paste your GitHub token (input hidden): ")

# Store credentials for this runtime
!git config --global credential.helper store
with open(os.path.expanduser("~/.git-credentials"), "w") as f:
    f.write(f"https://{GITHUB_USER}:{GITHUB_TOKEN}@github.com\n")

# Basic identity
!git config --global user.name $GITHUB_USER
!git config --global user.email "nantmoetheingi@gmail.com"

print("Credentials configured for this runtime.")

Create a GitHub Personal Access Token (classic) with 'repo' scope.
GitHub → Settings → Developer settings → Personal access tokens → Tokens (classic).
Paste your GitHub token (input hidden): ··········
Credentials configured for this runtime.


In [None]:
import os
for p in ["data", "datasets", "models", "runs", "outputs", "figures"]:
    os.makedirs(os.path.join(PROJECT_DRIVE_DIR, p), exist_ok=True)

print("Drive artifact folders ready under:", PROJECT_DRIVE_DIR)

Drive artifact folders ready under: /content/drive/MyDrive/Colab Notebooks/airbnb_nz_deception_sentiment


In [None]:
import shutil, os


REPO_URL = f"https://github.com/nantmoe-theingi/airbnb-nz-deception-sentiment.git"

if os.path.exists(f"/content/{REPO_NAME}"):
    shutil.rmtree(f"/content/{REPO_NAME}")

!git clone $REPO_URL
%cd /content/$REPO_NAME
!git checkout $BRANCH || git checkout -b $BRANCH
!git status

Cloning into 'airbnb-nz-deception-sentiment'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 25 (delta 5), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (25/25), 12.46 KiB | 4.15 MiB/s, done.
Resolving deltas: 100% (5/5), done.
/content/airbnb-nz-deception-sentiment
Already on 'main'
Your branch is up to date with 'origin/main'.
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [25]:
pwd

'/content/drive/MyDrive/Colab Notebooks/airbnb_nz_deception_sentiment'

In [None]:
%cd /content/$REPO_NAME/notebooks/

/content/airbnb-nz-deception-sentiment/notebooks


In [None]:
pwd

'/content/airbnb-nz-deception-sentiment/notebooks'