# Read Data

Initial reading of the photos and labels from parquet files.

In [1]:
import os
import random
import re
import shutil

import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
from pyarrow import Table


In [2]:
PHOTO_DIR = "../data/original/yelp_photos/photos"
PHOTO_JSON = "../data/original/yelp_photos/photos.json"
CLEANED_DIR = "../data/clean"
YELP_PARQUET = "../data/clean/yelp.gz"


## Photos

In [3]:
photo_ids = os.listdir(PHOTO_DIR)
photo_ids = [ i.removesuffix(".jpg") for i in photo_ids ]
print("Number of photos:", len(photo_ids))

Number of photos: 200098


In [4]:
print("Max char:", max((len(p) for p in photo_ids), default=0))
print("Min char:", min((len(p) for p in photo_ids), default=0))

Max char: 22
Min char: 22


In [5]:
print("Example:", photo_ids[random.randint(0, len(photo_ids) - 1)])

Example: EM1dV6exacL7GZ0hdVMEdQ


## Parquet

In [6]:
table = pq.read_table(YELP_PARQUET)

print("Schema:")
display(table.schema)

Schema:


photo_id: string
business_id: string
caption: string
label: string
name: string
address: string
city: string
state: string
postal_code: string
latitude: double
longitude: double
stars: double
review_count: int64
is_open: int64
attributes.GoodForKids: string
attributes.RestaurantsGoodForGroups: string
attributes.RestaurantsPriceRange2: string
attributes.BusinessAcceptsCreditCards: string
attributes.RestaurantsReservations: string
attributes.NoiseLevel: string
attributes.RestaurantsAttire: string
attributes.Caters: string
attributes.Alcohol: string
attributes.GoodForMeal: string
attributes.HasTV: string
attributes.BusinessParking: string
attributes.RestaurantsDelivery: string
attributes.Ambience: string
attributes.RestaurantsTakeOut: string
attributes.BikeParking: string
attributes.DriveThru: string
attributes.OutdoorSeating: string
attributes.ByAppointmentOnly: string
attributes.BusinessAcceptsBitcoin: string
attributes.WiFi: string
attributes.WheelchairAccessible: string
attributes.Res

In [7]:
print("Shape:", table.shape)

Shape: (200100, 54)


In [8]:
example = table.take([random.randint(0, table.shape[0] - 1)]).to_pandas().iloc[0]
display(example)

photo_id                                                            DzFXevEC7VvZElgAIyhhHw
business_id                                                         jyRQNFeaGhdZBgf4kkWFlA
caption                                                                                   
label                                                                                 food
name                                                                    Roe's Deli Station
address                                                                   5851 Park Blvd N
city                                                                         Pinellas Park
state                                                                                   FL
postal_code                                                                          33781
latitude                                                                          27.83959
longitude                                                                       -82.714191

In [9]:
example_id = example.photo_id

if example_id in photo_ids:
    print("Example id found in file system")

Example id found in file system


In [10]:
dataset = ds.dataset(YELP_PARQUET, format="parquet")

check_missing_ids = False
if check_missing_ids:
    missing = set()

    for batch in dataset.scanner(columns=["photo_id"]).to_batches():
        arr = batch.column(0)
        for v in arr.to_pylist():
            if v not in photo_ids:
                missing.add(v)
                print("Found missing id:", v)

    print("Any missing parquet ids on disk?", bool(missing))
    if missing:
        print("Sample missing:", len(missing))

# Data Cleaning

Processing the data to handle duplication and missing values.

In [11]:
photo_columns = [
    "photo_id", "business_id", "caption", "label"
]
business_columns = list(set(table.column_names) - set(photo_columns))
business_columns = [
    "business_id", *business_columns
]

# Split the table into photo_details and business_details
photo_details = table.select(photo_columns)
business_details = table.select(business_columns)

# Deduplicate business_details by business_id
business_details_df = business_details.to_pandas()
business_details_df = business_details_df.drop_duplicates(subset=["business_id"])

# Drop the index to avoid __index_level_0__
business_details_df.reset_index(drop=True, inplace=True)

# Convert back to PyArrow Table
business_details = Table.from_pandas(business_details_df)

# Remove schema metadata
business_details = business_details.replace_schema_metadata(None)


# Renaming

In [12]:
def to_snake_case(s):
    s = re.sub(r"([a-z])([A-Z])", r"\1_\2", s)
    s = re.sub(r"\W+", "_", s)
    return s.lower()

# Rename columns in the business_details table
new_column_names = []
for col in business_details.column_names:
    cleaned = col.removeprefix("attributes.")
    new_column_names.append(to_snake_case(cleaned))

# Apply the new column names
business_details = business_details.rename_columns(new_column_names)

print(business_details.schema)

business_id: string
restaurants_delivery: string
drive_thru: string
wi_fi: string
happy_hour: string
caters: string
byobcorkage: string
restaurants_price_range2: string
good_for_meal: string
coat_check: string
city: string
smoking: string
good_for_kids: string
address: string
restaurants_take_out: string
bike_parking: string
postal_code: string
restaurants_reservations: string
noise_level: string
wheelchair_accessible: string
business_accepts_bitcoin: string
good_for_dancing: string
restaurants_table_service: string
restaurants_counter_service: string
best_nights: string
name: string
state: string
restaurants_attire: string
dietary_restrictions: string
stars: double
corkage: string
byob: string
business_parking: string
by_appointment_only: string
is_open: int64
hair_specializes_in: string
open24hours: string
accepts_insurance: string
latitude: double
business_accepts_credit_cards: string
has_tv: string
music: string
alcohol: string
categories: string
dogs_allowed: string
longitude: dou

# Save new Parquet file

In [13]:
# Write the split tables to separate Parquet files
photo_details_path = f"{CLEANED_DIR}/photo_details.parquet"
business_details_path = f"{CLEANED_DIR}/business_details.parquet"

pq.write_table(photo_details, photo_details_path, compression="snappy")
pq.write_table(business_details, business_details_path, compression="snappy")

print(f"Photo details written to {photo_details_path}")
print(f"Business details written to {business_details_path}")

Photo details written to ../data/clean/photo_details.parquet
Business details written to ../data/clean/business_details.parquet


In [14]:
photo_details = pq.read_table(photo_details_path)
print(photo_details.schema)
print(photo_details.shape)


photo_id: string
business_id: string
caption: string
label: string
(200100, 4)


In [15]:
business_details = pq.read_table(business_details_path)
print(business_details.schema)
print(business_details.shape)


business_id: string
restaurants_delivery: string
drive_thru: string
wi_fi: string
happy_hour: string
caters: string
byobcorkage: string
restaurants_price_range2: string
good_for_meal: string
coat_check: string
city: string
smoking: string
good_for_kids: string
address: string
restaurants_take_out: string
bike_parking: string
postal_code: string
restaurants_reservations: string
noise_level: string
wheelchair_accessible: string
business_accepts_bitcoin: string
good_for_dancing: string
restaurants_table_service: string
restaurants_counter_service: string
best_nights: string
name: string
state: string
restaurants_attire: string
dietary_restrictions: string
stars: double
corkage: string
byob: string
business_parking: string
by_appointment_only: string
is_open: int64
hair_specializes_in: string
open24hours: string
accepts_insurance: string
latitude: double
business_accepts_credit_cards: string
has_tv: string
music: string
alcohol: string
categories: string
dogs_allowed: string
longitude: dou

## Duplication and Missing Values

In [16]:
# TODO

# Subset

In [17]:
sampled_photo_ids = random.sample(photo_ids, 200)

In [18]:
# Filter photo_details for the sampled photo_ids
photo_id_column = photo_details.column("photo_id").to_pandas()
mask = photo_id_column.isin(sampled_photo_ids)
arrow_mask = pa.array(mask)  # Convert the Pandas Series to a PyArrow array
reduced_photo_details = photo_details.filter(arrow_mask)

# Filter business_details for the business_ids corresponding to the sampled photo_ids
sampled_business_ids = reduced_photo_details.column("business_id").to_pandas().tolist()
business_id_column = business_details.column("business_id").to_pandas()
mask = business_id_column.isin(sampled_business_ids)
arrow_mask = pa.array(mask)  # Convert the Pandas Series to a PyArrow array
reduced_business_details = business_details.filter(arrow_mask)

# Write the reduced tables to new Parquet files
reduced_photo_details_path = f"{CLEANED_DIR}/reduced_photo_details.parquet"
reduced_business_details_path = f"{CLEANED_DIR}/reduced_business_details.parquet"

pq.write_table(reduced_photo_details, reduced_photo_details_path, compression="snappy")
pq.write_table(reduced_business_details, reduced_business_details_path, compression="snappy")

print(f"Reduced photo details written to {reduced_photo_details_path}")
print(f"Reduced business details written to {reduced_business_details_path}")

Reduced photo details written to ../data/clean/reduced_photo_details.parquet
Reduced business details written to ../data/clean/reduced_business_details.parquet


In [19]:
target_photo_dir = f"{CLEANED_DIR}/reduced_photos"

# Clean directory first
if os.path.exists(target_photo_dir):
    shutil.rmtree(target_photo_dir)

os.makedirs(target_photo_dir, exist_ok=True)

# Copy the sampled photos to the target directory
for photo_id in sampled_photo_ids:
    source_path = os.path.join(PHOTO_DIR, f"{photo_id}.jpg")
    target_path = os.path.join(target_photo_dir, f"{photo_id}.jpg")
    
    if os.path.exists(source_path):
        shutil.copy(source_path, target_path)
    else:
        print(f"Photo not found: {source_path}")

print(f"Copied {len(sampled_photo_ids)} photos to {target_photo_dir}")

Copied 200 photos to ../data/clean/reduced_photos
