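# Data preprocessing

Sample reviews from each per-state review file, join them with the matching metadata on `gmap_id`, and clean the joined table.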
 

## Import packages

In [10]:
import pandas as pd
import random
import gzip, json
import numpy as np

In [11]:
def parse(path):
    """Lazily yield one decoded JSON record per line of a gzipped JSON-lines file.

    Args:
        path: Path to a ``.json.gz`` file holding one JSON document per line.

    Yields:
        The object decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Binary mode is kept on purpose: json.loads accepts bytes and detects
    # the encoding itself.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no record; json.loads would raise on them.
            if not l.strip():
                continue
            yield json.loads(l)


def reservoir_sample_jsonl_gz(path, k, seed=42):
    """Draw a uniform random sample of up to ``k`` records in a single pass.

    Records are streamed from ``parse(path)``, so the whole file is never
    held in memory; only the ``k``-slot reservoir is.

    Args:
        path: Path handed to ``parse``.
        k: Maximum number of records to keep.
        seed: Seed for the private ``random.Random`` instance, making the
            sample reproducible.

    Returns:
        A list with ``min(k, number of records)`` records.
    """
    generator = random.Random(seed)
    reservoir = []
    seen = 0
    for record in parse(path):
        seen += 1
        if seen <= k:
            # The first k records fill the reservoir unconditionally.
            reservoir.append(record)
            continue
        # Record number `seen` replaces a random slot with probability k/seen.
        slot = generator.randint(1, seen)
        if slot <= k:
            reservoir[slot - 1] = record
    return reservoir

In [12]:
# States to process. Each one contributes a metadata file and a review file,
# so both lists are derived from this single list and stay aligned.
_state_names = [
    "Alaska",
    "Delaware",
    "District_of_Columbia",
    "Montana",
    "South_Dakota",
    "Vermont",
    "Wyoming",
    "Rhode_Island",
    "North_Dakota",
    "West_Virginia",
]

meta_files = [f"meta-{state}.json.gz" for state in _state_names]

review_files = [f"review-{state}_10.json.gz" for state in _state_names]

In [13]:
K_PER_FILE = 5000
# NOTE(review): the output names still say "2000" although K_PER_FILE is 5000.
# They are left unchanged here in case other code reads these paths — confirm
# and rename.
OUT_CSV = "reviews_with_meta_perfile2000.csv"
OUT_PARQUET = "reviews_with_meta_perfile2000.parquet"

# Output column -> key read from the raw review record. The record's "name"
# is stored as "name_review_user"; every other field keeps its key.
review_fields = {
    "user_id": "user_id",
    "name_review_user": "name",
    "time": "time",
    "rating": "rating",
    "text": "text",
    "pics": "pics",
    "resp": "resp",
    "gmap_id": "gmap_id",
}

# Sample up to K_PER_FILE reviews from each file. The seed is offset by the
# file's position in review_files, so each file gets its own reproducible draw.
review_rows = []
for file_no, review_path in enumerate(review_files):
    sample = reservoir_sample_jsonl_gz(review_path, K_PER_FILE, seed=42 + file_no)
    print(f"{review_path}: sampled {len(sample)}")

    # Missing keys become None via dict.get.
    review_rows.extend(
        {column: raw.get(key) for column, key in review_fields.items()}
        for raw in sample
    )

df_reviews = pd.DataFrame(review_rows)
print("Collected reviews:", len(df_reviews))
print(df_reviews.head())



review-Alaska_10.json.gz: sampled 5000
review-Delaware_10.json.gz: sampled 5000
review-District_of_Columbia_10.json.gz: sampled 5000
review-Montana_10.json.gz: sampled 5000
review-South_Dakota_10.json.gz: sampled 5000
review-Vermont_10.json.gz: sampled 5000
review-Wyoming_10.json.gz: sampled 5000
review-Rhode_Island_10.json.gz: sampled 5000
review-North_Dakota_10.json.gz: sampled 5000
review-West_Virginia_10.json.gz: sampled 5000
Collected reviews: 50000
                 user_id   name_review_user           time  rating  \
0  108058074716948776283      Qiana Coleman  1539488755496       4   
1  104949021026115642355  Marina Harrington  1545440310213       5   
2  107813727678662505612       Felicia Wall  1575939675467       5   
3  103473424444871906765       Garry Hanley  1602702499311       5   
4  104393976961042472712    Nathan Harrison  1559438084515       4   

                                                text  pics  resp  \
0                                               None

In [14]:
# Read every metadata record from all metadata files, skipping blank lines.
meta_rows = []
for meta_path in meta_files:
    with gzip.open(meta_path, "rt", encoding="utf-8") as handle:
        meta_rows.extend(json.loads(raw) for raw in handle if raw.strip())

df_meta = pd.DataFrame(meta_rows)
print("Collected meta rows:", len(df_meta))

# Cast the join key to str on both sides so the merge compares like with like.
df_reviews["gmap_id"] = df_reviews["gmap_id"].astype(str)
if "gmap_id" not in df_meta.columns:
    raise KeyError("gmap_id not found in meta files")

# Keep one metadata row per gmap_id (the first occurrence); this is what makes
# the many-to-one validation below pass.
df_meta = (
    df_meta
    .assign(gmap_id=df_meta["gmap_id"].astype(str))
    .drop_duplicates(subset=["gmap_id"])
)

# Left join keeps every sampled review, matched or not.
df_joined = pd.merge(df_reviews, df_meta, on="gmap_id", how="left", validate="m:1")
print("Joined rows:", len(df_joined))
print(df_joined.head())



Collected meta rows: 149325
Joined rows: 50000
                 user_id   name_review_user           time  rating  \
0  108058074716948776283      Qiana Coleman  1539488755496       4   
1  104949021026115642355  Marina Harrington  1545440310213       5   
2  107813727678662505612       Felicia Wall  1575939675467       5   
3  103473424444871906765       Garry Hanley  1602702499311       5   
4  104393976961042472712    Nathan Harrison  1559438084515       4   

                                                text  pics  resp  \
0                                               None  None  None   
1                                               None  None  None   
2              Coffee is great and so is the service  None  None   
3  Easy in and out experience.  Reasonable pricin...  None  None   
4  Great lunch, service could be better as there ...  None  None   

                                 gmap_id                               name  \
0  0x56c897d4e9ac007f:0x1cbb3cff83d5e64b    

## Inspecting and cleaning

In [15]:
# Work on a copy so the joined frame stays available unchanged.
df = df_joined.copy()

# First look: sample rows, overall size, and the dtype of every column.
print("\n=== HEAD ===", df.head(3), sep="\n")
print("\n=== SHAPE ===", df.shape)
print("\n=== COLUMNS & DTYPES ===", df.dtypes, sep="\n")



=== HEAD ===
                 user_id   name_review_user           time  rating  \
0  108058074716948776283      Qiana Coleman  1539488755496       4   
1  104949021026115642355  Marina Harrington  1545440310213       5   
2  107813727678662505612       Felicia Wall  1575939675467       5   

                                    text  pics  resp  \
0                                   None  None  None   
1                                   None  None  None   
2  Coffee is great and so is the service  None  None   

                                 gmap_id                               name  \
0  0x56c897d4e9ac007f:0x1cbb3cff83d5e64b        Fire Island Rustic Bakeshop   
1  0x56c8eb6e0f938ebf:0x7da5bf485f8a1e71  Jitters... Where Coffee Is An Art   
2  0x56c67c7da439c267:0x72f12aacd98f7f2e              Coffee Express Lounge   

                                             address  ...   longitude  \
0  Fire Island Rustic Bakeshop, 1343 G St, Anchor...  ... -149.895277   
1  Jitters... Whe

In [16]:
# Count missing values per column and list the columns with the most first.
missing_per_column = df.isna().sum()
null_counts = missing_per_column.sort_values(ascending=False)
print("\n=== NULL COUNTS ===")
print(null_counts.iloc[:22])


=== NULL COUNTS ===
pics                48582
resp                45039
text                23378
description         22294
price               22026
state               21440
hours                5441
MISC                 2426
relative_results     2096
address                26
category                9
user_id                 0
avg_rating              0
num_of_reviews          0
latitude                0
longitude               0
name_review_user        0
name                    0
gmap_id                 0
rating                  0
time                    0
url                     0
dtype: int64


In [17]:
# Keep only reviews that have text, then recount the remaining missing values.
has_text = df["text"].notna()
df = df[has_text]
null_counts = df.isna().sum().sort_values(ascending=False)
print("\n=== NULL COUNTS ===", null_counts.head(22), sep="\n")


=== NULL COUNTS ===
pics                25346
resp                23559
description         12050
price               11875
state               11139
hours                2959
MISC                 1497
relative_results     1325
address                15
category                6
user_id                 0
num_of_reviews          0
avg_rating              0
latitude                0
longitude               0
name_review_user        0
name                    0
gmap_id                 0
text                    0
rating                  0
time                    0
url                     0
dtype: int64


In [None]:
# Columns removed from the working frame before further processing.
cols_to_drop = ["description",
                "name",
                "hours",
                "MISC",
                "relative_results",
                "url",
                "address",
                "pics"]
# Only drop columns that are still present, so re-running the cell after the
# columns are already gone does not raise.
df = df.drop(columns=[c for c in cols_to_drop if c in df.columns])
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() emitted a stray "None" after the summary.
df.info()

TypeError: unhashable type: 'list'