In [None]:
import pandas as pd

# Build a fixed-size sample from the head of the full Yelp review file.
# The file is streamed in chunks of `chunk_size` rows and reading stops as
# soon as at least `target_reviews` rows have been collected.
chunk_size = 100000
target_reviews = 200000

collected = []      # chunks read so far; concatenated once after the loop
rows_collected = 0  # running row count across `collected`

# The reader is used as a context manager so the underlying file handle is
# closed even though the loop exits early via `break`.
with pd.read_json('data/yelp_reviews.json', lines=True, chunksize=chunk_size) as chunks:
    for chunk in chunks:
        collected.append(chunk)
        rows_collected += len(chunk)
        if rows_collected >= target_reviews:
            break

# A single concat avoids re-copying the accumulated rows on every iteration
# (and avoids concatenating onto an empty placeholder frame). The slice trims
# any overshoot from the last chunk; it is a no-op if fewer rows were read.
if collected:
    sample_data = pd.concat(collected).iloc[:target_reviews]
else:
    # Empty input file: keep the original behaviour of producing an empty frame.
    sample_data = pd.DataFrame()

sample_data.to_json('data/yelp_reviews_sample.json', orient='records', lines=True)
print("Sampled reviews:", len(sample_data))
# memory_usage(deep=True) measures the DataFrame's footprint in RAM, which is
# not the size of the JSON file written above, so the label says so explicitly.
print("In-memory size (MB):", sample_data.memory_usage(deep=True).sum() / (1024 * 1024))

Sampled reviews: 200000
File size (MB): 172.058762550354


In [None]:
import pandas as pd
# Read back the sample file (newline-delimited JSON) and take a quick look at it.
sample_data = pd.read_json('data/yelp_reviews_sample.json', lines=True)

# Header and frame preview are emitted on separate lines via sep="\n".
print("First 5 rows:", sample_data.head(), sep="\n")
print("Columns:", sample_data.columns)
# shape[0] is the row count of the loaded frame.
print("Number of reviews:", sample_data.shape[0])

First 5 rows:
                review_id                 user_id             business_id  \
0  KU_O5udG6zpxOg-VcAEodg  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw   
1  BiTunyQ73aT9WBnpR9DZGw  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ   
2  saUsX_uimxRlCVr67Z4Jig  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A   
3  AqPFMleE6RsU23_auESxiA  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA   
4  Sx8TMOWLNuJBWer-0pcmoA  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ   

   stars  useful  funny  cool  \
0      3       0      0     0   
1      5       1      0     1   
2      3       0      0     0   
3      5       1      0     1   
4      4       1      0     1   

                                                text                date  
0  If you decide to eat here, just be aware it is... 2018-07-07 22:09:11  
1  I've taken a lot of spin classes over the year... 2012-01-03 15:28:18  
2  Family diner. Had the buffet. Eclectic assortm... 2014-02-05 20:30:30  
3  Wow!  Yummy, differen

In [None]:
import pandas as pd

# Load the business table (newline-delimited JSON).
# read_json decodes as UTF-8 by default; if that fails, retry as Latin-1.
try:
    business = pd.read_json('data/yelp_business.json', lines=True)
except UnicodeDecodeError:
    # Latin-1 maps every possible byte to a character, so this retry cannot
    # itself raise UnicodeDecodeError. 'iso-8859-1' is just another name for
    # the same codec, so no further encoding fallback is needed. Any other
    # failure (missing file, malformed JSON) propagates and stops the cell.
    business = pd.read_json('data/yelp_business.json', lines=True, encoding='latin1')

print("First 5 rows of business data:")
print(business.head())
print("Columns:", business.columns)

# These are the raw `categories` values of the first 100 rows that have one;
# they are not de-duplicated, so the heading does not claim uniqueness.
print("\nCategories (first 100 non-null rows):")
print(business['categories'].dropna().head(100))
print("\nNumber of unique business_ids in business JSON:", business['business_id'].nunique())

First 5 rows of business data:
              business_id                      name  \
0  Pns2l4eNsfO8kk83dixA6A  Abby Rappoport, LAC, CMQ   
1  mpf3x-BjTdTEA3yCZrAYPw             The UPS Store   
2  tUFrWirKiKi_TAnsVWINQQ                    Target   
3  MTSW4McQd7CbVtyjqoe9mw        St Honore Pastries   
4  mWMc6_wTdE0EUBKIGXDVfA  Perkiomen Valley Brewery   

                           address           city state postal_code  \
0           1616 Chapala St, Ste 2  Santa Barbara    CA       93101   
1  87 Grasso Plaza Shopping Center         Affton    MO       63123   
2             5255 E Broadway Blvd         Tucson    AZ       85711   
3                      935 Race St   Philadelphia    PA       19107   
4                    101 Walnut St     Green Lane    PA       18054   

    latitude   longitude  stars  review_count  is_open  \
0  34.426679 -119.711197    5.0             7        0   
1  38.551126  -90.335695    3.0            15        1   
2  32.223236 -110.880452    3.5      

In [None]:

# Load the sampled reviews and check how their business_ids line up with the
# `business` frame loaded in an earlier cell.
reviews = pd.read_json('data/yelp_reviews_sample.json', lines=True)

print("Number of unique business_ids in reviews:", reviews['business_id'].nunique())

# IDs that appear in both the reviews sample and the business table.
overlapping_ids = set(reviews['business_id']) & set(business['business_id'])
print(f"Number of overlapping business_ids: {len(overlapping_ids)}")

if overlapping_ids:
    def is_restaurant(categories):
        """Return True when `categories` mentions 'restaurants' (case-insensitive).

        Accepts a single string or a list of strings; any other value
        (for example a missing entry) yields False.
        """
        if isinstance(categories, list):
            for entry in categories:
                if 'restaurants' in entry.lower():
                    return True
            return False
        if isinstance(categories, str):
            return 'restaurants' in categories.lower()
        return False

    # Keep only businesses whose categories mention restaurants ...
    restaurants = business.loc[business['categories'].apply(is_restaurant)]
    restaurant_ids = restaurants['business_id'].tolist()

    # ... and then only the reviews written for those businesses.
    restaurant_reviews = reviews.loc[reviews['business_id'].isin(restaurant_ids)]

    # Persist the filtered reviews as newline-delimited JSON.
    restaurant_reviews.to_json('data/yelp_restaurant_reviews.json', orient='records', lines=True)
    print(f"Total restaurant reviews: {len(restaurant_reviews)}")
    print("First 5 rows of filtered reviews:")
    print(restaurant_reviews.head())
else:
    print("No overlapping business_ids found. Check if the datasets are compatible.")

Number of unique business_ids in reviews: 11451
Number of overlapping business_ids: 11451
Total restaurant reviews: 144424
First 5 rows of filtered reviews:
                review_id                 user_id             business_id  \
0  KU_O5udG6zpxOg-VcAEodg  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw   
2  saUsX_uimxRlCVr67Z4Jig  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A   
3  AqPFMleE6RsU23_auESxiA  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA   
4  Sx8TMOWLNuJBWer-0pcmoA  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ   
5  JrIxlS1TzJ-iCu79ul40cQ  eUta8W_HdHMXPzLBBZhL1A  04UD14gamNjLY0IDYVhHJg   

   stars  useful  funny  cool  \
0      3       0      0     0   
2      3       0      0     0   
3      5       1      0     1   
4      4       1      0     1   
5      1       1      2     1   

                                                text                date  
0  If you decide to eat here, just be aware it is... 2018-07-07 22:09:11  
2  Family diner. Had the buffet