In [89]:
import json
import csv
from cuisines import cuisines

In [62]:
def get_contents(contents, columns):
    """Lazily yield ``str(contents[column])`` for each entry of ``columns``.

    The values come out in the order of ``columns``. The result is a one-shot
    ``map`` iterator, so wrap it in ``list(...)`` if it must be reused.
    A missing key raises ``KeyError`` when the iterator is consumed.
    """
    return map(str, (contents[column] for column in columns))

In [65]:
# Get users with at least 20 reviews (the filter below is >= 20, so exactly 20 is included)
users_over_20 = set()

user_columns = ['user_id', 'name', 'review_count']
# newline='' is what the csv module asks for on files handed to csv.writer;
# without it, platforms that translate '\n' emit '\r\r\n' row endings.
with open('users_over_20.csv', 'w', encoding='utf-8', newline='') as fout:
    csv_file = csv.writer(fout)
    csv_file.writerow(user_columns)
    # The input is read one line at a time, each line parsed as its own JSON
    # document; json.loads accepts the raw bytes directly.
    with open('yelp_academic_dataset_user.json', 'rb') as fin:
        for line in fin:
            contents = json.loads(line)
            if contents['review_count'] >= 20:
                # Keep the id in memory for the review-filtering cell and
                # persist the selected columns to the CSV.
                users_over_20.add(contents['user_id'])
                csv_file.writerow(get_contents(contents, user_columns))
print (f"{len(users_over_20)}")

312462


In [70]:
def clean_json(s):
    """Rewrite a Python-literal style string so that ``json.loads`` can parse it.

    Bare ``True``, ``False`` and ``None`` tokens are wrapped in double quotes
    (so they decode as the strings ``'True'``, ``'False'``, ``'None'``), and
    every single quote is then turned into a double quote.

    Tokens that are already quoted, or that are part of a longer word, are
    left alone. Blindly replacing them would double-quote an existing
    ``'True'`` and produce invalid JSON; leaving a bare ``None`` unquoted
    would make ``json.loads`` fail.

    Parameters:
        s: the text to convert, e.g. ``"{'casual': True, 'divey': None}"``.

    Returns:
        The converted text, e.g. ``'{"casual": "True", "divey": "None"}'``.

    Note: apostrophes inside string values are still converted to double
    quotes, so such input will not yield valid JSON.
    """
    import re  # local import keeps this cell independent of the notebook's import cell

    # The look-arounds reject matches that touch a quote or a word character.
    quoted = re.sub(r"(?<![\w'\"])(True|False|None)(?![\w'\"])", r'"\1"', s)
    return quoted.replace("'", '"')

In [76]:
# Get business names and categories
businesses_lv = set()
# NOTE(review): with_cuisine and attributes are initialised here but never
# read or updated in the visible cells; kept so the notebook namespace is unchanged.
with_cuisine = 0
attributes = {'GoodForMeal': set(), 'RestaurantsPriceRange2': set(), 'NoiseLevel': set(), 'RestaurantsAttire': set()}
total = 0
business_columns = ['business_id', 'name', 'latitude', 'longitude', 'attributes', 'categories']
# newline='' is what the csv module asks for on files handed to csv.writer;
# without it, platforms that translate '\n' emit '\r\r\n' row endings.
with open('businesses.csv', 'w', encoding='utf-8', newline='') as fout, \
        open('businesses_lv.csv', 'w', encoding='utf-8', newline='') as fout2:
    csv_file = csv.writer(fout)
    csv_file.writerow(business_columns)
    csv_file_lv = csv.writer(fout2)
    csv_file_lv.writerow(business_columns)
    with open('yelp_academic_dataset_business.json', 'rb') as fin:
        for line in fin:
            contents = json.loads(line)
            # May exclude bars, but captures most restaurants / cafes and excludes other categories
            # NOTE(review): these `in` tests run against whatever 'categories' holds;
            # if it is a single string they are substring matches — confirm that is intended.
            if contents['categories'] and ('Restaurants' in contents['categories'] or 'Food' in contents['categories']):
                # Build the row once; it is written to one or both files below.
                row = list(get_contents(contents, business_columns))
                if contents['city'].lower() == 'las vegas':
                    businesses_lv.add(contents['business_id'])
                    csv_file_lv.writerow(row)
                csv_file.writerow(row)
                total += 1
# RestaurantsPriceRange2, GoodForMeal, NoiseLevel, RestaurantsAttire
print (f"Total businesses: {total}, Total in LV: {len(businesses_lv)}")

Total businesses: 72651, Total in LV: 7981


In [66]:
# Get reviews for relevant users and businesses
# Relies on users_over_20 and businesses_lv, the sets filled by the user and
# business cells; those cells must have run first.
reviews_in_lv = 0
review_columns = ['user_id', 'business_id', 'stars']
# newline='' is what the csv module asks for on files handed to csv.writer;
# without it, platforms that translate '\n' emit '\r\r\n' row endings.
with open('reviews.csv', 'w', encoding='utf-8', newline='') as fout, \
        open('reviews_lv.csv', 'w', encoding='utf-8', newline='') as fout2:
    csv_file = csv.writer(fout)
    csv_file.writerow(review_columns)
    csv_file_lv = csv.writer(fout2)
    csv_file_lv.writerow(review_columns)
    with open('yelp_academic_dataset_review.json', 'rb') as fin:
        for line in fin:
            contents = json.loads(line)
            if contents['user_id'] in users_over_20:
                # Build the row once; it is written to one or both files below.
                row = list(get_contents(contents, review_columns))
                if contents['business_id'] in businesses_lv:
                    csv_file_lv.writerow(row)
                    reviews_in_lv += 1
                # reviews.csv gets every review by a selected user,
                # whichever business it is for.
                csv_file.writerow(row)
print (f"Reviews for LV: {reviews_in_lv}")

Reviews for LV: 709671


In [85]:
def parse_values(s):
    """Encode a stringified mapping as a string of '1'/'0' flags.

    ``s`` is passed through ``clean_json`` and decoded with ``json.loads``.
    The result has one character per key, in sorted key order: '1' where the
    decoded value equals the string 'True', '0' for anything else.
    """
    flags = json.loads(clean_json(s))
    return ''.join('1' if flags[name] == 'True' else '0' for name in sorted(flags))

def noise_level(n):
    """Translate a noise-level label into its ordinal code, returned as a string.

    'quiet' -> '1', 'average' -> '2', 'loud' -> '3', 'very_loud' -> '4'.
    Any other value yields '0'.
    """
    ordered_labels = ('quiet', 'average', 'loud', 'very_loud')
    for code, label in enumerate(ordered_labels, start=1):
        if n == label:
            return str(code)
    return '0'

In [88]:
# Get business data with features: star rating, cuisine, price range, noise level, good for kids/groups, ambience
business_columns = ['business_id', 'name', 'stars', 'latitude', 'longitude', 'attributes', 'categories']
features = ['cuisines', 'price_range', 'noise_level', 'good_for_kids', 'good_for_groups', 'ambience']
# NOTE(review): x is never read in the visible cells; kept so the notebook namespace is unchanged.
x = {'Ambience': set(), 'NoiseLevel': set()}
# newline='' is what the csv module asks for on files handed to csv.writer;
# without it, platforms that translate '\n' emit '\r\r\n' row endings.
with open('businesses_features.csv', 'w', encoding='utf-8', newline='') as fout:
    csv_file = csv.writer(fout)
    csv_file.writerow(business_columns + features)
    with open('yelp_academic_dataset_business.json', 'rb') as fin:
        for line in fin:
            contents = json.loads(line)
            # May exclude bars, but captures most restaurants / cafes and excludes other categories
            if contents['categories'] and ('Restaurants' in contents['categories'] or 'Food' in contents['categories']):
                if contents['city'].lower() == 'las vegas':
                    row = list(get_contents(contents, business_columns))
                    # cuisines: the business categories that also appear in `cuisines`, comma-joined.
                    row.append(','.join(c for c in contents['categories'].split(', ') if c in cuisines))
                    # A missing/empty attributes value behaves like an empty mapping,
                    # so every feature falls back to its zero default.
                    attr = contents['attributes'] or {}
                    # price_range: raw attribute value, '0' when absent.
                    row.append(attr['RestaurantsPriceRange2'] if 'RestaurantsPriceRange2' in attr else '0')
                    # noise_level: ordinal code from noise_level(), '0' when absent.
                    row.append(noise_level(attr['NoiseLevel']) if 'NoiseLevel' in attr else '0')
                    # good_for_kids / good_for_groups: compare the textual form so that both a
                    # real boolean True and the string 'True' count as set, while the strings
                    # 'False' / 'None' (truthy as plain strings) are recorded as '0'.
                    row.append('1' if str(attr.get('GoodForKids')) == 'True' else '0')
                    row.append('1' if str(attr.get('RestaurantsGoodForGroups')) == 'True' else '0')
                    # ambience: flag string from parse_values(), nine zeros when absent.
                    row.append(parse_values(attr['Ambience']) if 'Ambience' in attr else '0' * 9) # 9 possible values
                    csv_file.writerow(row)

# NOTE(review): total and businesses_lv are not modified in this cell, so this
# line repeats the counts computed by the earlier businesses cell.
print (f"Total businesses: {total}, Total in LV: {len(businesses_lv)}")

Total businesses: 72651, Total in LV: 7981
