In [1]:
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display, clear_output

# Locations of the review and business JSON files read further down
review_file_path, business_file_path = (
    '../input/yelp-dataset/yelp_academic_dataset_review.json',
    '../input/yelp-dataset/yelp_academic_dataset_business.json',
)

# Function to count keyword occurrences
def count_keywords(text, keywords):
    """Return the total number of keyword occurrences found in ``text``.

    Matching is case-insensitive and substring based: each keyword is counted
    with ``str.count`` (non-overlapping occurrences), so a keyword such as
    ``'wait'`` is also counted inside ``'waiter'``. A keyword that appears
    several times in ``keywords`` is counted once per appearance.

    Args:
        text: The text to scan. Anything that is not a ``str`` (for example
            ``None`` or a NaN from a missing value) yields a count of 0.
        keywords: Iterable of keyword strings. Empty keywords are ignored.

    Returns:
        int: Sum of the occurrence counts of all keywords.
    """
    if not isinstance(text, str):
        # Missing values would otherwise fail on ``.lower()``.
        return 0
    # Lower-case once instead of once per keyword.
    lowered = text.lower()
    # ``str.count('')`` returns len + 1, so empty keywords are skipped.
    return sum(lowered.count(word.lower()) for word in keywords if word)

# Label a review by whichever keyword group was mentioned more often
def classify(row):
    """Return 'Food', 'Service' or 'Neutral' for one review row.

    ``row`` must support lookup of ``'food_count'`` and ``'service_count'``.
    The larger count decides the label; when neither count is strictly
    greater than the other the result is 'Neutral'.
    """
    if row['food_count'] > row['service_count']:
        return 'Food'
    if row['food_count'] < row['service_count']:
        return 'Service'
    return 'Neutral'

# Business selector; created without options
dropdown_business = widgets.Dropdown(description='Business:')
# Category selector: 'All' or one of the three classification labels
dropdown_category = widgets.Dropdown(
    description='Category:',
    options=['All', 'Food', 'Service', 'Neutral'],
)
# Read-only text area that shows the selected reviews
review_text = widgets.Textarea(disabled=True, description='Review Text:')

# Refresh the review text area from the current dropdown selections
def update_review(change):
    """Show the reviews matching the selected business and category.

    Used as the observer callback of both dropdowns; ``change`` is accepted
    for that purpose but not inspected. Reads the module-level
    ``merged_data`` frame and writes the matching review texts, separated by
    blank lines, into ``review_text``.
    """
    selected_category = dropdown_category.value
    row_mask = merged_data['name'] == dropdown_business.value
    if selected_category != 'All':
        # Narrow further to the chosen classification label.
        row_mask = row_mask & (merged_data['category'] == selected_category)
    matching_texts = merged_data.loc[row_mask, 'text']
    review_text.value = '\n\n'.join(matching_texts)

# Re-run update_review whenever either dropdown's value changes
for _selector in (dropdown_business, dropdown_category):
    _selector.observe(update_review, names='value')

# Render the three widgets in order: business, category, review text
display(dropdown_business, dropdown_category, review_text)

# Read review data in chunks so the review file is parsed piece by piece
chunksize = 100000
reviews_chunks = pd.read_json(review_file_path, lines=True, chunksize=chunksize)

# Read business data
business = pd.read_json(business_file_path, lines=True)

# Filter businesses that are categorized as restaurants
df_yelp_business = business.copy()
df_yelp_business_restaurants = df_yelp_business.loc[(df_yelp_business['categories'].str.contains('Restaurants', na=False))]

# Define keywords related to food and service once, outside the chunk loop.
# dict.fromkeys drops repeated entries (order preserved) so that a keyword
# listed twice is not counted twice by count_keywords.
food_keywords = list(dict.fromkeys(['sick', 'food poisoning', 'hair', 'gross', 'portion', 'taste', 'dry', 'fresh', 'tasted', 'food', 'seasoned', 'bland', 'stale', 'tasty', 'tasted', 'texture', 'cooked', 'bite', 'crispy', 'soggy', 'raw', 'delicious']))
service_keywords = list(dict.fromkeys(['service', 'waiter', 'waitress', 'staff', 'hospitality','employee', 'driver', 'cook', 'manager', 'pissed', 'disrespectful', 'treated', 'experience', 'team', 'operating', 'operation', 'waitress', 'waiter', 'customer service', 'greet', 'apology', 'rude', 'condescending', 'angry', 'atmosphere', 'vibe', 'request' , 'requested', 'service', 'slow', 'minutes', 'hours', 'arrived', 'closing', 'close', 'closed', 'wait', 'waiting', 'price', 'pricing', 'prices', 'server', 'loud', 'environment', 'staff', 'friendly', 'place', 'location', 'wave', 'ambience', 'forgot', 'forgetting', 'wrong', 'establishment', 'pricey', 'owner', 'interior', 'clean', 'comfortable', 'greeted', 'attentive']))

# Only these two business columns are needed for the merge
restaurant_names = df_yelp_business_restaurants[['business_id', 'name']]

# Collect every classified chunk; assigning each chunk straight to
# merged_data would keep only the reviews of the last chunk.
classified_chunks = []
for reviews in reviews_chunks:
    # Merge review data with filtered restaurant business data
    chunk = pd.merge(reviews, restaurant_names, on='business_id', how='inner')
    if chunk.empty:
        # Nothing to classify; a row-wise apply on an empty frame would not
        # produce a usable 'category' column.
        continue

    # Count keyword hits per review
    chunk['food_count'] = chunk['text'].apply(count_keywords, args=(food_keywords,))
    chunk['service_count'] = chunk['text'].apply(count_keywords, args=(service_keywords,))

    # Classify reviews based on counts
    chunk['category'] = chunk.apply(classify, axis=1)

    classified_chunks.append(chunk)

# NOTE: this keeps all restaurant reviews in memory at once; if that is too
# much for the environment, drop unused columns from each chunk before
# appending it.
if classified_chunks:
    merged_data = pd.concat(classified_chunks, ignore_index=True)
else:
    # No restaurant reviews at all: keep the columns update_review reads.
    merged_data = pd.DataFrame(columns=['name', 'text', 'food_count', 'service_count', 'category'])
del classified_chunks

# Update dropdown options once, after merged_data is complete: changing the
# options can change the dropdown value, which triggers update_review.
dropdown_business.options = sorted(merged_data['name'].dropna().unique())


Dropdown(description='Business:', options=(), value=None)

Dropdown(description='Category:', options=('All', 'Food', 'Service', 'Neutral'), value='All')

Textarea(value='', description='Review Text:', disabled=True)