**INSTRUCTIONS**

Run each cell in the order indicated. Cells labeled "optional" do not contribute to subsequent steps.

*Disclaimer:* The steps do not match the README.md in the repository due to the nature of coding in a Jupyter Notebook.

In [None]:

"""
STEP 1a: Pulling data directly from Kaggle

If desired, pull data directly from the Kaggle API using a valid account. If referencing stored files is preferable, upload both data files to the Google Drive of the Google Account signed
into this runtime and proceed to step 1b.

"""
%%writefile kaggle.json
{
  "username": "pallavimamilla",
  "key": "KaggleAPIKey"
}

Writing kaggle.json


In [None]:
"""
Step 1a continued
"""

!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets list | head

ref                                                            title                                               size  lastUpdated                 downloadCount  voteCount  usabilityRating  
-------------------------------------------------------------  --------------------------------------------  ----------  --------------------------  -------------  ---------  ---------------  
saidaminsaidaxmadov/chocolate-sales                            Chocolate Sales                                   468320  2026-01-04 14:23:35.490000          15422        277  1.0              
aliiihussain/amazon-sales-dataset                              Amazon_Sales_Dataset                             1297759  2026-02-01 11:37:12.353000           2634         52  1.0              
sanaijlalshahrukh/gold-price-analysis-10-year-historical-data  Gold Price analysis 10-Year Historical Data        41715  2026-02-05 03:45:30.177000            745         23  1.0              
ayeshaimran1619/student-academic-st

In [None]:
"""
Step 1a continued
"""
!kaggle datasets download -d yelp-dataset/yelp-dataset
!unzip yelp-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/yelp-dataset/yelp-dataset
License(s): other
Downloading yelp-dataset.zip to /content
100% 4.06G/4.07G [02:08<00:00, 216MB/s]
100% 4.07G/4.07G [02:08<00:00, 33.9MB/s]
Archive:  yelp-dataset.zip
  inflating: Dataset_User_Agreement.pdf  
  inflating: yelp_academic_dataset_business.json  
  inflating: yelp_academic_dataset_checkin.json  
  inflating: yelp_academic_dataset_review.json  
  inflating: yelp_academic_dataset_tip.json  
  inflating: yelp_academic_dataset_user.json  


In [1]:
"""
Step 1b: Loading data from a mounted Google Drive

Prerequisite: upload yelp_academic_dataset_business.json and yelp_academic_dataset_review.json to the Google Drive of the Google Account signed into this runtime.

Follow the prompts to allow access to the Google Drive.
"""

from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks


In [2]:
"""
Step 2: Load common dependencies
"""

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import json

In [3]:
"""
Step 3: Load yelp_academic_dataset_business.json into a DataFrame
"""

business_data = [] #temporary matrix to which each JSON line will be appended
with open("yelp_academic_dataset_business.json", "r", encoding="utf-8") as file: #open the JSON file with standard encoding
    for line in file: #for each line in the JSON lines file, add each line to the matrix
        business_data.append(json.loads(line))

business_df = pd.DataFrame(business_data) #convert temporary matrix to necessary DataFrame structure
business_df

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,Edmonton,AB,T6J 5H2,53.468419,-113.492054,3.0,13,1,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...","Nail Salons, Beauty & Spas","{'Monday': '10:0-19:30', 'Tuesday': '10:0-19:3..."
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,Nashville,TN,37204,36.115118,-86.766925,4.0,5,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Pets, Nurseries & Gardening, Pet Stores, Hobby...","{'Monday': '9:30-17:30', 'Tuesday': '9:30-17:3..."
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,IN,46250,39.908707,-86.065088,3.5,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Shopping, Jewelry, Piercing, Toy Stores, Beaut...",
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,IL,62025,38.782351,-89.950558,4.0,24,1,"{'BusinessParking': '{'garage': False, 'street...","Fitness/Exercise Equipment, Eyewear & Optician...","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ..."


In [4]:
"""
Step 4: Load yelp_academic_dataset_review.json into a DataFrame
"""

review_data = []  #temporary matrix to which each JSON line will be appended
with open("yelp_academic_dataset_review.json", "r", encoding="utf-8") as file:  #open the JSON file with standard encoding
    for i, line in enumerate(file): #for each line in the JSON lines file, add each line to the matrix
        review_data.append(json.loads(line))
        if i >= 200000: #limit appending to only the first 200,000 records of the file
            break

review_df = pd.DataFrame(review_data) #convert temporary matrix to necessary DataFrame structure
review_df

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15
...,...,...,...,...,...,...,...,...,...
199996,4ylitTY7GmonnQDqhjB-eg,_fMw99otXOCu-Kzc-loMZA,v72HalBu5fQIFTD-oqhSgQ,5.0,0,0,0,"Let me just say, I'm glad my husband and I dec...",2016-08-18 00:23:31
199997,MnwNSr5h_t1uIlcj3aI_5w,ZfG1T_noQnbOdXnz4USs0Q,cvHY1RTAPSoH94ysP2_eyQ,5.0,1,1,1,I was coming back from the Farmers market toda...,2014-06-21 17:12:53
199998,3fp0uXGAjIqgza02_ObF5w,I0MMq4iYjDjERnSjp76HDw,cBp1fn2LXpSHe4VbTXLYEw,5.0,0,0,1,Absolutely perfect meal for me. The food is cl...,2017-09-06 17:54:23
199999,cSlJEcygfh-57otb6dF3Lg,sokS5Ml2QyKjOwO56pS9Jw,oBhJuukGRqPVvYBfTkhuZA,5.0,0,0,0,Braised beef Short Rib! Asparagus on the side....,2015-06-16 01:51:18


In [5]:
"""
Step 5: Filter the business dataset to only include records with "Restaurants" in the cuisine column

Not interested in any business which is not considered a restaurant.
"""

restaurant_df = business_df[
    business_df['categories'].notna() &
    business_df['categories'].str.contains('Restaurants')
].copy()

restaurant_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '..."
8,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,0,"{'Caters': 'True', 'Alcohol': 'u'full_bar'', '...","Pubs, Restaurants, Italian, Bars, American (Tr...",
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.76817,1.5,10,1,"{'RestaurantsAttire': ''casual'', 'Restaurants...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '..."
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,1,"{'Alcohol': ''none'', 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."


In [7]:
"""
Step 6: Inner join both the business and reviews data sets on their business_id columns

Essential to relate each review with the cuisine type of the corresponding business.

"""

restaurant_reviews = pd.merge(
    review_df,
    restaurant_df[['business_id', 'name', 'categories']], #limit business information, as reviews are the primary concern
    on='business_id',
    how='inner'
)


restaurant_reviews.sort_values(by="business_id").head(20) #check that businesses have multiple reviews

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,name,categories
12425,smRZuMTLFML_GImDsPmQVw,PlsB30q5bR2raPMpuN1Wuw,--ZVrH2X2QXBFdCilbirsw,5.0,0,0,0,One of the best hoagies I've ever had. The pro...,2014-12-18 18:14:15,Chris's Sandwich Shop,"American (Traditional), Restaurants, Pizza, Sa..."
14017,xgAqKM3xkeTCtXC78gdnnQ,Gv9KfGk5j69oE9LugsLKiw,--ZVrH2X2QXBFdCilbirsw,5.0,0,1,2,Moving into our new house and I think the Ital...,2014-03-15 01:57:33,Chris's Sandwich Shop,"American (Traditional), Restaurants, Pizza, Sa..."
61229,kE-NdGwUA1zbqNp9MgDing,Yy8JcvtMoNajJJW7k-y4MA,--ZVrH2X2QXBFdCilbirsw,5.0,1,0,0,These are the best hoagies and pizza in Ardmor...,2017-07-31 16:56:21,Chris's Sandwich Shop,"American (Traditional), Restaurants, Pizza, Sa..."
90721,1OO2oZ33wxPFH66e6riY-Q,Tf-TSPR3nqA_7b7_4KqWnQ,--ZVrH2X2QXBFdCilbirsw,5.0,0,0,0,Great sandwichs. Order 7 different ones so far...,2017-02-23 18:27:28,Chris's Sandwich Shop,"American (Traditional), Restaurants, Pizza, Sa..."
4291,7snN_xdG7dtX4u1bdoNq0w,7iCjHZY74yCEF-Eajx5sIA,--ZVrH2X2QXBFdCilbirsw,5.0,0,0,0,This place is sadly perm closed. I was hoping ...,2018-02-24 00:53:41,Chris's Sandwich Shop,"American (Traditional), Restaurants, Pizza, Sa..."
140890,yFpYsKyNaj6Uha_9_B2dYQ,Cv7mEO8-Jfy6JZiMJmenSg,--ZVrH2X2QXBFdCilbirsw,5.0,0,0,0,Favorite Italian hoagie in the area. Other san...,2017-07-03 13:21:06,Chris's Sandwich Shop,"American (Traditional), Restaurants, Pizza, Sa..."
26348,HNMJqBFBGm6tFHxU70gOxQ,wuCvIRDk-m0ZpJNH07X-Sw,--ZVrH2X2QXBFdCilbirsw,5.0,0,0,0,The classic Italian hoagie is fantastic and a ...,2016-02-02 19:59:42,Chris's Sandwich Shop,"American (Traditional), Restaurants, Pizza, Sa..."
65724,GvktZ263lYMC0qq9q5vDjg,w9gQB_NFEJI_JR_l-RvIGQ,--ZVrH2X2QXBFdCilbirsw,5.0,1,0,0,The reviews for this place were very accurate....,2015-05-14 12:42:59,Chris's Sandwich Shop,"American (Traditional), Restaurants, Pizza, Sa..."
83411,JxF41Y9WHQJINIhffOlPGQ,KfDyITIWtA8h3dr_4FB3UQ,--ZVrH2X2QXBFdCilbirsw,5.0,1,0,0,Perfect hoagies and all around great deli. Don...,2015-01-09 21:57:01,Chris's Sandwich Shop,"American (Traditional), Restaurants, Pizza, Sa..."
132909,byiWcE7j1-k7sCfkifu10Q,eJWLU7yYDHQPNobP5C2FdQ,--ZVrH2X2QXBFdCilbirsw,5.0,1,1,0,There are places in life that just make you ha...,2015-08-11 04:08:39,Chris's Sandwich Shop,"American (Traditional), Restaurants, Pizza, Sa..."


In [10]:
"""
Step 7: Define the cuisine type keywords (i.e. FOOD_KEYWORDS) and extract the keywords from the categories column for each record

"""
#modify list to include desired keywords (keep as is to match results)
FOOD_KEYWORDS = [
    'American', 'Mexican', 'Italian', 'Chinese', 'Japanese', 'Thai',
    'Vietnamese', 'Korean', 'Mediterranean', 'Greek', 'French', 'Spanish',
    'Breakfast', 'Brunch', 'Pizza', 'Burgers', 'Seafood', 'BBQ', 'Sushi',
    'Ramen', 'Cafes', 'Coffee', 'Bakeries', 'Desserts', 'Fast Food',
    'Vegetarian', 'Vegan', 'Halal', 'Kosher'
]

def extract_cuisines(categories):
  """
  categories: list of strings containg the keywords
  """
  output = [] #initial empty list
  for keyword in FOOD_KEYWORDS: #for every keyword
    if keyword in categories: #check if that keyword is found in the categories column
      output.append(keyword) #if keyword found, append that keyword to the output list
  return output
restaurant_reviews['cuisine'] = restaurant_reviews['categories'].apply(extract_cuisines) #creates a column that stores a list of each keyword found in the categories column
restaurant_reviews

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,name,categories,cuisine
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,Turning Point of North Wales,"Restaurants, Breakfast & Brunch, Food, Juice B...","[American, Breakfast, Brunch, Coffee]"
1,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,Kettle Restaurant,"Restaurants, Breakfast & Brunch","[Breakfast, Brunch]"
2,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,Zaika,"Halal, Pakistani, Restaurants, Indian",[Halal]
3,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,Melt,"Sandwiches, Beer, Wine & Spirits, Bars, Food, ...",[American]
4,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,1,2,1,I am a long term frequent customer of this est...,2015-09-23 23:10:31,Dmitri's,"Mediterranean, Restaurants, Seafood, Greek","[Mediterranean, Greek, Seafood]"
...,...,...,...,...,...,...,...,...,...,...,...,...
144420,4ylitTY7GmonnQDqhjB-eg,_fMw99otXOCu-Kzc-loMZA,v72HalBu5fQIFTD-oqhSgQ,5.0,0,0,0,"Let me just say, I'm glad my husband and I dec...",2016-08-18 00:23:31,1200 Chophouse,"Steakhouses, American (New), Restaurants",[American]
144421,MnwNSr5h_t1uIlcj3aI_5w,ZfG1T_noQnbOdXnz4USs0Q,cvHY1RTAPSoH94ysP2_eyQ,5.0,1,1,1,I was coming back from the Farmers market toda...,2014-06-21 17:12:53,The Turnip Truck Urban Fare,"Health Markets, Restaurants, Grocery, Sandwich...",[]
144422,3fp0uXGAjIqgza02_ObF5w,I0MMq4iYjDjERnSjp76HDw,cBp1fn2LXpSHe4VbTXLYEw,5.0,0,0,1,Absolutely perfect meal for me. The food is cl...,2017-09-06 17:54:23,Bab Café - Reno,"Restaurants, Cafes, Korean, Gluten-Free, Comfo...","[Korean, Cafes, Vegetarian, Vegan]"
144423,cSlJEcygfh-57otb6dF3Lg,sokS5Ml2QyKjOwO56pS9Jw,oBhJuukGRqPVvYBfTkhuZA,5.0,0,0,0,Braised beef Short Rib! Asparagus on the side....,2015-06-16 01:51:18,Square 1682,"American (New), Breakfast & Brunch, Bars, Nigh...","[American, Breakfast, Brunch]"


In [9]:
"""
Step 8: Install language detection dependencies using pip
"""

!pip install langdetect googletrans-py

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting googletrans-py
  Downloading googletrans-py-4.0.0.tar.gz (12 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect, googletrans-py
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=677637a3d07c04388f290a87abf3c3f5cb544916807163ca14195235894d5063
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
  Building wheel for googletrans-py (setup.py) ... [?25l[?25hdone
  Created wheel for googletrans-py: filena

In [11]:
"""
Step 9: Define language detectin function and load relevant dependencies
"""

from googletrans import Translator
from langdetect import detect, LangDetectException

def translate_text_to_english(text):
    """
    text: any string input
    """
    if not isinstance(text, str): # if the input is not actually a stirng, return the input
        return text
    text = text.strip() #remove all spaces from the text
    if not text: #if empty, return the empty string
        return text

    try:
        detected_lang = detect(text) #try and detect a language
    except LangDetectException:
        # iff language detection fails, assume English or return original
        return text

    if detected_lang != 'en': #if the detected language is not English
        try:
            #try to instantiate a translator and translate the text to English
            translator = Translator()
            translated = translator.translate(text, dest='en')
            return translated.text
        except Exception as e:
            #if translation fails, print an error message and
            print(f"Translation error for text: {text[:50]}... Error: {e}")
            return text  # return original text on translation error
    else:
        #if the detected language is English, just return the original input
        return text

In [13]:
"""
Step 10: Use defined translation function to translate the text columns (i.e. the review text) for each record

"""

restaurant_reviews['translated_text'] = restaurant_reviews['text'].apply(translate_text_to_english)
restaurant_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,name,categories,cuisine,translated_text
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,Turning Point of North Wales,"Restaurants, Breakfast & Brunch, Food, Juice B...","[American, Breakfast, Brunch, Coffee]","If you decide to eat here, just be aware it is..."
1,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,Kettle Restaurant,"Restaurants, Breakfast & Brunch","[Breakfast, Brunch]",Family diner. Had the buffet. Eclectic assortm...
2,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,Zaika,"Halal, Pakistani, Restaurants, Indian",[Halal],"Wow! Yummy, different, delicious. Our favo..."
3,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,Melt,"Sandwiches, Beer, Wine & Spirits, Bars, Food, ...",[American],Cute interior and owner (?) gave us tour of up...
4,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,1,2,1,I am a long term frequent customer of this est...,2015-09-23 23:10:31,Dmitri's,"Mediterranean, Restaurants, Seafood, Greek","[Mediterranean, Greek, Seafood]",I am a long term frequent customer of this est...


In [14]:
"""
Step 11: Install vaderSentiment dependency
"""

!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [15]:
"""
Step 12: load dependency and apply sentiment analyzer to each record's English review text
"""

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer() #instantiate vaderSentiment analyzer

#apply annalyzer to the English review text for each record
restaurant_reviews['sentiment'] = restaurant_reviews['translated_text'].apply(
    lambda x: analyzer.polarity_scores(x)['compound']
)

In [16]:
"""
Step 13: scale star rating to better match sentiment score output and create columns representing 1. the difference between the sentiment score and the scaled star rating and 2. the absolute value of that difference
"""

restaurant_reviews['stars_norm'] = (restaurant_reviews['stars'] - 1) / 4 #scale stars column

#initial diffrence between sentiment score and scaled stars
restaurant_reviews['mismatch'] = np.abs(
    restaurant_reviews['sentiment'] - restaurant_reviews['stars_norm']
)

#absolute value of that difference
restaurant_reviews['direction'] = (
    restaurant_reviews['sentiment'] - restaurant_reviews['stars_norm']
)

In [17]:
"""
Step 14: drop records without an assigned cuisine type and explode data based on cuisine column
"""

#explode the translated_restaurant_reviews table based on the cuisine column
exploded_restaurant_reviews = restaurant_reviews.explode('cuisine')

#drop any row where a cuisine is not specified
exploded_restaurant_reviews = exploded_restaurant_reviews[exploded_restaurant_reviews['cuisine'].str.len() > 0]

#display results
exploded_restaurant_reviews

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,name,categories,cuisine,translated_text,sentiment,stars_norm,mismatch,direction
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,Turning Point of North Wales,"Restaurants, Breakfast & Brunch, Food, Juice B...",American,"If you decide to eat here, just be aware it is...",0.8597,0.5,0.3597,0.3597
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,Turning Point of North Wales,"Restaurants, Breakfast & Brunch, Food, Juice B...",Breakfast,"If you decide to eat here, just be aware it is...",0.8597,0.5,0.3597,0.3597
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,Turning Point of North Wales,"Restaurants, Breakfast & Brunch, Food, Juice B...",Brunch,"If you decide to eat here, just be aware it is...",0.8597,0.5,0.3597,0.3597
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,Turning Point of North Wales,"Restaurants, Breakfast & Brunch, Food, Juice B...",Coffee,"If you decide to eat here, just be aware it is...",0.8597,0.5,0.3597,0.3597
1,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,Kettle Restaurant,"Restaurants, Breakfast & Brunch",Breakfast,Family diner. Had the buffet. Eclectic assortm...,0.9201,0.5,0.4201,0.4201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144423,cSlJEcygfh-57otb6dF3Lg,sokS5Ml2QyKjOwO56pS9Jw,oBhJuukGRqPVvYBfTkhuZA,5.0,0,0,0,Braised beef Short Rib! Asparagus on the side....,2015-06-16 01:51:18,Square 1682,"American (New), Breakfast & Brunch, Bars, Nigh...",American,Braised beef Short Rib! Asparagus on the side....,0.9811,1.0,0.0189,-0.0189
144423,cSlJEcygfh-57otb6dF3Lg,sokS5Ml2QyKjOwO56pS9Jw,oBhJuukGRqPVvYBfTkhuZA,5.0,0,0,0,Braised beef Short Rib! Asparagus on the side....,2015-06-16 01:51:18,Square 1682,"American (New), Breakfast & Brunch, Bars, Nigh...",Breakfast,Braised beef Short Rib! Asparagus on the side....,0.9811,1.0,0.0189,-0.0189
144423,cSlJEcygfh-57otb6dF3Lg,sokS5Ml2QyKjOwO56pS9Jw,oBhJuukGRqPVvYBfTkhuZA,5.0,0,0,0,Braised beef Short Rib! Asparagus on the side....,2015-06-16 01:51:18,Square 1682,"American (New), Breakfast & Brunch, Bars, Nigh...",Brunch,Braised beef Short Rib! Asparagus on the side....,0.9811,1.0,0.0189,-0.0189
144424,1IEzEH_J7TH21fKFdfxlNQ,345cM-IMsRDwutYG-AkJNw,LHSTtnW3YHCeUkRDGyJOyw,5.0,6,0,3,"Great craft beers, great fresh food. Small me...",2015-04-12 00:20:02,Fries Rebellion,"Beer Bar, Bars, American (New), Gastropubs, Re...",American,"Great craft beers, great fresh food. Small me...",0.8718,1.0,0.1282,-0.1282


In [25]:

"""
Step 15: install necessary statistics dependencies
"""

#Install necessary statistics package
!pip install scipy



In [50]:
"""
Step 16: create functions to run necessary statistics; use functions to produce p-value for exploded data set
"""

#Run ANOVA using cuisine type on exploded data set
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

def anova(data, iv, dv, print_output=False):
  """
  data: source data frame
  iv: name of column which contains category labels
  dv: name of column which contains corresponding values
  print: whether to print the output as well
  """
  groups = [group[dv].values for name, group in data.groupby(iv)]
  f_stat, p_value = stats.f_oneway(*groups)
  if print_output:
    print("F-statistic:", f_stat)
    print("p-value:", p_value)
  return f_stat, p_value

#see anova function for parameters
def tukey_hsd(data, iv, dv, alpha=0.05):
    tukey = pairwise_tukeyhsd(
        endog=data[dv],
        groups=data[iv],
        alpha=alpha
    )
    print(tukey.summary())
    return tukey


exploded_anova_results = anova(exploded_restaurant_reviews,"cuisine","mismatch",True)
exploded_tukey_results = tukey_hsd(exploded_restaurant_reviews,"cuisine","mismatch")


F-statistic: 79.7291291843239
p-value: 0.0
       Multiple Comparison of Means - Tukey HSD, FWER=0.05        
    group1        group2    meandiff p-adj   lower   upper  reject
------------------------------------------------------------------
     American      Bakeries  -0.0261    0.0 -0.0407 -0.0115   True
     American     Breakfast  -0.0188    0.0 -0.0265 -0.0111   True
     American        Brunch  -0.0188    0.0 -0.0265 -0.0111   True
     American       Burgers   0.0385    0.0  0.0286  0.0483   True
     American         Cafes  -0.0434    0.0 -0.0543 -0.0324   True
     American       Chinese   0.0276    0.0  0.0135  0.0417   True
     American        Coffee   -0.021    0.0  -0.031  -0.011   True
     American      Desserts   -0.004    1.0   -0.02  0.0119  False
     American     Fast Food    0.098    0.0  0.0834  0.1125   True
     American        French  -0.0387    0.0 -0.0567 -0.0208   True
     American         Greek  -0.0387    0.0 -0.0589 -0.0186   True
     American      

In [35]:
"""
Step 17: create and use functions to randomize the cuisine tag assignment instead of exploding the data through many iterations and take the average p-value
"""

import random

def random_anova(data, iv, dv):
  """
  data: source data frame
  iv: name of column which contains category labels
  dv: name of column which contains corresponding values
  """

  #same as anova function in step 16, but with random preprocessing of inputted data
  working = data[data[iv].str.len()>0].copy()
  working['random_iv'] = working[iv].apply(lambda x: random.choice(x) if isinstance(x,list) and x else None) #applies a random selection if the record's value at that column is present and is a list
  f_stat, p_value = anova(working,'random_iv',dv)
  return p_value

def bootstrap_anova(data,iv,dv,boots):
  """
  data: source data frame
  iv: name of column which contains category labels
  dv: name of column which contains corresponding values
  boots: how many simulations to run
  """
  p_values = []
  for n in range(boots):
    p_values.append(random_anova(data,iv,dv)) #append each simulation's p value to an output list
  print("Mean p-value: ", np.mean(p_values))

bootstrap_anova(restaurant_reviews,"cuisine","mismatch",100)


Mean p-value:  8.19750456922396e-151


In [46]:
"""
Step 19: find bottom five and top five average mismatches and direction
"""
bottom_five = pd.DataFrame(exploded_restaurant_reviews.groupby('cuisine')[['mismatch','direction']].mean().sort_values(by='mismatch')[0:5]).reset_index()
bottom_five.columns = ["cuisine", "mismatch","direction"]
top_five = pd.DataFrame(exploded_restaurant_reviews.groupby('cuisine')[['mismatch','direction']].mean().sort_values(by='mismatch',ascending=False)[0:5]).reset_index()
top_five.columns = ["cuisine", "mismatch","direction"]


(      cuisine  mismatch  direction
 0       Halal  0.198951  -0.033493
 1       Vegan  0.215999  -0.018499
 2       Cafes  0.224389  -0.010254
 3  Vegetarian  0.228424  -0.006202
 4      French  0.229000   0.039297,
      cuisine  mismatch  direction
 0  Fast Food  0.365726  -0.106912
 1    Burgers  0.306236  -0.014271
 2    Chinese  0.295392  -0.021936
 3      Pizza  0.283659  -0.014742
 4    Italian  0.278458  -0.001126)

In [49]:
"""
Step 20: after filtering on ethnic foods, find bottom five and top five average mismatches
"""

#choose ethnic keywords for further analysis
ETHNIC_CUISINE_KEYWORDS = ['American', 'Mexican', 'Italian', 'Chinese', 'Japanese', 'Thai',
    'Vietnamese', 'Korean', 'Mediterranean', 'Greek', 'French', 'Spanish', 'Halal', 'Kosher']

#repeat top five and bottom five
bottom_five_ethnic = pd.DataFrame(exploded_restaurant_reviews[exploded_restaurant_reviews['cuisine'].isin(ETHNIC_CUISINE_KEYWORDS)].groupby('cuisine')[['mismatch','direction']].mean().sort_values(by='mismatch')[0:5]).reset_index()
bottom_five_ethnic.columns = ["cuisine", "mismatch","direction"]
top_five_ethnic = pd.DataFrame(exploded_restaurant_reviews[exploded_restaurant_reviews['cuisine'].isin(ETHNIC_CUISINE_KEYWORDS)].groupby('cuisine')[['mismatch','direction']].mean().sort_values(by='mismatch',ascending=False)[0:5]).reset_index()
top_five_ethnic.columns = ["cuisine", "mismatch","direction"]



In [52]:
"""
Step 21: write to output files
"""
exploded_restaurant_reviews.to_csv("final_dataset.csv",index=False)
bottom_five.to_csv("bottom_five_cuisines_by_mismatch.csv",index=False)
bottom_five_ethnic.to_csv("bottom_five_ethnic_cuisines_by_mismatch.csv",index=False)
top_five.to_csv("top_five_cuisines_by_mismatch.csv",index=False)
top_five_ethnic.to_csv("top_five_ethnic_cuisines_by_mismatch.csv",index=False)



In [53]:
#alternative compressed file
exploded_restaurant_reviews.to_csv("final_dataset.csv",index=False,compression='gzip')