Go to this URL to download the [Yelp dataset](https://www.yelp.com/dataset/download)

Note: you will need to add some personal information to get to the download button.  Click the Download JSON button.

Open the .tar file and save the following files to the data folder: <br>- business.json <br>- review.json

In [1]:
import pandas as pd
import json
from datetime import datetime
import re
import multiprocessing as mp
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import numpy as np

In [44]:
# May not need these
# import seaborn as sns
# import requests as re
# import string, itertools
# from pandas import json_normalize
# from collections import Counter, defaultdict
# import nltk
# from nltk.text import Text
# from nltk.probability import FreqDist
# from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
# from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer, WordNetLemmatizer
# from gensim.corpora.dictionary import Dictionary
# from gensim.models.tfidfmodel import TfidfModel
# from sklearn.cluster import KMeans
# from wordcloud import WordCloud

In [2]:
df_b = pd.read_json("../data/yelp_academic_dataset_business.json", lines=True)

In [62]:
df_b.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [63]:
df_b.describe

<bound method NDFrame.describe of                    business_id                      name  \
0       Pns2l4eNsfO8kk83dixA6A  Abby Rappoport, LAC, CMQ   
1       mpf3x-BjTdTEA3yCZrAYPw             The UPS Store   
2       tUFrWirKiKi_TAnsVWINQQ                    Target   
3       MTSW4McQd7CbVtyjqoe9mw        St Honore Pastries   
4       mWMc6_wTdE0EUBKIGXDVfA  Perkiomen Valley Brewery   
...                        ...                       ...   
150341  IUQopTMmYQG-qRtBk-8QnA              Binh's Nails   
150342  c8GjPIOTGVmIemT7j5_SyQ      Wild Birds Unlimited   
150343  _QAMST-NrQobXduilWEqSw         Claire's Boutique   
150344  mtGm22y5c2UHNXDFAjaPNw  Cyclery & Fitness Center   
150345  jV_XOycEzSlTx-65W906pg                   Sic Ink   

                                address           city state postal_code  \
0                1616 Chapala St, Ste 2  Santa Barbara    CA       93101   
1       87 Grasso Plaza Shopping Center         Affton    MO       63123   
2                

Drop irrelevant columns and filter businesses to those with a category of 'Restaurants' or 'Food'

In [64]:
drop_columns = ['hours','review_count']
df_b = df_b.drop(drop_columns, axis=1)

In [11]:
pd.DataFrame(df_b['categories'].unique()).to_csv("../data/business_categories.csv")

In [65]:
df_food_bus = df_b[df_b['categories'].str.contains(
              'Restaurants|Food',
              case=False, na=False)]

In [66]:
df_b.shape

(150346, 12)

In [67]:
df_food_bus.shape

(64616, 12)

64,616 food and restaurant businesses

In [68]:
review_json_path = "../data/yelp_academic_dataset_review.json"
size = 500000
reviews = pd.read_json(review_json_path, lines=True,
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,
                             'funny':int,'cool':int},
                      chunksize=size)

In [69]:
chunk_list = []
for chunk_review in reviews:
    # Drop columns that aren't needed
    chunk_review = chunk_review.drop(['review_id','funny','cool'], axis=1)
    # Renaming column name to avoid conflict with business overall star rating
    chunk_review = chunk_review.rename(columns={'stars': 'review_stars'})
    # Inner merge with edited business file so only reviews related to the business remain
    chunk_merged = pd.merge(df_food_bus, chunk_review, on='business_id', how='inner')
    # Show feedback on progress
    print(f"{chunk_merged.shape[0]} out of {size:,} related reviews")
    chunk_list.append(chunk_merged)
# After trimming down the review file, concatenate all relevant data back to one dataframe
df_reviews = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

377672 out of 500,000 related reviews
371259 out of 500,000 related reviews
358231 out of 500,000 related reviews
372278 out of 500,000 related reviews
371062 out of 500,000 related reviews
354838 out of 500,000 related reviews
351471 out of 500,000 related reviews
375705 out of 500,000 related reviews
372789 out of 500,000 related reviews
358707 out of 500,000 related reviews
369177 out of 500,000 related reviews
377414 out of 500,000 related reviews
364504 out of 500,000 related reviews
351033 out of 500,000 related reviews


In [70]:
df_reviews.shape

(5126140, 17)

In [71]:
df_reviews.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,is_open,attributes,categories,user_id,review_stars,useful,text,date
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",6_SpY41LIHZuIaiDs5FMKA,4,0,This is nice little Chinese bakery in the hear...,2014-05-26 01:09:53
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",tCXElwhzekJEH6QJe3xs7Q,4,3,This is the bakery I usually go to in Chinatow...,2013-10-05 15:19:06
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",WqfKtI-aGMmvbA9pPUxNQQ,5,0,"A delightful find in Chinatown! Very clean, an...",2013-10-25 01:34:57
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",3-1va0IQfK-9tUMzfHWfTA,5,5,I ordered a graduation cake for my niece and i...,2018-05-20 17:58:57
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",EouCKoDfzaVG0klEgdDvCQ,4,2,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,2013-10-25 02:31:35


In [72]:
df_reviews = df_reviews.drop('attributes', axis=1)

In [73]:
df_reviews["state"].unique()

array(['PA', 'TN', 'MO', 'FL', 'IN', 'AB', 'NV', 'IL', 'AZ', 'LA', 'NJ',
       'CA', 'DE', 'ID', 'NC', 'CO', 'HI', 'MT', 'XMS'], dtype=object)

In [74]:
## filter restaurants of US
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
df_reviews_usa=df_reviews.loc[df_reviews['state'].isin(states)]
df_reviews_usa.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,is_open,categories,user_id,review_stars,useful,text,date
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",6_SpY41LIHZuIaiDs5FMKA,4,0,This is nice little Chinese bakery in the hear...,2014-05-26 01:09:53
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",tCXElwhzekJEH6QJe3xs7Q,4,3,This is the bakery I usually go to in Chinatow...,2013-10-05 15:19:06
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",WqfKtI-aGMmvbA9pPUxNQQ,5,0,"A delightful find in Chinatown! Very clean, an...",2013-10-25 01:34:57
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",3-1va0IQfK-9tUMzfHWfTA,5,5,I ordered a graduation cake for my niece and i...,2018-05-20 17:58:57
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",EouCKoDfzaVG0klEgdDvCQ,4,2,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,2013-10-25 02:31:35


In [75]:
df_reviews_usa["state"].unique()

array(['PA', 'TN', 'MO', 'FL', 'IN', 'NV', 'IL', 'AZ', 'LA', 'NJ', 'CA',
       'DE', 'ID', 'NC', 'CO', 'HI', 'MT'], dtype=object)

In [76]:
df_reviews_usa.dtypes

business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars           float64
is_open           int64
categories       object
user_id          object
review_stars      int64
useful            int64
text             object
date             object
dtype: object

In [77]:
df_reviews_usa = df_reviews_usa.astype({"stars": int})

In [78]:
df_reviews_usa.dtypes

business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars             int64
is_open           int64
categories       object
user_id          object
review_stars      int64
useful            int64
text             object
date             object
dtype: object

In [79]:
df_reviews_usa.date = pd.to_datetime(df_reviews_usa.date)

In [80]:
df_reviews_usa.dtypes

business_id             object
name                    object
address                 object
city                    object
state                   object
postal_code             object
latitude               float64
longitude              float64
stars                    int64
is_open                  int64
categories              object
user_id                 object
review_stars             int64
useful                   int64
text                    object
date            datetime64[ns]
dtype: object

In [81]:
df_reviews_usa = df_reviews_usa.rename(columns={'stars':'avg_stars'})

In [82]:
df_reviews_usa.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'avg_stars', 'is_open', 'categories',
       'user_id', 'review_stars', 'useful', 'text', 'date'],
      dtype='object')

In [83]:
df_reviews_usa.date.min()

Timestamp('2005-02-16 03:23:22')

In [84]:
df_reviews_usa.date.max()

Timestamp('2022-01-19 19:48:25')

In [85]:
df_reviews_usa['sentiment'] = ''

In [53]:
# label reviews as positive or negative
df_reviews_usa.loc[df_reviews_usa.review_stars >=4, 'sentiment'] = 'positive'
df_reviews_usa.loc[df_reviews_usa.review_stars ==3, 'sentiment'] = 'neural'
df_reviews_usa.loc[df_reviews_usa.review_stars <3, 'sentiment'] = 'negative'

In [86]:
df_reviews_usa.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,avg_stars,is_open,categories,user_id,review_stars,useful,text,date,sentiment
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",6_SpY41LIHZuIaiDs5FMKA,4,0,This is nice little Chinese bakery in the hear...,2014-05-26 01:09:53,
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",tCXElwhzekJEH6QJe3xs7Q,4,3,This is the bakery I usually go to in Chinatow...,2013-10-05 15:19:06,
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",WqfKtI-aGMmvbA9pPUxNQQ,5,0,"A delightful find in Chinatown! Very clean, an...",2013-10-25 01:34:57,
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",3-1va0IQfK-9tUMzfHWfTA,5,5,I ordered a graduation cake for my niece and i...,2018-05-20 17:58:57,
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",EouCKoDfzaVG0klEgdDvCQ,4,2,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,2013-10-25 02:31:35,


In [87]:
df_reviews_usa['text_cleansed'] = ''

In [88]:
## converts text field to lowercase and saves in a new text_cleansed column
df_reviews_usa.loc[:, 'text_cleansed'] = df_reviews_usa.text.str.lower()

In [89]:
## Removes unnecessary punctuation or adds spaces in place of punctuation
df_reviews_usa.loc[:, 'text_cleansed']= df_reviews_usa.text_cleansed.str.replace('(\\n)|[!.,?-]', ' ', regex=True) \
                                                             .str.replace("[-|':$\\~\"#%&\(\)*+/:;<=>?@\[\]^_`{}]",'', regex=True)

In [90]:
df_reviews_usa.head(20)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,avg_stars,is_open,categories,user_id,review_stars,useful,text,date,sentiment,text_cleansed
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",6_SpY41LIHZuIaiDs5FMKA,4,0,This is nice little Chinese bakery in the hear...,2014-05-26 01:09:53,,this is nice little chinese bakery in the hear...
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",tCXElwhzekJEH6QJe3xs7Q,4,3,This is the bakery I usually go to in Chinatow...,2013-10-05 15:19:06,,this is the bakery i usually go to in chinatow...
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",WqfKtI-aGMmvbA9pPUxNQQ,5,0,"A delightful find in Chinatown! Very clean, an...",2013-10-25 01:34:57,,a delightful find in chinatown very clean an...
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",3-1va0IQfK-9tUMzfHWfTA,5,5,I ordered a graduation cake for my niece and i...,2018-05-20 17:58:57,,i ordered a graduation cake for my niece and i...
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",EouCKoDfzaVG0klEgdDvCQ,4,2,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,2013-10-25 02:31:35,,hk style milk tea four stars not quite sure ...
5,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",KQSRUu4Aapl0hG6eu2v8iw,4,1,This is my favorite bakery in Chinatown! I usu...,2018-03-17 23:47:25,,this is my favorite bakery in chinatown i usu...
6,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",X_DkwPTzdO_VWzUcbUXREg,3,0,so I dropped in to this store since it was the...,2012-08-16 19:42:17,,so i dropped in to this store since it was the...
7,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",qUfRCH5NUyRDsJfM6jA5PQ,4,3,Impression: The good thing is that there are l...,2017-04-20 12:37:09,,impression the good thing is that there are li...
8,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",gNJNxucGoZ31nlH74EQpPg,4,0,Best egg tarts? \n\nI was lucky to get them fr...,2014-02-25 14:41:08,,best egg tarts i was lucky to get them fres...
9,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",NMOxipsnXc6olWdHYzXiYA,4,0,Little square cakes of happiness with cute bun...,2012-01-10 02:09:18,,little square cakes of happiness with cute bun...


In [91]:
## import veg_words file
veg_words = pd.read_csv('../data/veg_words.csv')

In [92]:
veg_words.head()

Unnamed: 0,word
0,vegetarian
1,vegan
2,plant based
3,plant based
4,plantbased


In [93]:
veg_word_list = veg_words["word"].tolist()

In [100]:
def words_matched(text):
    words = []
    word_list = veg_word_list
    for word in word_list:
        regexp = rf"\b{word}\b"
        if re.search(regexp, text):
            words.append(word)
    return words

In [62]:
def count_words_matched(text):
    cnt = 0
    for word in veg_word_list:
        regexp = rf"\b{word}\b"
        if re.search(regexp, text):
            cnt = cnt + 1
    return cnt

In [64]:
df_reviews_usa['num_words_matched'] = 0

In [95]:
df_sample = df_reviews_usa.head(100000)

In [96]:
df_sample.shape

(100000, 18)

In [97]:
from pandarallel import pandarallel
import numpy as np

In [118]:
pandarallel.initialize()
s_words_matched = df_sample['text_cleansed'].parallel_apply(words_matched)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [157]:
s_words_matched

0         [bean]
1         [bean]
2             []
3             []
4             []
           ...  
101608        []
101609        []
101610        []
101611        []
101612        []
Name: text_cleansed, Length: 100000, dtype: object

In [138]:
s_num_matches = list(map(len, s_words_matched))

In [147]:
print(s_num_matches[slice(100)])

[1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [69]:
# old
# %%time
# df_sample.loc[:,'num_words_matched'] = df_sample.apply(lambda row : count_words_matched(row['text_cleansed']), axis = 1)

CPU times: user 26.1 s, sys: 534 ms, total: 26.6 s
Wall time: 27.5 s


In [143]:
df_sample.loc[:,'num_words_matched'] = s_num_matches

In [151]:
def join_words(arr):
    return ' '.join(arr)

In [152]:
s_words_matched_joined = list(map(join_words, s_words_matched))

In [154]:
print(s_words_matched_joined[slice(1000)])

['bean', 'bean', '', '', '', 'bean', '', '', '', '', '', 'soy', '', '', '', '', '', '', '', '', '', '', '', '', 'bean', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'salad', 'salad', '', '', '', '', '', 'salad', '', 'salads', '', '', '', 'salads', 'salad', 'salad', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'salads', '', '', '', '', '', '', 'salads', '', '', '', 'salad', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'salad', '', 'veggies veggies', '', '', '', 'salads', '', '', '', 'salad', '', '', '', '', '', '', '', 'salad', '', '', '', '', 'salads', 'salad', '', '', '', '', '', '', '', '', '', '', '', 'salad', '', '', 'salad', 'veg', '', '', '', '', '', 'salad salads', '', '', '', '', '', '', '', '', '', '', '', '', 'salad', '', '', '', '', '', '

In [155]:
df_sample.loc[:,'text_matched'] = s_words_matched_joined

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample.loc[:,'text_matched'] = s_words_matched_joined


In [156]:
df_sample.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,avg_stars,is_open,categories,user_id,review_stars,useful,text,date,sentiment,text_cleansed,num_words_matched,text_matched
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",6_SpY41LIHZuIaiDs5FMKA,4,0,This is nice little Chinese bakery in the hear...,2014-05-26 01:09:53,,this is nice little chinese bakery in the hear...,1,bean
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",tCXElwhzekJEH6QJe3xs7Q,4,3,This is the bakery I usually go to in Chinatow...,2013-10-05 15:19:06,,this is the bakery i usually go to in chinatow...,1,bean
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",WqfKtI-aGMmvbA9pPUxNQQ,5,0,"A delightful find in Chinatown! Very clean, an...",2013-10-25 01:34:57,,a delightful find in chinatown very clean an...,0,
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",3-1va0IQfK-9tUMzfHWfTA,5,5,I ordered a graduation cake for my niece and i...,2018-05-20 17:58:57,,i ordered a graduation cake for my niece and i...,0,
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",EouCKoDfzaVG0klEgdDvCQ,4,2,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,2013-10-25 02:31:35,,hk style milk tea four stars not quite sure ...,0,


In [70]:
len(df_sample[df_sample["num_words_matched"]>0])

16003

In [77]:
# This works but takes a long time, would be good to do this in batches instead
df_reviews_usa.loc[:, 'num_words_matched'] = df_reviews_usa.apply(lambda row : count_words_matched(row['text_cleansed']), axis = 1)

In [78]:
df_reviews_usa.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,avg_stars,is_open,categories,user_id,review_stars,useful,text,date,sentiment,text_cleansed,num_words_matched
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",6_SpY41LIHZuIaiDs5FMKA,4,0,This is nice little Chinese bakery in the hear...,2014-05-26 01:09:53,positive,this is nice little chinese bakery in the hear...,1
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",tCXElwhzekJEH6QJe3xs7Q,4,3,This is the bakery I usually go to in Chinatow...,2013-10-05 15:19:06,positive,this is the bakery i usually go to in chinatow...,1
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",WqfKtI-aGMmvbA9pPUxNQQ,5,0,"A delightful find in Chinatown! Very clean, an...",2013-10-25 01:34:57,positive,a delightful find in chinatown very clean an...,0
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",3-1va0IQfK-9tUMzfHWfTA,5,5,I ordered a graduation cake for my niece and i...,2018-05-20 17:58:57,positive,i ordered a graduation cake for my niece and i...,0
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",EouCKoDfzaVG0klEgdDvCQ,4,2,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,2013-10-25 02:31:35,positive,hk style milk tea four stars not quite sure ...,0


In [103]:
def process_batch(df, chunksize):
    first_num = 0
    sec_num = chunksize
    while sec_num <= len(df):
        df.loc[first_num:sec_num, 'num_words_matched_new'] = df.apply(lambda row : count_words_matched(row['text_cleansed']), axis = 1)
        first_num = first_num + chunksize
        sec_num = sec_num + chunksize
        print(f"{first_num} to {sec_num} completed")
    df.loc[sec_num:, 'num_words_matched_new'] = df.apply(lambda row : count_words_matched(row['text_cleansed']), axis = 1)
    print(f"{sec_num} to the end completed")

In [109]:
df_reviews_usa['num_words_matched_new'] = 0

In [106]:
process_batch(df_reviews_usa, 100000)

KeyboardInterrupt: 

In [80]:
df_reviews_usa.loc[0].text

"This is nice little Chinese bakery in the heart of Philadelphia's Chinatown! The female cashier was very friendly (flirtatious!) and the pastries shown in nicely adorned display cases. I stopped by early one evening had a sesame ball, which was filled with bean paste. The glutinous rice of the ball was nicely flavored, similar to Bai Tang Gao. Definitely as place worth stopping at if you are in the area."

In [81]:
df_reviews_usa.to_parquet("../data/reviews2.parquet")

In [4]:
df = pd.read_parquet("../data/reviews.parquet")

In [5]:
df.head(50)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,avg_stars,is_open,attributes,categories,user_id,review_stars,useful,text,date,sentiment,text_cleansed
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",6_SpY41LIHZuIaiDs5FMKA,4,0,This is nice little Chinese bakery in the hear...,2014-05-26 01:09:53,positive,this is nice little chinese bakery in the hear...
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",tCXElwhzekJEH6QJe3xs7Q,4,3,This is the bakery I usually go to in Chinatow...,2013-10-05 15:19:06,positive,this is the bakery i usually go to in chinatow...
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",WqfKtI-aGMmvbA9pPUxNQQ,5,0,"A delightful find in Chinatown! Very clean, an...",2013-10-25 01:34:57,positive,a delightful find in chinatown very clean an...
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",3-1va0IQfK-9tUMzfHWfTA,5,5,I ordered a graduation cake for my niece and i...,2018-05-20 17:58:57,positive,i ordered a graduation cake for my niece and i...
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",EouCKoDfzaVG0klEgdDvCQ,4,2,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,2013-10-25 02:31:35,positive,hk style milk tea four stars not quite sure ...
5,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",KQSRUu4Aapl0hG6eu2v8iw,4,1,This is my favorite bakery in Chinatown! I usu...,2018-03-17 23:47:25,positive,this is my favorite bakery in chinatown i usu...
6,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",X_DkwPTzdO_VWzUcbUXREg,3,0,so I dropped in to this store since it was the...,2012-08-16 19:42:17,neural,so i dropped in to this store since it was the...
7,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",qUfRCH5NUyRDsJfM6jA5PQ,4,3,Impression: The good thing is that there are l...,2017-04-20 12:37:09,positive,impression the good thing is that there are li...
8,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",gNJNxucGoZ31nlH74EQpPg,4,0,Best egg tarts? \n\nI was lucky to get them fr...,2014-02-25 14:41:08,positive,best egg tarts i was lucky to get them fres...
9,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",NMOxipsnXc6olWdHYzXiYA,4,0,Little square cakes of happiness with cute bun...,2012-01-10 02:09:18,positive,little square cakes of happiness with cute bun...


Output df to json

In [3]:
df_yelp = pd.read_parquet('../data/reviews.parquet')

In [4]:
df_yelp.to_json('../data/reviews.json', orient = 'split', compression = 'infer')

In [5]:
df_json = pd.read_json('../data/reviews.json')

ValueError: All arrays must be of the same length

In [None]:
df_json.head()

In [6]:
df_r = pd.read_parquet('../data/reviews.parquet')

In [7]:
df_r.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,avg_stars,is_open,categories,review_id,user_id,review_stars,useful,text,date,sentiment,text_cleansed
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",BXQcBN0iAi1lAUxibGLFzA,6_SpY41LIHZuIaiDs5FMKA,4,0,This is nice little Chinese bakery in the hear...,2014-05-26 01:09:53,positive,this is nice little chinese bakery in the hear...
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",uduvUCvi9w3T2bSGivCfXg,tCXElwhzekJEH6QJe3xs7Q,4,3,This is the bakery I usually go to in Chinatow...,2013-10-05 15:19:06,positive,this is the bakery i usually go to in chinatow...
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",a0vwPOqDXXZuJkbBW2356g,WqfKtI-aGMmvbA9pPUxNQQ,5,0,"A delightful find in Chinatown! Very clean, an...",2013-10-25 01:34:57,positive,a delightful find in chinatown very clean an...
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",MKNp_CdR2k2202-c8GN5Dw,3-1va0IQfK-9tUMzfHWfTA,5,5,I ordered a graduation cake for my niece and i...,2018-05-20 17:58:57,positive,i ordered a graduation cake for my niece and i...
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",D1GisLDPe84Rrk_R4X2brQ,EouCKoDfzaVG0klEgdDvCQ,4,2,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,2013-10-25 02:31:35,positive,hk style milk tea four stars not quite sure ...


In [5]:
df_r = pd.read_parquet("../data/reviews.parquet")

In [16]:
df_r.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'avg_stars', 'is_open', 'categories',
       'review_id', 'user_id', 'review_stars', 'useful', 'text', 'date',
       'sentiment', 'text_cleansed', 'num_words_matched'],
      dtype='object')

In [18]:
len(df_r)

5049085

In [24]:
mask = (df_r['date'] > '2005-01-01') & (df_r['date'] <= '2014-12-31')
len(df_r.loc[mask])

1256072

In [33]:
mask = (df_r['date'] > '2014-12-31') & (df_r['date'] <= '2016-12-31')
len(df_r.loc[mask])

1041694

In [35]:
mask = (df_r['date'] > '2016-12-31') & (df_r['date'] <= '2018-12-31')
len(df_r.loc[mask])

1238554

In [36]:
mask = (df_r['date'] > '2018-12-31')
len(df_r.loc[mask])

1512765

In [37]:
df_r.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,avg_stars,is_open,categories,review_id,user_id,review_stars,useful,text,date,sentiment,text_cleansed,num_words_matched
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",BXQcBN0iAi1lAUxibGLFzA,6_SpY41LIHZuIaiDs5FMKA,4,0,This is nice little Chinese bakery in the hear...,2014-05-26 01:09:53,positive,this is nice little chinese bakery in the hear...,1
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",uduvUCvi9w3T2bSGivCfXg,tCXElwhzekJEH6QJe3xs7Q,4,3,This is the bakery I usually go to in Chinatow...,2013-10-05 15:19:06,positive,this is the bakery i usually go to in chinatow...,1
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",a0vwPOqDXXZuJkbBW2356g,WqfKtI-aGMmvbA9pPUxNQQ,5,0,"A delightful find in Chinatown! Very clean, an...",2013-10-25 01:34:57,positive,a delightful find in chinatown very clean an...,0
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",MKNp_CdR2k2202-c8GN5Dw,3-1va0IQfK-9tUMzfHWfTA,5,5,I ordered a graduation cake for my niece and i...,2018-05-20 17:58:57,positive,i ordered a graduation cake for my niece and i...,0
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",D1GisLDPe84Rrk_R4X2brQ,EouCKoDfzaVG0klEgdDvCQ,4,2,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,2013-10-25 02:31:35,positive,hk style milk tea four stars not quite sure ...,0


In [39]:
mask1 = (df_r['date'] > '2005-01-01') & (df_r['date'] <= '2014-12-31')
df_r.loc[mask1].to_parquet("../data/reviews/reviews1.parquet", compression="gzip")

In [40]:
mask2 = (df_r['date'] > '2014-12-31') & (df_r['date'] <= '2016-12-31')
df_r.loc[mask2].to_parquet("../data/reviews/reviews2.parquet", compression="gzip")

In [41]:
mask3 = (df_r['date'] > '2016-12-31') & (df_r['date'] <= '2018-12-31')
df_r.loc[mask3].to_parquet("../data/reviews/reviews3.parquet", compression="gzip")

In [42]:
mask4 = (df_r['date'] > '2018-12-31')
df_r.loc[mask4].to_parquet("../data/reviews/reviews4.parquet", compression="gzip")

In [43]:
mask = (df_r['date'] > '2019-12-31')
len(df_r.loc[mask])

860103

In [44]:
df_r['state'].unique()

array(['PA', 'TN', 'MO', 'FL', 'IN', 'NV', 'IL', 'AZ', 'LA', 'NJ', 'CA',
       'DE', 'ID', 'NC', 'CO', 'HI', 'MT'], dtype=object)

In [45]:
len(df_r)

5049085

In [46]:
len(df_r) / 4

1262271.25

In [57]:
factor = np.ceil(len(df_r) / 4).astype(int)

In [30]:
factor

1262272

In [45]:
df_list = []

In [46]:
df_list.append(df_r.iloc[(0 * factor):(1 * factor)])

In [47]:
df_list.append(df_r.iloc[(1 * factor):(2 * factor)])

In [48]:
df_list.append(df_r.iloc[(2 * factor):(3 * factor)])

In [40]:
4 * factor

5049088

In [49]:
df_list.append(df_r.iloc[(3 * factor):])

In [53]:
def length(df):
    return len(df)

In [54]:
lengths = map(length, df_list)

In [55]:
list(lengths)

[1262272, 1262272, 1262272, 1262269]

In [58]:
for x in range(4):
  print(x)

0
1
2
3


In [59]:
for x in range(4):
    df_list[x].to_parquet(f"../data/reviews/reviews_{x}.parquet", compression="gzip")