# Yelp Dataset Analysis

In [1]:
import load_data_functions
import pandas as pd

## Extract business data

The business data contains a total of 209,393 businesses.

In [2]:
df_business = load_data_functions.get_data_frame_from_file(filename=r"data/yelp_academic_dataset_business.json", fields=["business_id","name","city","stars","review_count","categories"], max_nbr_items=500000)
df_business.head()

Unnamed: 0,business_id,name,city,stars,review_count,categories
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,Cornelius,3.5,36,"Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh..."
1,Yzvjg0SayhoZgCljUJRF9Q,"Carlos Santo, NMD",Scottsdale,5.0,4,"Health & Medical, Fitness & Instruction, Yoga,..."
2,XNoUzKckATkOD1hP6vghZg,Felinus,Montreal,5.0,5,"Pets, Pet Services, Pet Groomers"
3,6OAZjbxqM5ol29BuHsil3w,Nevada House of Hose,North Las Vegas,2.5,3,"Hardware Stores, Home Services, Building Suppl..."
4,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,Mesa,4.5,26,"Home Services, Plumbing, Electricians, Handyma..."


In [3]:
df_business['stars'] = df_business.stars.astype('double')
df_business['review_count'] = df_business.review_count.astype('int')

In [4]:
df_business.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209393 entries, 0 to 209392
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   209393 non-null  object 
 1   name          209393 non-null  object 
 2   city          209393 non-null  object 
 3   stars         209393 non-null  float64
 4   review_count  209393 non-null  int64  
 5   categories    208869 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.6+ MB


The business with the maximum number of reviews.

In [5]:
df_business[df_business.review_count == df_business.review_count.max()]

Unnamed: 0,business_id,name,city,stars,review_count,categories
81545,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,Las Vegas,4.0,10129,"Sandwiches, Breakfast & Brunch, Food, Restaura..."


## Business 1
We extract all of the businesses which are categorized as 'Hotels', and not 'Restaurants' to a dataset 'Business 1'.

In [14]:
df_business_1 = df_business[(df_business.review_count>1000) & (~df_business.categories.str.contains('Restaurants',regex=False,na=False)) & (df_business.categories.str.contains('Hotels',regex=False,na=False)) & (df_business.city=='Las Vegas')]
df_business_1.head()

Unnamed: 0,business_id,name,city,stars,review_count,categories
8909,_ZfjpSEO5ntk-1hbnwCR4g,Palms Place,Las Vegas,3.0,1027,"Casinos, Arts & Entertainment, Resorts, Hotels..."
11511,vnvQ0lD9MDje2DFde9PKQA,Elara by Hilton Grand Vacations - Las Vegas,Las Vegas,3.5,1015,"Vacation Rentals, Hotels, Hotels & Travel, Eve..."
17985,p0iEUamJVp_QpaheE-Nz_g,"South Point Hotel, Casino & Spa",Las Vegas,3.5,1818,"Active Life, Bowling, Cinema, Arts & Entertain..."
18943,u_vPjx925UPEG9DFOAAvFQ,Flamingo Las Vegas Hotel & Casino,Las Vegas,2.5,3905,"Arts & Entertainment, Casinos, Hotels, Hotels ..."
30491,SMPbvZLSMMb7KU76YNYMGg,ARIA Resort & Casino,Las Vegas,3.5,4580,"Arts & Entertainment, Hotels & Travel, Casinos..."


### Review count
The total number of reviews for the businesses in 'Business 1' are 65,235.

In [15]:
df_business_1.review_count.sum()

65235

### Most reviewed business
The business in 'Business 1' with the most reviews is 'ARIA Resort & Casino'.

In [16]:
top_business_1 = df_business_1[df_business_1.review_count == df_business_1.review_count.max()]
top_business_1

Unnamed: 0,business_id,name,city,stars,review_count,categories
30491,SMPbvZLSMMb7KU76YNYMGg,ARIA Resort & Casino,Las Vegas,3.5,4580,"Arts & Entertainment, Hotels & Travel, Casinos..."


#

## Business 2
We extract all of the businesses which are categorized as 'Restaurants', and not 'Hotels' to a dataset 'Business 2'.

In [6]:
df_business_2 = df_business[(df_business.review_count>1000) & (df_business.categories.str.contains('Restaurants',regex=False,na=False)) & (~df_business.categories.str.contains('Hotels',regex=False,na=False)) & (df_business.city=='Las Vegas')]
df_business_2.head()

Unnamed: 0,business_id,name,city,stars,review_count,categories
246,AtD6B83S4Mbmq0t7iDnUVA,Veggie House,Las Vegas,4.5,1142,"Restaurants, Specialty Food, Japanese, Sushi B..."
2575,7sb2FYLS2sejZKxRYF9mtg,Sakana,Las Vegas,4.5,2529,"Restaurants, Sushi Bars, Buffets, Japanese, Ba..."
3653,rcaPajgKOJC2vo_l3xa42A,Bouchon,Las Vegas,4.0,4138,"French, Cafes, Restaurants, Cocktail Bars, Nig..."
4030,lmxA0dJM0XsPCIHPXhEQ-g,Sake Rok,Las Vegas,4.0,1110,"Restaurants, Asian Fusion, Japanese, Bars, Sus..."
4269,e13SEvJud_vgeDR_doL4sQ,Hussong's Cantina Las Vegas,Las Vegas,4.0,1050,"Restaurants, Bars, Nightlife, Mexican, Burgers..."


### Review count
The total number of reviews for the businesses in 'Business 2' are 484,659.

In [7]:
df_business_2.review_count.sum()

484659

### Most reviewed business
The business in 'Business 2' with the most reviews is 'Bacchanal Buffet'.

In [8]:
top_business_2 = df_business_2[df_business_2.review_count == df_business_2.review_count.max()]
top_business_2

Unnamed: 0,business_id,name,city,stars,review_count,categories
81545,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,Las Vegas,4.0,10129,"Sandwiches, Breakfast & Brunch, Food, Restaura..."


In [24]:
print(top_business_2.categories.values)

['Sandwiches, Breakfast & Brunch, Food, Restaurants, Buffets']


In [27]:
top_business_2.review_count.values[0]

10129

## Extract review data

In [28]:
df = load_data_functions.get_data_frame_from_file(filename="data/yelp_academic_dataset_review.json", fields=["user_id","business_id","stars","useful","text","date"], max_nbr_items=top_business_2.review_count.values[0], wanted_values={'business_id': list(top_business_2.business_id)})
df.head()

Unnamed: 0,user_id,business_id,stars,useful,text,date
0,6PgdGb3HrZdsfl2GiULo8w,RESDUcs7fIiihp38-d6_6g,5.0,0,After getting food poisoning at the Palms hote...,2012-12-04 03:10:18
1,IS9yw8P2uAPBX6FNLLX4KA,RESDUcs7fIiihp38-d6_6g,4.0,39,"""A feast worthy of Gods""\n\nBaccarnal Buffet i...",2014-01-17 00:50:50
2,uZdFsE_aHbFBChgN6Xa8tw,RESDUcs7fIiihp38-d6_6g,4.0,1,The crab legs are better than the ones at Wick...,2015-06-08 18:03:09
3,8ZWJNAEWsymXDzKx3B0tTQ,RESDUcs7fIiihp38-d6_6g,1.0,0,Not worth it! Too salty food and expensive! Th...,2016-12-19 16:15:29
4,E0sm4Ve7ifanFYeQMcV8Eg,RESDUcs7fIiihp38-d6_6g,5.0,0,I would give this infinite stars if I could. M...,2015-07-28 07:13:17


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10129 entries, 0 to 10128
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      10129 non-null  object
 1   business_id  10129 non-null  object
 2   stars        10129 non-null  object
 3   useful       10129 non-null  int64 
 4   text         10129 non-null  object
 5   date         10129 non-null  object
dtypes: int64(1), object(5)
memory usage: 474.9+ KB


In [30]:
df['date'] = pd.to_datetime(df['date'])
df['text_length'] = df.text.apply(lambda x: len(x))
df['stars'] = df.stars.astype('int')
df['nbr_words'] = df.text.apply(lambda x: len(x.split(' ')))
df.head()

Unnamed: 0,user_id,business_id,stars,useful,text,date,text_length,nbr_words
0,6PgdGb3HrZdsfl2GiULo8w,RESDUcs7fIiihp38-d6_6g,5,0,After getting food poisoning at the Palms hote...,2012-12-04 03:10:18,937,176
1,IS9yw8P2uAPBX6FNLLX4KA,RESDUcs7fIiihp38-d6_6g,4,39,"""A feast worthy of Gods""\n\nBaccarnal Buffet i...",2014-01-17 00:50:50,4975,940
2,uZdFsE_aHbFBChgN6Xa8tw,RESDUcs7fIiihp38-d6_6g,4,1,The crab legs are better than the ones at Wick...,2015-06-08 18:03:09,671,131
3,8ZWJNAEWsymXDzKx3B0tTQ,RESDUcs7fIiihp38-d6_6g,1,0,Not worth it! Too salty food and expensive! Th...,2016-12-19 16:15:29,92,18
4,E0sm4Ve7ifanFYeQMcV8Eg,RESDUcs7fIiihp38-d6_6g,5,0,I would give this infinite stars if I could. M...,2015-07-28 07:13:17,333,63


In [31]:
df_no_zero_useful = df[df.useful>0]
df_no_zero_useful.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 177697 entries, 4 to 484653
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   user_id      177697 non-null  object        
 1   business_id  177697 non-null  object        
 2   stars        177697 non-null  int32         
 3   useful       177697 non-null  int64         
 4   text         177697 non-null  object        
 5   date         177697 non-null  datetime64[ns]
 6   text_length  177697 non-null  int64         
 7   nbr_words    177697 non-null  int64         
dtypes: datetime64[ns](1), int32(1), int64(3), object(3)
memory usage: 11.5+ MB


### Save the review data to a csv file

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10129 entries, 0 to 10128
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      10129 non-null  object        
 1   business_id  10129 non-null  object        
 2   stars        10129 non-null  int64         
 3   useful       10129 non-null  int64         
 4   text         10129 non-null  object        
 5   date         10129 non-null  datetime64[ns]
 6   text_length  10129 non-null  int64         
 7   nbr_words    10129 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(3)
memory usage: 633.2+ KB


In [33]:
data_folder = "data/"
save_to_file = input(f"Specify the filename you would like to save the current df to: \n{data_folder}  ")
df.to_csv(data_folder + save_to_file, index=False)