# Yelp Dataset Analysis

In [9]:
import load_data_functions
import pandas as pd

## Extract business data

The business data contains a total of 209,393 businesses.

In [10]:
df_business = load_data_functions.get_data_frame_from_file(filename=r"C:\Users\lovis\Projects\Causal Project\yelp_dataset\yelp_academic_dataset_business.json", fields=["business_id","name","city","stars","review_count","categories"], max_nbr_items=500000)
df_business.head()

Unnamed: 0,business_id,name,city,stars,review_count,categories
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,Cornelius,3.5,36,"Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh..."
1,Yzvjg0SayhoZgCljUJRF9Q,"Carlos Santo, NMD",Scottsdale,5.0,4,"Health & Medical, Fitness & Instruction, Yoga,..."
2,XNoUzKckATkOD1hP6vghZg,Felinus,Montreal,5.0,5,"Pets, Pet Services, Pet Groomers"
3,6OAZjbxqM5ol29BuHsil3w,Nevada House of Hose,North Las Vegas,2.5,3,"Hardware Stores, Home Services, Building Suppl..."
4,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,Mesa,4.5,26,"Home Services, Plumbing, Electricians, Handyma..."


In [11]:
df_business['stars'] = df_business.stars.astype('double')
df_business['review_count'] = df_business.review_count.astype('int')

In [12]:
df_business.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209393 entries, 0 to 209392
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   209393 non-null  object 
 1   name          209393 non-null  object 
 2   city          209393 non-null  object 
 3   stars         209393 non-null  float64
 4   review_count  209393 non-null  int32  
 5   categories    208869 non-null  object 
dtypes: float64(1), int32(1), object(4)
memory usage: 8.8+ MB


The business with the maximum number of reviews.

In [13]:
df_business[df_business.review_count == df_business.review_count.max()]

Unnamed: 0,business_id,name,city,stars,review_count,categories
81545,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,Las Vegas,4.0,10129,"Sandwiches, Breakfast & Brunch, Food, Restaura..."


## Business 1
We extract all of the businesses which are categorized as 'Hotels', and not 'Restaurants' to a dataset 'Business 1'.

In [14]:
df_business_1 = df_business[(df_business.review_count>1000) & (~df_business.categories.str.contains('Restaurants',regex=False,na=False)) & (df_business.categories.str.contains('Hotels',regex=False,na=False)) & (df_business.city=='Las Vegas')]
df_business_1.head()

Unnamed: 0,business_id,name,city,stars,review_count,categories
8909,_ZfjpSEO5ntk-1hbnwCR4g,Palms Place,Las Vegas,3.0,1027,"Casinos, Arts & Entertainment, Resorts, Hotels..."
11511,vnvQ0lD9MDje2DFde9PKQA,Elara by Hilton Grand Vacations - Las Vegas,Las Vegas,3.5,1015,"Vacation Rentals, Hotels, Hotels & Travel, Eve..."
17985,p0iEUamJVp_QpaheE-Nz_g,"South Point Hotel, Casino & Spa",Las Vegas,3.5,1818,"Active Life, Bowling, Cinema, Arts & Entertain..."
18943,u_vPjx925UPEG9DFOAAvFQ,Flamingo Las Vegas Hotel & Casino,Las Vegas,2.5,3905,"Arts & Entertainment, Casinos, Hotels, Hotels ..."
30491,SMPbvZLSMMb7KU76YNYMGg,ARIA Resort & Casino,Las Vegas,3.5,4580,"Arts & Entertainment, Hotels & Travel, Casinos..."


### Review count
The total number of reviews for the businesses in 'Business 1' are 65,235.

In [15]:
df_business_1.review_count.sum()

65235

### Most reviewed business
The business in 'Business 1' with the most reviews is 'ARIA Resort & Casino'.

In [16]:
top_business_1 = df_business_1[df_business_1.review_count == df_business_1.review_count.max()]
top_business_1

Unnamed: 0,business_id,name,city,stars,review_count,categories
30491,SMPbvZLSMMb7KU76YNYMGg,ARIA Resort & Casino,Las Vegas,3.5,4580,"Arts & Entertainment, Hotels & Travel, Casinos..."


### Save business ids of 'Business 1' to a csv file

In [42]:
print(df_business_1.business_id)
df_business_1.business_id.to_csv(r"C:\Users\lovis\Projects\yelp_causal_inference\data\business_ids_Hotels.csv", index=False)

8909      _ZfjpSEO5ntk-1hbnwCR4g
11511     vnvQ0lD9MDje2DFde9PKQA
17985     p0iEUamJVp_QpaheE-Nz_g
18943     u_vPjx925UPEG9DFOAAvFQ
30491     SMPbvZLSMMb7KU76YNYMGg
33183     fWetLvfQXqpZcBQLHldUhA
53030     WYw3Uf56DT5IwpaLNnCH5Q
59274     na4Th5DrNauOv-c43QQFvA
62235     e0CTLPxTnFEQSqQ1FJUqog
64234     Az_60nNuh1FH8Ds8oasZjw
76448     FaHADZARwnY4yvlvpnsfGA
83279     BLIJ-p5wYuAhw6Pp6mh6mw
87339     dWFUKB_HPBIE87AFBHEb_w
90458     6Ct57qgmXwOnzfSZoUGh0Q
96940     5iSmZO0SrKU6EoXK_1M8Kw
101200    ZjSzUWHtnpCfjsa7CksSOg
124063    o7AiTlyWUrBSzdz6oMHj5w
129704    eEnNw3_hBvxcFHyr23kAuA
130311    NY80DkkCfEl198JmwtO4pA
132076    yhgUyctWczUN13MinNgZ3w
134905    VyjyHoBg3KC5BSFRlD0ZPQ
144733    hrhtWc8UcoZqYr2d7YPgEA
145383    t-o_Sraneime4DDhWrQRBA
149867    9SU7ZZhaFUJJ6m2k5HKHeg
172561    g83WbX_recywc4DEIZ-xug
183166    qjnJFZtsY_nfRzoL3J_UWQ
189685    ThNGovQZjZ5mn-ZwX_N2BQ
194036    XQETjKH84gxDrAiz2lH7Wg
Name: business_id, dtype: object


#

## Business 2
We extract all of the businesses which are categorized as 'Restaurants', and not 'Hotels' to a dataset 'Business 2'.

In [22]:
df_business_2 = df_business[(df_business.review_count>1000) & (df_business.categories.str.contains('Restaurants',regex=False,na=False)) & (~df_business.categories.str.contains('Hotels',regex=False,na=False)) & (df_business.city=='Las Vegas')]
df_business_2.head()

Unnamed: 0,business_id,name,city,stars,review_count,categories
246,AtD6B83S4Mbmq0t7iDnUVA,Veggie House,Las Vegas,4.5,1142,"Restaurants, Specialty Food, Japanese, Sushi B..."
2575,7sb2FYLS2sejZKxRYF9mtg,Sakana,Las Vegas,4.5,2529,"Restaurants, Sushi Bars, Buffets, Japanese, Ba..."
3653,rcaPajgKOJC2vo_l3xa42A,Bouchon,Las Vegas,4.0,4138,"French, Cafes, Restaurants, Cocktail Bars, Nig..."
4030,lmxA0dJM0XsPCIHPXhEQ-g,Sake Rok,Las Vegas,4.0,1110,"Restaurants, Asian Fusion, Japanese, Bars, Sus..."
4269,e13SEvJud_vgeDR_doL4sQ,Hussong's Cantina Las Vegas,Las Vegas,4.0,1050,"Restaurants, Bars, Nightlife, Mexican, Burgers..."


### Review count
The total number of reviews for the businesses in 'Business 2' are 484,659.

In [23]:
df_business_2.review_count.sum()

484659

### Most reviewed business
The business in 'Business 2' with the most reviews is 'Bacchanal Buffet'.

In [49]:
top_business_2 = df_business_2[df_business_2.review_count == df_business_2.review_count.max()]
top_business_2

Unnamed: 0,business_id,name,city,stars,review_count,categories
81545,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,Las Vegas,4.0,10129,"Sandwiches, Breakfast & Brunch, Food, Restaura..."


### Save business ids of 'Business 2' to a csv file

In [45]:
print(df_business_2.business_id)
df_business_2.business_id.to_csv(r"C:\Users\lovis\Projects\yelp_causal_inference\data\business_ids_Restaurants.csv", index=False)

246       AtD6B83S4Mbmq0t7iDnUVA
2575      7sb2FYLS2sejZKxRYF9mtg
3653      rcaPajgKOJC2vo_l3xa42A
4030      lmxA0dJM0XsPCIHPXhEQ-g
4269      e13SEvJud_vgeDR_doL4sQ
                   ...          
204952    ugLqbAvBdRDc-gS4hpslXw
206247    sqRX-XLlhx4rs2c1TpBf8A
206374    -ed0Yc9on37RoIoG2ZgxBA
206401    G-5kEa6E6PD5fkBRuA7k9Q
207774    PQER4ba8Q0zqB1G2QiXamQ
Name: business_id, Length: 262, dtype: object


## Extract review data

In [25]:
df = load_data_functions.get_data_frame_from_file(filename=r"C:\Users\lovis\Projects\Causal Project\yelp_dataset\yelp_academic_dataset_review.json", fields=["user_id","business_id","stars","useful","text","date"], max_nbr_items=df_business_2.review_count.sum(), wanted_values={'business_id': list(df_business_2.business_id)})
df.head()

Unnamed: 0,user_id,business_id,stars,useful,text,date
0,2hRe26HSCAWbFRn5WChK-Q,d4qwVw4PcN-_2mK2o1Ro1g,1.0,0,10pm on a super bowl Sunday and they're alread...,2015-02-02 06:28:00
1,RR-2nouBn408e3djxC470g,d4qwVw4PcN-_2mK2o1Ro1g,5.0,0,Holy heck this place is amazing. I love their ...,2017-02-09 04:25:03
2,-Co-ReNx_lXT1xL_Rr0B2g,XZbuPXdyA0ZtTu3AzqtQhg,4.0,0,"As the previous person posted, what more can r...",2009-10-13 09:50:48
3,4xIRICDNx33zPG-CYshTXQ,IhNASEZ3XnBHmuuVnWdIwA,5.0,0,If you are looking for something refreshing an...,2015-07-16 06:46:29
4,DMtVkV1K2DPimItj9xUfjw,XZbuPXdyA0ZtTu3AzqtQhg,4.0,1,Bobby Flay's restaurant at Caesar's Palace. T...,2012-11-06 06:00:13


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 484659 entries, 0 to 484658
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      484659 non-null  object
 1   business_id  484659 non-null  object
 2   stars        484659 non-null  object
 3   useful       484659 non-null  int64 
 4   text         484659 non-null  object
 5   date         484659 non-null  object
dtypes: int64(1), object(5)
memory usage: 22.2+ MB


In [27]:
df['date'] = pd.to_datetime(df['date'])
df['text_length'] = df.text.apply(lambda x: len(x))
df['stars'] = df.stars.astype('int')
df['nbr_words'] = df.text.apply(lambda x: len(x.split(' ')))
df.head()

Unnamed: 0,user_id,business_id,stars,useful,text,date,text_length,nbr_words
0,2hRe26HSCAWbFRn5WChK-Q,d4qwVw4PcN-_2mK2o1Ro1g,1,0,10pm on a super bowl Sunday and they're alread...,2015-02-02 06:28:00,102,19
1,RR-2nouBn408e3djxC470g,d4qwVw4PcN-_2mK2o1Ro1g,5,0,Holy heck this place is amazing. I love their ...,2017-02-09 04:25:03,144,24
2,-Co-ReNx_lXT1xL_Rr0B2g,XZbuPXdyA0ZtTu3AzqtQhg,4,0,"As the previous person posted, what more can r...",2009-10-13 09:50:48,2470,476
3,4xIRICDNx33zPG-CYshTXQ,IhNASEZ3XnBHmuuVnWdIwA,5,0,If you are looking for something refreshing an...,2015-07-16 06:46:29,785,139
4,DMtVkV1K2DPimItj9xUfjw,XZbuPXdyA0ZtTu3AzqtQhg,4,1,Bobby Flay's restaurant at Caesar's Palace. T...,2012-11-06 06:00:13,849,146


In [31]:
df_no_zero_useful = df[df.useful>0]
df_no_zero_useful.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 177697 entries, 4 to 484653
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   user_id      177697 non-null  object        
 1   business_id  177697 non-null  object        
 2   stars        177697 non-null  int32         
 3   useful       177697 non-null  int64         
 4   text         177697 non-null  object        
 5   date         177697 non-null  datetime64[ns]
 6   text_length  177697 non-null  int64         
 7   nbr_words    177697 non-null  int64         
dtypes: datetime64[ns](1), int32(1), int64(3), object(3)
memory usage: 11.5+ MB


### Save the review data to a csv file

In [30]:
data_folder = r"C:\Users\lovis\Projects\yelp_causal_inference\data\\"
save_to_file = input(f"Specify the filename you would like to save the current df to: \n{data_folder}  ")
df.to_csv(data_folder + save_to_file)