### Yelp data via their API

This data was collected from the yelp fusion api. You can find more information and how to do this yourself [here](https://www.yelp.com/developers/documentation/v3)

I followed [this great tutorial](https://python.gotrained.com/yelp-fusion-api-tutorial/). Good luck!

In [2]:
#for scraping
import requests
import json

#for tabular data
import pandas as pd
import numpy as np

Please note! The cell below has the api_key and headers information. This is not a part of this particular notebook so as to protect the yelp api key used.

Obtaining your own API key is relatively simple, I recommend following the tutorial mentioned above.

In [1]:
df = pd.read_csv('restaurant-scores-lives-standard.csv')
df = df.drop(columns=['business_id', 'business_phone_number', 'business_city', 'business_state', 'business_location', 'inspection_id'])


NameError: name 'pd' is not defined

In [4]:
df.shape

(53732, 17)

In [5]:
df.dropna(inplace=True)

In [6]:
df.shape

(21443, 17)

In [7]:
business_names = df.business_name.unique().tolist()

### Api request to create our businesses df

In [25]:
url = 'https://api.yelp.com/v3/businesses/search'
columns=['name', 'id', 'rating', 'price']
#set up a first dud row
business_df = pd.DataFrame([[np.nan, np.nan, np.nan, np.nan]], columns=columns)

for name in business_names:
    #get our parameters for each value
    location = df.loc[df.business_name == name]['business_address'].reset_index()['business_address'][0]
    latitude = df.loc[df.business_name == name]['business_latitude'].reset_index()['business_latitude'][0]
    longitude = df.loc[df.business_name == name]['business_longitude'].reset_index()['business_longitude'][0]
    #set up our calls
    params = {'term' : name, 'location' : location, 'latitude' : latitude, 'longitude' : longitude}
    
    req = requests.get(url, params=params, headers=headers)
    #send our request
    #we will add each  new entry to temp, then add it to our running df
    temp = []
    if (req.status_code == 200):
        parsed = json.loads(req.text)
        businesses = parsed['businesses']
        #append the values
        try:
            temp.append(businesses[0]['name'])
        except:
            temp.append(np.nan)
        try:
            temp.append(businesses[0]['id'])
        except:
            temp.append(np.nan)
        try:
            temp.append(businesses[0]['rating'])
        except:
            temp.append(np.nan)
        try:
            temp.append(businesses[0]['price'])
        except:
            temp.append(np.nan)
    
    #add to our running df
    try:
        temp_df = pd.DataFrame([temp], columns=columns)
    except:
        temp = [np.nan. np.nan, np.nan, np.nan]
        temp_df = pd.DataFrame([temp], columns=columns)
        
    business_df = pd.concat([business_df, temp_df])

In [26]:
business_df.head(10)

Unnamed: 0,name,id,rating,price
0,,,,
0,Rainbow Grocery,5NvXIkNdCCqUb235WVfMJg,4.0,$$
0,Parada 22,TlBFKt2N2eSEBpN-UZmDBw,4.0,$$
0,Newtree,thrAX79eegx1Of82TCJhrA,4.0,$$
0,Starbucks,C36BK5luxi-8apVMMhsizQ,3.5,$
0,Dojima-Ann,cseyjQ0XIp6dwC0_TcaMOg,3.5,$$
0,Piccino,i2VhtC1JkV_sZOA4urd1ng,4.0,$$
0,Eric's Restaurant,Ux_bs6eZ7WqIsLepTw1uBw,3.5,$$
0,Taiwan Restaurant,eu3UCrfFkTF73F0idXeJ5Q,3.5,$$
0,Java Beach at the Zoo,iRmdKzcbdFLIp3s9e4xrHA,4.0,$


# Here it is!

In [27]:
business_df.to_csv('businesses.csv', index=False)

## Only this call!

In [5]:
business_df = pd.read_csv('businesses.csv')

In [6]:
business_df.dropna(how='all', inplace=True)

## Start getting our reviews!

In [11]:
#package for gleaning some analysis
from textblob import TextBlob
business_ids = business_df['id'].tolist()

In [12]:
#lists to append to our business_df
average_rating = []
average_sentiment = []

review_ids = []
review_text = []

#now time to get some reviews!
for id in business_ids:
    #lists to average and append
    ratings = []
    sentiment = []
    
    #make request and parse data
    url = 'https://api.yelp.com/v3/businesses/{}/reviews'.format(id)
    req = requests.get(url, headers=headers)
    parsed = json.loads(req.text)
    reviews = parsed["reviews"]
    
    #check if our request is good, then loop through reviews
    if (req.status_code == 200):
        for i in range(len(reviews)):
            #append ratings and reviews
            #try:
            ratings.append(reviews[i]['rating'])
            #except:
                #rating.append(np.nan)
            #try:
            testimonial = TextBlob(reviews[i]['text'])
            sentiment.append(testimonial.sentiment.polarity)
                #for our other dataset
            review_ids.append(reviews[i]['id'])
            review_text.append(reviews[i]['text'])
            #except:
                #sentiment.append(np.nan)
            
    #add our averages to our lists
    #try:
    average_rating.append(sum(ratings) / len(ratings))
    #except:
        #average_rating.append(np.nan)
    #try:
    average_sentiment.append(sum(sentiment) / len(sentiment))
    #except:
        #average_sentiment.append(np.nan)

In [13]:
business_df['review_rating'] = average_rating
business_df['review_sentiment'] = average_sentiment

In [14]:
business_df.to_csv('yelp_business.csv', index=False)

In [15]:
business_df.head()

Unnamed: 0,name,id,rating,price,review_rating,review_sentiment
1,Rainbow Grocery,5NvXIkNdCCqUb235WVfMJg,4.0,$$,3.666667,0.080208
2,Parada 22,TlBFKt2N2eSEBpN-UZmDBw,4.0,$$,3.333333,0.106349
3,Newtree,thrAX79eegx1Of82TCJhrA,4.0,$$,4.666667,0.355787
4,Starbucks,C36BK5luxi-8apVMMhsizQ,3.5,$,3.0,0.27031
5,Dojima-Ann,cseyjQ0XIp6dwC0_TcaMOg,3.5,$$,3.666667,0.021528


## Build our text review only dataset

In [9]:
business_df.columns

Index(['name', 'id', 'rating', 'price', 'review_rating', 'review_sentiment'], dtype='object')

In [3]:
business_df = pd.read_csv('yelp_business.csv')

In [17]:
columns= ['name', 'id', 'text']

review_df = pd.DataFrame([[np.nan, np.nan, np.nan]], columns=columns)
for i in range(len(business_df['id'])):
    id = business_df['id'][i]
    name = business_df['name'][i]
    
    url = 'https://api.yelp.com/v3/businesses/{}/reviews'.format(id)
    req = requests.get(url, headers=headers)
    parsed = json.loads(req.text)
    reviews = parsed["reviews"]
    
    if (req.status_code == 200):
        for j in range(len(reviews)):
            temp = []
            
            temp.append(business_df['name'][i])
            temp.append(business_df['id'][i])
            temp.append(reviews[j]['text'])
            
            temp_df = pd.DataFrame([temp], columns=columns)
            review_df = pd.concat([review_df, temp_df])
            

In [18]:
review_df.head()

Unnamed: 0,name,id,text
0,,,
0,Rainbow Grocery,5NvXIkNdCCqUb235WVfMJg,Love love this co-op grocery store! My mom bou...
0,Rainbow Grocery,5NvXIkNdCCqUb235WVfMJg,The produce and selection of products they hav...
0,Rainbow Grocery,5NvXIkNdCCqUb235WVfMJg,This Grocery Store is Vegan and Gluten Free He...
0,Parada 22,TlBFKt2N2eSEBpN-UZmDBw,#pastelon\nSweet #plantains layered dpicadillo...


In [19]:
review_df.to_csv('reviews.csv', index=False)

# Perform a merge to pull in the business_names into our text_review_df

In [None]:
text_review_df.to_csv('text_reviews.csv', index=False)