# Yelp: Reviews Data Ingestion

*By Daniel Deutsch, José Lucas Barretto, Lucas Miguel Agrizzi, Kevin Kuhl.*

In [None]:
import time
import json
import random
import requests
import pandas as pd

from datetime import timedelta

In [None]:
# NordVPN available countries (a sequence of names; one is picked at random
# and passed to `nordvpn connect` whenever the scraper gets blocked).
# The context manager guarantees the file handle is closed after parsing.
with open("./../constants/countries.json", encoding="utf-8") as countries_file:
    countries = json.load(countries_file)

# Dataframes
# Restaurants whose reviews will be scraped; the first CSV column is the index.
df_restaurants = pd.read_csv("./../datasets/enh_restaurants/enh_restaurants.csv.zip", index_col=0)

### Extract Reviews

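For each restaurant in the dataset, we scrape its reviews from Yelp's `review_feed` endpoint, in both English and French, paging through the results until no more reviews are returned. Whenever our IP gets blocked (HTTP 503), we switch to a random NordVPN country and retry.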

In [None]:
# Defines the time that the process started
start = time.time()

# Defines the reviews dataframe
df_reviews = pd.DataFrame()

for idx, row in df_restaurants.iterrows():
    for rl in ["en", "fr"]:
    
        # Sets the HTTP request
        url = f"https://www.yelp.com/biz/{row['id']}/review_feed"
        params = {
            "rl": rl,
            "sort_by": "relevance_desc",
            "start": 0
        }

        while True:

            print(f"\rProgress: restaurants {row.name+1}/{df_restaurants.shape[0]}, reviews {df_reviews.shape[0]}, time taken {timedelta(seconds=time.time()-start)}", end="")

            # Makes the HTTP request
            r = requests.get(url, params=params)

            # Good response from the API
            if r.status_code == 200:

                # Obtains the reviews of this page
                reviews = r.json()["reviews"]

                # Still have reviews from this restaurant
                if reviews:
                    df_reviews = pd.concat([pd.DataFrame(reviews), df_reviews], ignore_index=True)
                    df_reviews["is_fake"] = False
                    params["start"] += 20
                
                # Overflow on restaurant's reviews (go to the next restaurant)
                else:
                    break

            # Our IP got blocked from the API
            elif r.status_code == 503:
                country = random.choice(countries)
                ! nordvpn connect {country} # Runs on the terminal
                time.sleep(5)

            # Got an unpredicted response
            else:
                raise Exception("Unpredicted response from the server")

### Save the Dataframe

Saves the obtained reviews dataframe as a zipped CSV file.

In [None]:
# Destination of the raw reviews dataset (relative to this notebook)
raw_reviews_path = "./../datasets/raw_reviews/raw_reviews.csv.zip"

# Writes the scraped reviews, index included, to the destination file
df_reviews.to_csv(raw_reviews_path)