# Yelp: Restaurants Data Ingestion

*By Daniel Deutsch, José Lucas Barretto, Lucas Miguel Agrizzi, Kevin Kuhl.*

In [None]:
import json
import random
import time
from datetime import timedelta

import pandas as pd
import requests

In [None]:
# Constants
def _load_json(path):
    """Parse the UTF-8 JSON file at `path`, closing the file handle afterwards."""
    with open(path, encoding="utf-8") as file:
        return json.load(file)


ARRONDISSEMENTS = _load_json("./../constants/arrondissements.json")
CATEGORIES = _load_json("./../constants/categories.json")

# API authentication
# AUTHS_API keeps the full set of accounts; available_auths_api is the working
# copy that shrinks as accounts hit their request limit during the crawl.
AUTHS_API = _load_json("./../constants/auth-api.json")
available_auths_api = list(AUTHS_API)

### Category Parser

The categories JSON provided on Yelp's site contains some information we don't actually need. To make it easier to work with, we transform it into a list of category aliases, keeping only the categories whose parent is `restaurants`.

In [None]:
# Aliases of every category that lists "restaurants" among its parents
categories_alias = [
    entry["alias"]
    for entry in CATEGORIES
    if "restaurants" in entry["parents"]
]

### Business Extraction

Uses the Yelp API to gather all the possible restaurants in Paris. Since Paris is known to have more than 45,000 restaurants and the API only returns the first 1000 results given the parameters (with 50 results per page), we had to play with the request params to be able to gather as many restaurants as possible.

In [None]:
# Sets the HTTP request
url = "https://api.yelp.com/v3/businesses/search"
headers = { "Authorization": f"Bearer {random.choice(available_auths_api)['api_key']}" }
params = {
    "term": "restaurants",
    "sort_by": "distance",
    "open_now": False,
    "offset": 0,
    "limit": 50,
    "location": "",
    "categories": ""
}

# Seconds to wait for a response before giving up, so that a stalled
# connection raises instead of hanging the whole crawl
REQUEST_TIMEOUT = 30

# Defines the time that the process started
start = time.time()

# Result pages in the order they were fetched. They are combined into `df`
# once at the end: concatenating inside the loop re-copies every row already
# gathered on each page.
pages = []
n_restaurants = 0

try:
    # Queries every (arrondissement, category) pair, paging through the results
    for idx_arr, arr in enumerate(ARRONDISSEMENTS):

        params["location"] = f"{arr}, Paris"

        for idx_cat, category in enumerate(categories_alias):

            params["categories"] = category
            params["offset"] = 0

            while True:

                print(f"\rProgress: arrondissement {idx_arr+1}/{len(ARRONDISSEMENTS)}, category {idx_cat+1}/{len(categories_alias)}, available api accounts {len(available_auths_api)}/{len(AUTHS_API)}, restaurants {n_restaurants}, time taken {timedelta(seconds=time.time()-start)}", end="")

                response = requests.get(url, headers=headers, params=params, timeout=REQUEST_TIMEOUT)

                # Overflow on the amount of requests allowed per day:
                # retire the current account and retry the same page with another one
                if response.status_code == 429:
                    exhausted_key = headers["Authorization"].split(" ")[1]
                    available_auths_api = [keys for keys in available_auths_api if keys["api_key"] != exhausted_key]
                    if not available_auths_api:
                        raise Exception("Out of valid api accounts for today")
                    headers = { "Authorization": f"Bearer {random.choice(available_auths_api)['api_key']}" }
                    continue

                payload = response.json()

                # Overflow on the limit (1000), or any other error reported by the API
                if payload.get("error"):
                    break

                # Got the results
                businesses = payload["businesses"]
                if businesses:
                    pages.append(pd.DataFrame(businesses))
                    n_restaurants += len(businesses)

                params["offset"] += params["limit"]

                # Stops as soon as the last page was read, instead of spending
                # one more request just to receive an empty page
                if not businesses or params["offset"] >= payload["total"]:
                    break
finally:
    # Builds the dataframe even when the crawl is interrupted, so the results
    # gathered so far remain available. Most recent page first, which matches
    # the row order obtained by prepending each page as it arrives.
    df = pd.concat(pages[::-1], ignore_index=True) if pages else pd.DataFrame()

### Drop duplicated rows

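Drops rows that refer to the same business, i.e. rows that share the same Yelp `id`. Duplicates occur because a restaurant can be returned for several categories or arrondissements.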

In [None]:
# Keep the first row seen for each business id, then renumber the rows from 0
df = df.drop_duplicates(subset="id").reset_index(drop=True)

### Save the Dataframe

Saves the obtained dataframe

In [None]:
# Write the raw restaurants to disk (the row index is written as the first column)
df.to_csv(path_or_buf="./../datasets/raw_restaurants/raw_restaurants.csv.zip")