# Yelp: Restaurants Data Processing

*By Daniel Deutsch, José Lucas Barretto, Lucas Miguel Agrizzi, Kevin Kuhl.*

In [93]:
import os
import ast
import pandas as pd

from pandas.io.json import json_normalize

In [94]:
# Read the zipped raw restaurants CSV, using its first column as the index.
df = pd.read_csv(
    "./../datasets/raw_restaurants/raw_restaurants.csv.zip",
    index_col=0,
)

### Flatten Important Attributes

Each value in the coordinates column is a dict with keys "latitude" and "longitude". Columns that hold dicts are hard to work with in pandas, so we expand these keys into separate columns.

In [95]:
# Flatten the stringified coordinates dicts into "coordinates.<key>" columns.
# `pd.json_normalize` is the supported entry point (the `pandas.io.json` alias
# is deprecated). It returns a fresh 0..n-1 index, so the flattened frame is
# given df's own index and attached positionally; a label-based join against
# that fresh index would pair rows with the wrong coordinates whenever df's
# index is not exactly 0..n-1 (e.g. repeated or filtered labels).
coordinates_flat = pd.json_normalize(
    df["coordinates"].map(ast.literal_eval).tolist()
).add_prefix("coordinates.")
coordinates_flat.index = df.index
df = pd.concat([df, coordinates_flat], axis=1)

  """Entry point for launching an IPython kernel.


### Numericalizing Strings

The price level is given by the number of $\$$ symbols returned by the API. Strings are harder to work with in this context, so we replace the price column with the number of $\$$ symbols instead of the string itself.

In [96]:
def numericalize(row):
    """Return the price level of a restaurant row as a number.

    The level is the length of the row's "price" string (one currency symbol
    per level). Rows without a price return ``pd.NA``: stringifying a missing
    value would give "nan" or "None", which would be miscounted as a price
    level of 3 or 4.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a "price" entry.

    Returns:
        The length of the price string as an int, or ``pd.NA`` when the price
        is missing.
    """
    price = row["price"]
    if pd.isna(price):
        return pd.NA
    return len(str(price))

# Overwrite the price column with numericalize's result for each row.
df["price"] = df.apply(numericalize, axis=1)

### Drop Unnecessary Categories Attributes

The categories column is an array of dictionaries. These dictionaries have "alias" and "title" as their keys. We will only use the "alias", so we turn the column into an array of aliases instead of an array of dictionaries.

In [97]:
# Parse each stringified list of category dicts, then keep only the "alias" values.
df["categories"] = (
    df["categories"]
    .map(ast.literal_eval)
    .map(lambda entries: [entry["alias"] for entry in entries])
)
# Flatten the stringified location dicts into "location.<key>" columns
# (e.g. "location.zip_code"). `pd.json_normalize` replaces the deprecated
# `pandas.io.json` alias; its result carries a fresh 0..n-1 index, so it is
# given df's own index and attached positionally instead of being joined by
# label, which would misalign rows when df's index is not exactly 0..n-1.
location_flat = pd.json_normalize(
    df["location"].map(ast.literal_eval).tolist()
).add_prefix("location.")
location_flat.index = df.index
df = pd.concat([df, location_flat], axis=1)

  


### Generate Arrondissement from Zip Code

In [98]:
def get_ar(zip_code_str):
    """Map a Paris zip code to its arrondissement number.

    Args:
        zip_code_str: Zip code, normally a string such as "75011". Empty,
            missing (None / NaN / pd.NA) or non-numeric values are accepted.

    Returns:
        The arrondissement as a string ("1" .. "20") for zip codes
        75001-75020 and for 75116 (the second postal code of the 16th
        arrondissement), otherwise ``pd.NA``.
    """
    try:
        zip_code_int = int(zip_code_str)
    except (TypeError, ValueError, OverflowError):
        # '' / None / NaN / pd.NA / malformed text: no usable zip code.
        return pd.NA

    if 75001 <= zip_code_int <= 75020:
        return str(zip_code_int - 75000)
    if zip_code_int == 75116:
        # The 16th arrondissement uses both 75016 and 75116.
        return "16"
    return pd.NA
        
# Derive the arrondissement column by applying get_ar to every flattened zip code.
zip_codes = df["location.zip_code"]
df["arrondissement"] = zip_codes.apply(get_ar)

### Drop Unnecessary Columns

Some of the business details returned by Yelp's API are not relevant to our goal, so we can safely drop these columns.

In [99]:
# Remove the listed columns from df in place.
df.drop(
    columns=[
        "phone",
        "display_phone",
        "distance",
        "image_url",
        "transactions",
        "location",
        "coordinates",
        "url",
        "location.address1",
        "location.address2",
        "location.address3",
        "location.city",
        "location.country",
        "location.state",
        "location.display_address",
        "location.zip_code",
    ],
    inplace=True,
)

### Save the Dataframe

Save the processed dataframe to disk.

In [100]:
# Make sure the output folder exists before writing, so the export also works
# when "proc_restaurants" has not been created yet (to_csv does not create
# missing directories).
os.makedirs("./../datasets/proc_restaurants", exist_ok=True)
df.to_csv("./../datasets/proc_restaurants/proc_restaurants.csv.zip")