# Yelp: Reviews Data Processing

*By Daniel Deutsch, José Lucas Barretto, Lucas Miguel Agrizzi, Kevin Kuhl.*

In [1]:
import os
import ast
import pandas as pd

from pandas.io.json import json_normalize

In [2]:
# Constants
URL_DEFAULT_IMG = "https://s3-media0.fl.yelpcdn.com/assets/srv0/yelp_styleguide/514f6997a318/assets/img/default_avatars/user_60_square.png"

In [3]:
df = pd.DataFrame()
for filename in [filename for filename in os.listdir("./../datasets/raw_reviews") if filename.endswith(".csv.zip")]:
    df = pd.concat([df, pd.read_csv(f"./../datasets/raw_reviews/{filename}", index_col=0)], ignore_index=True)

### Flatten Important Attributes

Some of the columns are a json string with important values, it is convenient to create a column for each one of these attributes

In [4]:
df = df.join(json_normalize(df["business"].map(ast.literal_eval).tolist()).add_prefix("business."))
df = df.join(json_normalize(df["comment"].map(ast.literal_eval).tolist()).add_prefix("comment."))
df = df.join(json_normalize(df["user"].map(ast.literal_eval).tolist()).add_prefix("user."))

  df = df.join(json_normalize(df["business"].map(ast.literal_eval).tolist()).add_prefix("business."))
  df = df.join(json_normalize(df["comment"].map(ast.literal_eval).tolist()).add_prefix("comment."))
  df = df.join(json_normalize(df["user"].map(ast.literal_eval).tolist()).add_prefix("user."))


### Creating has_img from user.src

In [5]:
df["has_img"] = df["user.src"].apply(lambda attr: attr != URL_DEFAULT_IMG)

### Drop Unnecessary Columns

Some of the columns in the dataframe aren't meaningful for our analysis, so we don't have to keep them.

In [6]:
df.drop(["business", "business.id", "business.name", "business.photoSrc", "comment", "user", "feedback",
         "user.altText", "user.eliteYear", "user.link", "user.markupDisplayName", "user.partnerAlias",
         "user.photoCount", "user.src", "user.srcSet", "user.userUrl", "photosUrl", "localizedDateVisited",
         "businessOwnerReplies", "userId", "previousReviews", "lightboxMediaItems", "photos", "tags", 
         "isUpdated", "appreciatedBy", "id"],
        axis=1, inplace=True)

df = df.rename(columns={'localizedDate': 'date'})

### Save the Dataframe

Saves the obtained dataframe

In [7]:
df.to_csv("./../datasets/proc_reviews/proc_reviews.csv.zip")