# Yelp: Reviews Data Enhancement

*By Daniel Deutsch, José Lucas Barretto, Lucas Miguel Agrizzi, Kevin Kuhl.*

In [None]:
import numpy as np
import pandas as pd
import pycountry
from geopy.geocoders import Nominatim
import re

In [None]:
# Load the processed reviews; the first CSV column is used as the index.
proc_reviews_path = "./../datasets/proc_reviews/proc_reviews.csv.zip"
df = pd.read_csv(proc_reviews_path, index_col=0)

### Extract Country Information

The column *user.displayLocation* of the dataframe is not useful in its current free-text form. For each row, we extract the country name and the country's ISO alpha_3 code from it.

In [None]:
# One geocoder client is shared by every row instead of being rebuilt per call.
_GEOCODER = Nominatim(user_agent="myUserAgent")

# Maps a raw location string to its (name, code) result so that repeated
# locations cost a single network request.
_COUNTRY_CACHE = {}


def _lookup_country(location):
    """Resolve a free-text location to ``(country name, ISO alpha_3 code)``.

    Args:
        location: Raw value of the ``user.displayLocation`` column.

    Returns:
        A ``(name, code)`` tuple, or ``(pd.NA, pd.NA)`` when the location is
        missing/blank, the geocoder fails or finds nothing, or the country
        name is not known to ``pycountry``.
    """
    missing = (pd.NA, pd.NA)

    # Missing or blank locations cannot be geocoded; skip the request.
    if not isinstance(location, str) or not location.strip():
        return missing

    if location in _COUNTRY_CACHE:
        return _COUNTRY_CACHE[location]

    try:
        # Ask for English names so the last address component can be matched
        # against pycountry's English country names.
        match = _GEOCODER.geocode(location, language="en")
    except Exception:
        # Best effort: a timeout or service error on one row must not abort
        # the whole run. The failure is not cached, so a later row with the
        # same location gets another attempt.
        return missing

    result = missing
    if match is not None:
        # The country is taken to be the last comma-separated address part.
        name = match.address.split(", ")[-1]
        try:
            country = pycountry.countries.get(name=name)
        except LookupError:
            # NOTE(review): some pycountry versions may raise instead of
            # returning None for unknown names -- handled either way.
            country = None
        if country is not None:
            result = (name, country.alpha_3)

    _COUNTRY_CACHE[location] = result
    return result


def extract_country(row):
    """Return the country name and ISO alpha_3 code for one dataframe row.

    Args:
        row: A dataframe row (``pd.Series``) with a ``user.displayLocation``
            entry.

    Returns:
        ``(name, code)``; both are ``pd.NA`` when the country cannot be
        determined.
    """
    name, code = _lookup_country(row["user.displayLocation"])
    # "\r" rewrites the same output line to act as a lightweight progress bar.
    print(f"\r Progress: {row.name}, name: {name}, code: {code}", end="")
    return name, code


# NOTE(review): the public Nominatim service has a usage policy; consider
# throttling requests if the dataset has many distinct locations.
df[["user.country.name", "user.country.code"]] = df.apply(
    extract_country, axis=1, result_type="expand"
)

### Drop Unnecessary Columns

Now that we have the information about the country of the user, we can drop the column *user.displayLocation*.

In [None]:
# The raw location text is no longer needed once the country has been extracted.
df.drop(columns=["user.displayLocation"], inplace=True)

### Remove HTML tags from text

In [None]:
# Non-greedy match of anything between angle brackets, i.e. one HTML tag.
regex_pat = re.compile(r"<.*?>")

# `Series.str.replace` takes one pattern plus a replacement string, so the two
# substitutions are chained instead of being passed as a single dict (which
# fails on pandas versions where `repl` is a required argument).
df["comment.text"] = (
    df["comment.text"]
    .str.replace(regex_pat, "", regex=True)  # remove HTML tags
    .str.replace("&#39;", "", regex=False)  # remove the literal "&#39;" entity
)

### Save the Dataframe

Save the resulting dataframe to a zipped CSV file.

In [None]:
# Write the enhanced reviews to disk at the path below.
enh_reviews_path = "./../datasets/enh_reviews/enh_reviews.csv.zip"
df.to_csv(enh_reviews_path)