# Yelp: Fake Reviews Data Enhancement

*By Daniel Deutsch, José Lucas Barretto, Lucas Miguel Agrizzi, Kevin Kuhl.*

In [1]:
import numpy as np
import pandas as pd
import pycountry
from geopy.geocoders import Nominatim
from langdetect import detect

In [2]:
# Load the raw fake-review dataset; the first CSV column becomes the dataframe index.
df = pd.read_csv("./../datasets/raw_freviews/raw_freviews.csv.zip", index_col=0)

### Add Missing Columns

Since the fake reviews were obtained via web scraping (unlike the normal reviews, which were obtained through a hidden API), some of the columns could not be filled because the scraped pages did not provide that information. We now add these missing columns and fill them with a default value.

In [None]:
# Placeholder value: every row gets 0 because the scraped pages do not expose a photo count.
df["totalPhotos"] = 0

### Extract Country Information

The column *user.displayLocation* is of little use in its current free-text form. We therefore extract the country name and the ISO alpha-3 country code for each row.

In [None]:
# Geocoder shared by every call to extract_country; created on first use.
_GEOCODER = None


def _get_geocoder():
    """Return the shared Nominatim client, creating it on the first call."""
    global _GEOCODER
    if _GEOCODER is None:
        _GEOCODER = Nominatim(user_agent="myUserAgent")
    return _GEOCODER


def extract_country(row):
    """Resolve a row's free-text user location to a country name and code.

    Parameters
    ----------
    row : pandas.Series
        A dataframe row holding the location text under
        ``"user.displayLocation"``. ``row.name`` is only used in the
        progress output.

    Returns
    -------
    tuple
        ``(name, code)``: ``name`` is the last comma-separated part of the
        geocoded address and ``code`` is the ``alpha_3`` attribute of the
        ``pycountry`` entry with that name. Both are ``pd.NA`` when the
        location is missing or blank, when geocoding fails or finds nothing,
        or when ``pycountry`` has no country with that exact name.
    """
    name, code = pd.NA, pd.NA
    location = row["user.displayLocation"]

    # Only real, non-blank text is sent to the geocoder. A missing value
    # (NaN) would otherwise be looked up as if it were a place name.
    if isinstance(location, str) and location.strip():
        try:
            match = _get_geocoder().geocode(location)
            # `match` is None when nothing is found and the pycountry lookup
            # returns None for unknown names; both surface as AttributeError.
            country_name = match.address.split(", ")[-1]
            country_code = pycountry.countries.get(name=country_name).alpha_3
        except Exception:
            # Best effort: a single failed lookup (no match, unknown country
            # name, timeout, service error) must not abort the whole run.
            # `Exception` rather than a bare `except` so that interrupting
            # the cell (KeyboardInterrupt) still stops the loop.
            name, code = pd.NA, pd.NA
        else:
            name, code = country_name, country_code

    print(f"\r Progress: {row.name}, name: {name}, code: {code}", end="")
    return name, code

# Call extract_country on every row (axis=1); result_type="expand" spreads the
# returned (name, code) pair over the two new columns.
# NOTE(review): rows are geocoded one at a time, so this cell can be slow on a
# large dataframe - consider geocoding each distinct location only once.
df[["user.country.name", "user.country.code"]] = df.apply(extract_country, axis=1, result_type="expand")

### Detect the Language of The Comment

In our research it is important to know in which language each review was written.

In [None]:
def detect_language(row):
    """Detect the language of a row's review text.

    Parameters
    ----------
    row : pandas.Series
        A dataframe row (any mapping works) holding the review text under
        ``"comment.text"``.

    Returns
    -------
    str or pandas.NA
        The language code returned by ``langdetect.detect``, or ``pd.NA``
        when the text is missing, blank, or cannot be classified.
    """
    text = row["comment.text"]

    # Missing values (NaN/None) and blank strings carry nothing to analyse,
    # so skip the detector instead of relying on it to fail.
    if not isinstance(text, str) or not text.strip():
        return pd.NA

    try:
        return detect(text)
    except Exception:
        # The detector can raise on text with no usable features (e.g. only
        # digits or punctuation); such rows are marked as unknown.
        # `Exception` rather than a bare `except` so that interrupting the
        # cell (KeyboardInterrupt) is not silently recorded as a missing value.
        # NOTE(review): langdetect results may vary between runs unless its
        # DetectorFactory seed is fixed - confirm whether reproducibility
        # matters here.
        return pd.NA

# Run detect_language on every row (axis=1) and store the result in a new column.
df["comment.language"] = df.apply(detect_language, axis=1)

### Drop Rows with Rare Languages

Our analysis will only consider the comments written in English or in French, so we can drop all the other fake reviews.

In [None]:
# Language codes that are kept for the analysis
accepted_lang = ["en", "fr"]

# Keep only the rows whose "comment.language" value is in the accepted list,
# then renumber the rows (the previous index is kept as a regular column)
df = df[df["comment.language"].isin(accepted_lang)].reset_index()

### Drop Unnecessary Columns

Now that we have the information about the country of the user, we can drop the column *user.displayLocation*.

In [None]:
# Remove the raw location text column (axis=1), modifying df in place.
df.drop(["user.displayLocation"], axis=1, inplace=True)

### Save the Dataframe

Saves the obtained dataframe

In [None]:
# Write the enhanced dataframe to disk; the dataframe index is written too (to_csv default).
df.to_csv("./../datasets/enh_freviews/enh_freviews.csv.zip")