# Yelp: Fake Review Data Ingestion

*By Daniel Deutsch, José Lucas Barretto, Lucas Miguel Agrizzi, Kevin Kuhl.*

In [None]:
import json
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Constants
# URL of Yelp's default user avatar picture (per the `default_avatars` URL path)
URL_DEFAULT_IMG = "https://s3-media0.fl.yelpcdn.com/assets/srv0/yelp_styleguide/514f6997a318/assets/img/default_avatars/user_60_square.png"

# NordVPN possible countries
# Loaded through a context manager so the file handle is closed right after reading
with open("./constants/countries.json", encoding="utf-8") as countries_file:
    countries = json.load(countries_file)

In [None]:
# Load the restaurants table; the first CSV column is used as the index.
# The scraping cell reads its `alias` and `freview_count` columns.
df_restaurants = pd.read_csv(
    filepath_or_buffer="./../datasets/enh_restaurants/enh_restaurants.csv",
    index_col=0,
)

### Obtains the Fake Reviews

For each restaurant, scrape the first page (up to 10 entries) of its not-recommended ("fake") reviews from Yelp.

In [None]:
# Create an empty dataframe to store reviews
df_reviews = pd.DataFrame(columns=['user.displayLocation', 'user.friendCount', 'user.reviewCount', 'is_fake', 'date', 'business.alias', 'comment.text', 'rating', 'has_img'])

# Creating useful variables
row_count = df.shape[0]
progress = 0
total_reviews = 0

# Iterating over each row of the dataframe of restaurants
for index, row in df_restaurants.iterrows():
    iterator = 0
    less10 = 0
    # Getting the number of fake reviews for the current restaurant
    number_fake_reviews = int(row["freview_count"])
    
    # Check if there are less than 10 reviews (all reviews would be in a single page)
    if number_fake_reviews < 10:
        less10 = 1
        
    # Update progress counter if the restaurant has no fake reviews (and do not enter the while loop below)
    if number_fake_reviews == 0:
        progress += 1
        
    # Builds the url of the restaurant
    url = f"https://www.yelp.fr/not_recommended_reviews/{row['alias']}"

    # Tries to get the fake reviews
    # Inside a while loop to treat all possible exceptions
    while(number_fake_reviews != 0):
        try:
            # Do the request
            r = requests.get(url)
            # Parse the reponse from the request
            soup = BeautifulSoup(r.text, 'html.parser')
            # Get all the <li> elements from the page (those contains the reviews)
            reviews_in_page = soup.find_all("ul")[0].find_all("li")
            
            # Iterate over the first 10 fake reviews of the current analysed restaurant
            for i in range(10 if not less10 else number_fake_reviews):
                # First, we analyse if there is a icon for posted photos by the user
                # This changes the way we iterate between one review and another
                # As it adds an extra <li> block to be counted
                # It indicates how many photos the user has already posted in Yelp
                reviews_have_photos = True if len(reviews_in_page[iterator].find_all(class_="photo-count responsive-small-display-inline-block"))>0 else False
                # Initializing an empty dictionary to contain the review
                review = {}
                # First filling it with the user's origin
                review["user.displayLocation"] = reviews_in_page[iterator].find_all("b")[0].text
                # Sometimes Yelp uses Membre Qype and Membre Cityvox in the alias of a given user
                # When this is the case, we have to ignore the first response for the user origin and
                # Start capturing from the second entry (<b> blocks)
                if review["user.displayLocation"] == "Membre Qype" or review["user.displayLocation"]=="Membre Cityvox":
                    review["user.displayLocation"] = reviews_in_page[iterator].find_all("b")[1].text
                    review["user.friendCount"] = reviews_in_page[iterator].find_all("b")[2].text
                    review["user.reviewCount"] = reviews_in_page[iterator].find_all("b")[3].text
                else:
                    review["user.friendCount"] = reviews_in_page[iterator].find_all("b")[1].text
                    review["user.reviewCount"] = reviews_in_page[iterator].find_all("b")[2].text
                # Indicates this is a fake review (to use when comparing with real reviews)
                review["is_fake"] = True
                review["date"] = reviews_in_page[iterator].find_all("span", class_="rating-qualifier")[0].text.strip()
                review["business.alias"] = f"{row['alias']}"
                review["comment.text"] = reviews_in_page[iterator].find_all("p")[0].text
                review["rating"] = reviews_in_page[iterator].find_all("img", class_="offscreen")[0].attrs["alt"].split(" ")[0]
                review["has_img"] = reviews_in_page[iterator].find_all("img", class_="offscreen")[0].attrs["src"] != URL_DEFAULT_IMG
                # Add it to the dataframe
                df_reviews = df_reviews.append(review.copy(), ignore_index=True)
                # Update the iterator
                if reviews_have_photos:
                    iterator += 6
                else:
                    iterator += 5
        # This never happens, but we wanted to make a stable program
        except KeyError:
            progress+=1
            break
        # All possible exceptions will be related to conections to Yelp's website
        # We solve them by changing to another country
        except Exception:
            country = random.choice(countries)
            # Runs on the terminal
            ! nordvpn connect {country}
            time.sleep(5)
        # If there is no exceptions, we update the progress counter
        # Also updating the total number of reviews retrieved and printing the status
        else:
            progress += 1
            total_reviews += 10 if not less10 else number_fake_reviews
            print(f"\rProgress: row {progress}/{row_count}, accessing from: {country}, total reviews: {total_reviews}", end="")
            break

### Save the Dataframe

Save the collected reviews to a zip-compressed CSV file.

In [None]:
# Write the scraped reviews (index included) to disk; pandas infers zip
# compression from the ".zip" suffix of the target path.
df_reviews.to_csv(
    path_or_buf="./../datasets/raw_freviews/raw_freviews.csv.zip",
)