In [3]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', lambda x: '{:,.2f}'.format(x))

<h1><center> “Good Dinner AI”</center></h1>
<h3>Use case</h3>

“As a user of Good Dinner, I can:
* post reviews in the form of comments.
* post photos taken in the restaurant.”

“As Good Dinner, I wish to:
* Detect the topics of dissatisfaction in the comments posted on the platform.
* Automatically label photos posted on the platform. For example, identifying photos related to food or decor inside or outside the restaurant.” 

<h3>Scope of the project</h3>
Preliminary study of feature “Detect the topics of dissatisfaction” and “Automatically label posted photos”

<h3>Dataset and Data Collection</h3>

* Problem: Insufficient data available on the Good Dinner platform.
* Solution: Use an existing dataset.
>> * Link to the dataset: https://www.yelp.com/dataset (Contains general information (e.g., type of cuisine) and consumer reviews of different restaurants) 
>> * To fetch new data use the yelp API

<h3> Methodology and process</h3>

* analyze the comments in order to detect topics of dissatisfaction:
>> * pre-process text data
>> * use dimension reduction techniques
>> * visualize high-dimensional data
* analyze photos in order to determine their categories 
>> * pre-process images
>> * use dimension reduction techniques
>> * visualize high-dimensional data
* collect sample collection (approximately 200 restaurants) of data via the Yelp API:
>> * retrieve only the necessary fields
>> * store the results in a usable file format (e.g., csv)
<h3>Tools</h3>

* Python and specialized NLP/CV libraries
* Jupyter Notebook and Voilà package

<h2><center>LOAD DATA </center></h2>

Yelp Fusion API

https://docs.developer.yelp.com/docs/fusion-intro

In [9]:
# Read the business data to merge with reviews
business_json_path = 'C:/Users/spectre/Documents/OpenClassrooms_AI/data/yelp_dataset/yelp_dataset\yelp_academic_dataset_business.json'
df_b = pd.read_json(business_json_path, lines=True)
# Filter on only open businesses 1 = open, 0 = closed
df_b = df_b[df_b['is_open']==1]
# Drop any unnecessary columns
# drop_cols = ['hours','is_open','review_count']
# df_b = df_b.drop(drop_cols, axis=1)
df_b.info()

In [17]:
# Find relevant business categories
df_explode = df_b.assign(categories = df_b.categories
                         .str.split(', ')).explode('categories')
# list out the individual business categories
df_explode.categories.value_counts()
print('Total # of categories: ', len(df_explode.categories.value_counts()))
print('Top 10 Categories: ', df_explode.categories.value_counts()[:10])

#Find the categories containing RV
print('Categories with RV')
df_explode[df_explode.categories.str.contains('RV',
                      case=True,na=False)].categories.value_counts()

Total # of categories:  1302
Top 10 Categories:  categories
Restaurants                  34987
Food                         20419
Shopping                     20186
Home Services                13322
Beauty & Spas                12263
Health & Medical             11046
Local Services               10138
Automotive                    9879
Nightlife                     8379
Event Planning & Services     8173
Name: count, dtype: int64
Categories with RV


categories
RV Repair     103
RV Dealers    103
RV Parks       82
RV Rental      54
Name: count, dtype: int64

In [34]:
# Filter business dataset on restaurants
df_res = df_explode[df_explode['categories']=='Restaurants']
# df_res.info()

In [33]:
'''https://towardsdatascience.com/converting-yelp-dataset-to-csv-using-pandas-2a4c8f03bd88
https://towardsdatascience.com/load-yelp-reviews-or-other-huge-json-files-with-ease-ad804c2f1537'''
# There are multiple chunks to be read
# %%time
chunk_list = []
# read the review dataset in chunks of size 10000
size = 10000
r_dtypes = {'review_id':str,'user_id':str,
            'business_id':str,'stars':int,
            'date':str,'text':str,'useful':int,
            'funny':int,'cool':int}

with open('C:/Users/spectre/Documents/OpenClassrooms_AI/data/yelp_dataset/yelp_dataset/yelp_academic_dataset_review.json', "r") as f:
    reader = pd.read_json(f, orient="records", lines=True, 
                          dtype=r_dtypes, chunksize=size)
    
    # use  loop to pick only relevant cols and filter data on date
    for chunk_review in reader:
        # Drop columns that aren't needed
        chunk_review = chunk_review.drop(['review_id','useful','funny','cool'], axis=1)\
                             .query("`date` >= '2017-12-01'")
        
        # Renaming column name to avoid conflict with business overall star rating
        chunk_review = chunk_review.rename(columns={'stars': 'review_stars'})
        
        # Inner merge with edited business file so only reviews related to the business remain
        chunk_merged = pd.merge(df_res, chunk_review, on='business_id', how='inner')
        
        # Show feedback on progress
        print(f"{chunk_merged.shape[0]} out of {size:,} related reviews")
        chunk_list.append(chunk_merged)

# After trimming down the review file, concatenate all relevant data back to one dataframe
df = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

880 out of 10,000 related reviews
910 out of 10,000 related reviews
945 out of 10,000 related reviews
828 out of 10,000 related reviews
910 out of 10,000 related reviews
885 out of 10,000 related reviews
871 out of 10,000 related reviews
845 out of 10,000 related reviews
848 out of 10,000 related reviews
890 out of 10,000 related reviews
872 out of 10,000 related reviews
840 out of 10,000 related reviews
821 out of 10,000 related reviews
835 out of 10,000 related reviews
840 out of 10,000 related reviews
870 out of 10,000 related reviews
794 out of 10,000 related reviews
848 out of 10,000 related reviews
854 out of 10,000 related reviews
780 out of 10,000 related reviews
858 out of 10,000 related reviews
869 out of 10,000 related reviews
977 out of 10,000 related reviews
901 out of 10,000 related reviews
1012 out of 10,000 related reviews
1161 out of 10,000 related reviews
1277 out of 10,000 related reviews
1467 out of 10,000 related reviews
1640 out of 10,000 related reviews
1904 out 

In [35]:
df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,attributes,categories,user_id,review_stars,text,date
0,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106,39.95,-75.14,4.0,"{'RestaurantsReservations': 'True', 'Restauran...",Restaurants,Qsk0aTclam9W_DIK6bx42A,5,Stopped in to check out this new spot around t...,2017-12-16 00:13:06
1,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106,39.95,-75.14,4.0,"{'RestaurantsReservations': 'True', 'Restauran...",Restaurants,TJW1aEzjhaxbD10fjhokfQ,1,I live in the neighborhood and used to order a...,2018-04-28 00:46:05
2,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106,39.95,-75.14,4.0,"{'RestaurantsReservations': 'True', 'Restauran...",Restaurants,PYaXWwacBhNPVtXEZ1j-_w,5,We came here tonight just for a date night. We...,2018-01-10 02:56:31
3,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106,39.95,-75.14,4.0,"{'RestaurantsReservations': 'True', 'Restauran...",Restaurants,T9b7lYhNTAr7PyL4x8mmIw,5,Wow! What a great dining adventure! Huge rolls...,2017-12-29 23:57:16
4,kV_Q1oqis8Qli8dUoGpTyQ,Ardmore Pizza,10 Rittenhouse Pl,Ardmore,PA,19003,40.01,-75.29,3.5,"{'RestaurantsGoodForGroups': 'True', 'WiFi': '...",Restaurants,XOa3fuOw90GWvghLHyTQYA,2,Bunch of high school/college kids running the ...,2018-08-05 00:10:12


In [6]:
import spacy

# import the model
import en_core_web_sm

ModuleNotFoundError: No module named 'en_core_web_sm'

In [8]:
# Import dependencies
from splinter import Browser
from bs4 import BeautifulSoup

# Launch the browser
browser = Browser('edge')

In [4]:
# Set the URL to visit to a variable
url = 'https://www.yelp.com/dataset/download/'

# Send the browser to the URL
browser.visit(url)

# Save the HTML from the browser
html = browser.html

# Create a BeautifulSoup object from the HTML
soup = BeautifulSoup(html, 'html.parser')

In [5]:
# Find all divs that contain a question
question_divs = soup.find_all('div', class_="s-post-summary js-post-summary")

In [7]:
question_divs

[]

In [6]:
# Find the summary text of the first question
question_divs[0].find("a", class_="s-link").text

IndexError: list index out of range

In [None]:
# Close the browser
browser.quit()

In [3]:
yelp_batch = []
var_dtypes = {"stars": np.float16, 
            "useful": np.int32, 
            "funny": np.int32,
            "cool": np.int32,
           }
with open("https://www.yelp.com/dataset/download/yelp_academic_dataset_review.json", "r") as f:
    reader = pd.read_json(f, orient="records", lines=True, 
                          dtype=var_dtypes, chunksize=1000)
        
    for chunk in reader:
        reduced_chunk = chunk.drop(columns=['review_id', 'user_id'])\
                             .query("`date` >= '2017-12-01'")
        yelp_batch.append(reduced_chunk)
    
yelp_batch = pd.concat(yelp_batch, ignore_index=True)

OSError: [Errno 22] Invalid argument: 'https://www.yelp.com/dataset/download/yelp_academic_dataset_review.json'