
# <center><span style='background:yellow'>Améliorez le produit IA de votre start-up (partie 1)</span></center>
## <center><span style='background:yellow'>Collecte des données</span></center>
=========================================================================================================================

In [1]:
# Read the Yelp API key from the first line of the local key file.
with open("yelpkey.txt") as key_file:
    # readline() keeps the line terminator; strip it so the key can be
    # concatenated into the Authorization header without a trailing newline.
    api_key = key_file.readline().strip()

In [2]:
import pandas as pd

from gql.transport.requests import RequestsHTTPTransport
from gql import Client, gql

#Fonction qui collecte les id et noms des restaurants
def get_business(city, term="restaurant", limit=50):
    """
    Search Yelp for businesses in a location and return their ids and names.

    Sends a GraphQL ``search`` query to the Yelp endpoint, authenticated with
    the notebook-level ``api_key``.

    Args:
        city (str): Location string passed as the ``location`` argument of the search.
        term (str): Search term. Defaults to "restaurant", the value that was
            previously hard-coded in the query.
        limit (int): Maximum number of businesses requested. Defaults to 50,
            the value that was previously hard-coded in the query.

    Returns:
        pandas.DataFrame: One row per business, built from the ``id`` and
        ``name`` fields of the response.
    """
    endpoint = 'https://api.yelp.com/v3/graphql'
    # strip() guards against a line terminator left over from reading the key
    # from a file; it must not end up inside the header value.
    headers = {'Authorization': 'Bearer ' + api_key.strip()}
    variables = {'city': city, 'term': term, 'limit': limit}

    # Build the HTTP transport carrying the authentication header.
    transport = RequestsHTTPTransport(url=endpoint, headers=headers, use_json=True)

    # Create the GraphQL client on top of that transport.
    client = Client(transport=transport, fetch_schema_from_transport=True)

    # The search criteria are passed as GraphQL variables rather than being
    # baked into the query text.
    query = gql(''' query($city: String!, $term: String, $limit: Int)
    {
      search (term : $term, location : $city , limit : $limit) {
        business{
          id
          name
        }
      }
    }
    ''')
    response = client.execute(query, variable_values=variables)

    return pd.DataFrame(response["search"]["business"])



#Fonction qui collecte les commentaires 
def get_reviews(identifiant, name):
    """
    Fetch the reviews Yelp returns for one business.

    Sends a GraphQL ``reviews`` query to the Yelp endpoint, authenticated with
    the notebook-level ``api_key``.

    Args:
        identifiant (str): Yelp business id, passed as the ``business`` argument.
        name (str): Business name, copied into every returned row.

    Returns:
        pandas.DataFrame or None: The review fields ``id``, ``rating`` and
        ``text`` plus the added ``business_id`` and ``business_name`` columns.
        ``None`` when the request or the parsing of its response fails.
    """
    endpoint = 'https://api.yelp.com/v3/graphql'
    # strip() guards against a line terminator left over from reading the key
    # from a file; it must not end up inside the header value.
    headers = {'Authorization': 'Bearer ' + api_key.strip()}
    business_id = {'Id': identifiant}

    # Build the HTTP transport carrying the authentication header.
    transport = RequestsHTTPTransport(url=endpoint, headers=headers, use_json=True)

    # Create the GraphQL client on top of that transport.
    client = Client(transport=transport, fetch_schema_from_transport=True)

    query = gql(''' query($Id: String!)
    {
      reviews (business : $Id) {
        review{
          id
          rating
          text
        }
      }
    }
    ''')
    try:
        response = client.execute(query, variable_values=business_id)
        reviews = pd.DataFrame(response['reviews']['review'])
        # Tag every review with the business it belongs to.
        reviews = reviews.assign(business_id=identifiant, business_name=name)
    except Exception as exc:
        # Best effort: a failing business must not abort the whole collection,
        # but the cause is reported so that authentication or network problems
        # are not mistaken for "no reviews". Catching Exception (instead of a
        # bare except) lets KeyboardInterrupt / SystemExit propagate.
        print(f'No review found for {name!r} ({identifiant}): {type(exc).__name__}: {exc}')
        reviews = None

    return reviews

In [3]:
# Smoke test of get_business: preview the first rows returned for 'paris'.
paris_businesses = get_business('paris')
paris_businesses.head()

Unnamed: 0,id,name
0,-0iLH7iQNYtoURciDpJf6w,Le Comptoir de la Gastronomie
1,IU9_wVOGBKjfqTTpAXpKcQ,Bistro des Augustins
2,cEjF41ZQB8-SST8cd3EsEw,L'Avant Comptoir
3,KggnM_Z4wOa_JExunaaWHg,Le Temps des Cerises
4,-umFmobUgpW_05m_ud1vHw,La Cordonnerie


In [4]:
# Smoke test of get_reviews: preview the reviews returned for one business id.
sample_reviews = get_reviews(
    identifiant='-0iLH7iQNYtoURciDpJf6w',
    name='Le Comptoir de la Gastronomie',
)
sample_reviews.head()

Unnamed: 0,id,rating,text,business_id,business_name
0,zi9OVQcGzeHJ6_T6jRrwkQ,5,"If you like all things duck, then this is your...",-0iLH7iQNYtoURciDpJf6w,Le Comptoir de la Gastronomie
1,ZwzpceATTrUM7Brv90_usg,5,"If you like foie gras and duck, then you will ...",-0iLH7iQNYtoURciDpJf6w,Le Comptoir de la Gastronomie
2,iJo5H5B0cLwzCY2wVS-VOw,5,foie gras heaven! the duck foie gras ravioli i...,-0iLH7iQNYtoURciDpJf6w,Le Comptoir de la Gastronomie


In [5]:
# Collect the restaurants returned for several Paris location strings.
cities = ["paris6", "paris3", "paris8", "paris19", "paris1"]
# Fetch every location first and concatenate once, instead of re-copying the
# accumulated frame on each loop iteration.
data_business_cities = pd.concat(
    [get_business(city) for city in cities],
    axis=0,
    ignore_index=True,
)
data_business_cities.head()

Unnamed: 0,id,name
0,IU9_wVOGBKjfqTTpAXpKcQ,Bistro des Augustins
1,-0iLH7iQNYtoURciDpJf6w,Le Comptoir de la Gastronomie
2,cEjF41ZQB8-SST8cd3EsEw,L'Avant Comptoir
3,_MNAGLsoe_vihaXk6tpffw,Le Bistrot d'Henri
4,K_GEE-rokTt1u1OORgu1tw,La Cuisine de Philippe


In [6]:
# Drop the rows whose restaurant id is duplicated, keeping the first occurrence.
data_business_cities = data_business_cities.drop_duplicates(subset='id')
print('Le nombre des identifiants des restaurants collectés est :', len(data_business_cities))

Le nombre des identifiants des restaurants collectés est : 222


In [7]:
# Collect the reviews of every restaurant gathered above.
data_business_cities = data_business_cities.reset_index(drop=True)

# Gather the per-restaurant frames in a list and concatenate once at the end.
# get_reviews returns None when nothing could be fetched; those results are
# skipped individually, so a failure on the first restaurant no longer
# prevents the reviews of all the following ones from being collected.
review_frames = []
for business_id, business_name in zip(data_business_cities['id'], data_business_cities['name']):
    data_review = get_reviews(business_id, business_name)
    if data_review is not None:
        review_frames.append(data_review)

if review_frames:
    data_reviews = pd.concat(review_frames, axis=0, ignore_index=True)
else:
    # Nothing could be fetched: keep an empty frame with the expected columns
    # so that the following cells still run.
    data_reviews = pd.DataFrame(columns=['id', 'rating', 'text', 'business_id', 'business_name'])

No review found
No review found
No review found
No review found
No review found
No review found
No review found
No review found
No review found
No review found
No review found
No review found
No review found
No review found
No review found
No review found
No review found
No review found
No review found
No review found


In [8]:
# Preview the first five collected reviews.
data_reviews.head(n=5)

Unnamed: 0,id,rating,text,business_id,business_name
0,hoE3LQ5hS2HAow7BwkexZw,5,Great location and classic food. \n\nThey are ...,IU9_wVOGBKjfqTTpAXpKcQ,Bistro des Augustins
1,DvKcp94DLKhdKVB_jSb_og,1,I am so bummed we wasted a meal here during ou...,IU9_wVOGBKjfqTTpAXpKcQ,Bistro des Augustins
2,pjgcPURNS2PvAQ6a0LOShw,5,"You cannot beat this spot for food, drinks, an...",IU9_wVOGBKjfqTTpAXpKcQ,Bistro des Augustins
3,zi9OVQcGzeHJ6_T6jRrwkQ,5,"If you like all things duck, then this is your...",-0iLH7iQNYtoURciDpJf6w,Le Comptoir de la Gastronomie
4,ZwzpceATTrUM7Brv90_usg,5,"If you like foie gras and duck, then you will ...",-0iLH7iQNYtoURciDpJf6w,Le Comptoir de la Gastronomie


In [9]:
# Reorder the columns: identifiers first, then the rating and the review text.
column_order = ['id', 'business_id', 'business_name', 'rating', 'text']
data_reviews = data_reviews.reindex(columns=column_order)
data_reviews.head()

Unnamed: 0,id,business_id,business_name,rating,text
0,hoE3LQ5hS2HAow7BwkexZw,IU9_wVOGBKjfqTTpAXpKcQ,Bistro des Augustins,5,Great location and classic food. \n\nThey are ...
1,DvKcp94DLKhdKVB_jSb_og,IU9_wVOGBKjfqTTpAXpKcQ,Bistro des Augustins,1,I am so bummed we wasted a meal here during ou...
2,pjgcPURNS2PvAQ6a0LOShw,IU9_wVOGBKjfqTTpAXpKcQ,Bistro des Augustins,5,"You cannot beat this spot for food, drinks, an..."
3,zi9OVQcGzeHJ6_T6jRrwkQ,-0iLH7iQNYtoURciDpJf6w,Le Comptoir de la Gastronomie,5,"If you like all things duck, then this is your..."
4,ZwzpceATTrUM7Brv90_usg,-0iLH7iQNYtoURciDpJf6w,Le Comptoir de la Gastronomie,5,"If you like foie gras and duck, then you will ..."


In [10]:
# Report the shape of the collected reviews and the largest number of reviews
# obtained for a single restaurant.
reviews_shape = data_reviews.shape
max_reviews_per_business = data_reviews['business_id'].value_counts().max()
print('Le nombre des commentaires collectés est', reviews_shape)
print(f'On a pu obtenir que {max_reviews_per_business} commentaires au maximum par restaurant')

Le nombre des commentaires collectés est (538, 5)
On a pu obtenir que 3 commentaires au maximum par restaurant


In [11]:
    # Save every collected review to a semicolon-separated CSV file (header row, no index).
    data_reviews.to_csv(path_or_buf='all_reviews.csv', sep=';', header=True, index=False)

In [12]:
# Extract the negative reviews, i.e. those rated strictly below 3.
is_negative = data_reviews['rating'] < 3
data_bad_reviews = data_reviews.loc[is_negative]
data_bad_reviews.head()

Unnamed: 0,id,business_id,business_name,rating,text
1,DvKcp94DLKhdKVB_jSb_og,IU9_wVOGBKjfqTTpAXpKcQ,Bistro des Augustins,1,I am so bummed we wasted a meal here during ou...
8,8gjKndXw9DKcqKnFWX19qw,cEjF41ZQB8-SST8cd3EsEw,L'Avant Comptoir,1,"This place is SO overrated, probably the worst..."
23,5Tt7q2-VeSnemmcPbrgEpg,idsxibpS8-MiaGil3WsMjQ,Au Petit Suisse,1,"This was, by far, our worst meal in Paris. So ..."
68,DyzRP5PMxw_L1MRAl40PhQ,ctP4c3mwVO5oOzLI48LtuQ,Les Antiquaires,1,What a joke of a place. Would review the food ...
80,2AdyqV2xqHPMyzQosDui1g,arT8b1fhGKwMzZ4WsaXUww,Eggs & Co,2,Decent restaurant but very packed and very sma...


In [13]:
# Report the shape of the negative reviews and the largest number of negative
# reviews obtained for a single restaurant.
bad_reviews_shape = data_bad_reviews.shape
max_bad_reviews_per_business = data_bad_reviews['business_id'].value_counts().max()
print('Le nombre des commentaires négatifs collectés est :', bad_reviews_shape)
print(f'On a pu obtenir que {max_bad_reviews_per_business} commentaires au maximum par restaurant')

Le nombre des commentaires négatifs collectés est : (24, 5)
On a pu obtenir que 2 commentaires au maximum par restaurant


En collectant les données via l'API Yelp, on n'obtient que quelques commentaires négatifs (dont le score est égal à 1 ou 2). C'est trop peu pour analyser les textes ; on va donc utiliser, pour notre analyse, les données disponibles sur le lien suivant : https://www.yelp.com/dataset.

In [14]:
    # Save the negative reviews to a semicolon-separated CSV file (header row, no index).
    data_bad_reviews.to_csv(path_or_buf='MODESTE_Khadija_1_csv_102022.csv', sep=';', header=True, index=False)