In [None]:
import urllib.request
import json
import time
import datetime
import random
import spacy
from spacy.util import minibatch, compounding
from spacy.training import Example

A helper function for repeating HTTP requests until they succeed.

In [None]:
def request_data_from_url(url, retry_delay=1, max_retries=None):
    """A utility function for making http requests and repeating them until they succeed
    Args:
        url (str): url for which to make a request
        retry_delay (float): number of seconds to wait between two attempts
        max_retries (int or None): how many times to retry after the first failed attempt; None (the default) retries indefinitely
    Returns:
        bytes: the raw body of the first response which has the HTTP status code 200
    Raises:
        ValueError: if the url is malformed; repeating the request cannot fix that, so it is raised immediately
        Exception: the last failure, once max_retries retries have been used up
    """
    failed_attempts = 0
    while True:
        try:
            # The context manager closes the connection after a success and after a failure alike
            with urllib.request.urlopen(url) as response:
                status = response.getcode()
                # 200 is the HTTP response code for success
                if status == 200:
                    return response.read()
            failure = RuntimeError("Unexpected HTTP status %s" % status)
        except ValueError:
            # urlopen rejects malformed urls with ValueError; retrying would loop forever
            raise
        except Exception as e:
            failure = e

        failed_attempts += 1
        print(failure)
        print("Error for URL %s: %s" % (url, datetime.datetime.now()))
        if max_retries is not None and failed_attempts > max_retries:
            raise failure
        print("Retrying...")
        # Wait after every failed attempt (including non-200 answers) to avoid hammering the server
        time.sleep(retry_delay)

A function which, given geographic coordinates and a type of place, fetches a list of places near those coordinates (within a radius of 1000 meters) and, for each place, fetches its rating and its first 5 reviews (the API returns just 5 by default).

In [None]:
def scrape_google_maps_data(api_key, location, place_type):
    """A utility function for getting a list of places and corresponding reviews with given type around given coordinates
    Args:
        api_key (str): an authentication key for the API of Google Maps
        location (str): the geographical coordinates around which to search for places, formatted as "latitude,longitude"
        place_type (str): the type of places to search for
    Returns:
        dictionary: the keys are the names of the places and the values are pairs of their numeric rating (number of stars) in Google Maps and a list containing the text of some of their reviews. The rating is None and the list is empty for places for which the API returns no rating or no reviews. Places which share a name overwrite each other.
    """
    # Imported here so that the function does not rely on urllib.parse being loaded as a side effect of another import
    import urllib.parse

    def build_url(endpoint, params):
        # Commas are kept literal so that "lat,lng" and "rating,reviews" are sent exactly as written
        return endpoint + '?' + urllib.parse.urlencode(params, safe=',')

    search_url = build_url(
        'https://maps.googleapis.com/maps/api/place/nearbysearch/json',
        {'location': location, 'radius': 1000, 'type': place_type, 'key': api_key},
    )
    places_list = json.loads(request_data_from_url(search_url))

    result = {}
    for place in places_list.get('results', []):
        details_url = build_url(
            'https://maps.googleapis.com/maps/api/place/details/json',
            {'place_id': place['place_id'], 'language': 'en', 'fields': 'rating,reviews', 'key': api_key},
        )
        place_details = json.loads(request_data_from_url(details_url))
        # Both requested fields may be absent from the answer, e.g. for places nobody has reviewed yet
        details = place_details.get('result', {})
        review_texts = [review['text'] for review in details.get('reviews', [])]
        result[place['name']] = (details.get('rating'), review_texts)

    return result

The pretrained models provided by spaCy, especially the small ones, don't recognize many foods as belonging to the PRODUCT category, but only as nouns. So we need to extend the model with new examples. The next cell contains code and example data for retraining a preloaded model.

In [None]:
# Data and procedure for retraining the model to recognise additional product types.
# Each item is a pair (text, annotations); annotations["entities"] lists
# (start, end, label) triples, where start and end are the character offsets of
# the labelled words in the text (e.g. 7-12 is "pizza" in "I love pizza").
# NOTE(review): "Good food." appears twice below - confirm whether the duplicate is intentional.
TRAIN_DATA = [
    ("I love pizza", {"entities": [(7, 12, "PRODUCT")]}),
    ("Let's get some ice cream.", {"entities": [(15, 24, "PRODUCT")]}),
    ("Let's go to the restaurant.", {"entities": [(16, 26, "PRODUCT")]}),
    ("Good food.", {"entities": [(5, 9, "PRODUCT")]}),
    ("drinking", {"entities": [(0, 8, "PRODUCT")]}),
    ("Good food.", {"entities": [(5, 9, "PRODUCT")]}),
]

def train(nlp, n_iter=100):
    """A function which retrains the given model to recognize more words for food as belonging to the PRODUCT category.
    Args:
        nlp (Language object): existing spaCy pipeline with a loaded pretrained model which should be trained additionally; it is updated in place
        n_iter (int): number of iterations (passes over TRAIN_DATA) for retraining
    Returns:
        None
    """
    # Only the tokenizer is needed to build the training examples
    examples = [Example.from_dict(nlp.make_doc(text), annots) for text, annots in TRAIN_DATA]

    if "ner" in nlp.pipe_names:
        # Just get it if it already exists
        ner = nlp.get_pipe("ner")
    else:
        # spaCy v3 (required by the spacy.training.Example import) expects the registered
        # component name in add_pipe and returns the created component
        ner = nlp.add_pipe("ner", last=True)
        # A freshly added component has no weights yet and must be initialized before nlp.update()
        ner.initialize(lambda: examples, nlp=nlp)

    # Add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    # Disable pipelines, which are not necessary for the training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.select_pipes(disable=other_pipes):  # only train NER
        for _ in range(n_iter):
            random.shuffle(examples)
            losses = {}
            # pack the examples using spaCy minibatch
            for batch in minibatch(examples, size=compounding(4.0, 32.0, 1.001)):
                nlp.update(
                    batch,
                    drop=0.5,
                    losses=losses,
                )

The next cell contains a list of superlatives which we will use for the scoring and their corresponding scores as well as some sample reviews for testing without the Google Maps API key.

In [None]:
# Evaluative words (called superlatives in this notebook) used for scoring.
# The value is the score a word contributes, from -3 (most negative) to 3 (most positive);
# calculate_scores() averages the values of the words it finds.
scores = {
    "terrible" : -3,
    "awful" : -2,
    "bad" : -1,
    "reasonable" : 0,
    "average" : 0,
    "good" : 1,
    "excellent" : 2,
    "delicious" : 2,
    "wonderful" : 2,
    "perfect" : 3,
    "superb" : 3
}
# Extracted sample reviews for testing without Google Maps API key.
# The structure mirrors the result of scrape_google_maps_data():
# place name -> (rating, list with the texts of the reviews).
example_reviews = {
    "Beykoz":
        (4.4, [
            "Good range of food and drinks from pizza, to kebabs, wine, beer and ice cream.",
            "Amazing choice of wonderful food from anywhere you can imagine. Lots of seating, mostly sharing tables. Great atmosphere. Shop too, freshly baked bread, croissants, and all kinds of veg and other produce. Toilets clean and adequate but not best I've seen. I'll definitely go back if in the area.",
            "Amongst the best food courts in London. Super spacious and a huge variety of delicious food and drink options across the site.",
            "Really great atmosphere and vibes, lots of seats and food stalls with diverse cuisine to choose from! Great place for a big group, or a fun date.",
            "Ok so. Let's review Beykoz. Probably you hear the name Beykoz, but this restaurant is not that one. Near Mogan lake, this restaurant is above average which come to mind with excellent service and steaks. In fact Beykoz is not only a restaurant that you can find good steak but also excellent drinking.",
            "Excellent steaks! Good prices.. The ambience is perfect for a friend's night out!!"]
        )
}


A function which, given some text, extracts pairs of foods and the adjectives which describe them directly.

In [None]:
def extract_food_relations(doc):
    """Collect pairs (describing token, food token) for the PRODUCT tokens found in the given text.
    Args:
        doc (a spaCy Doc object): the result of running dependency parsing with spaCy on some text; it is retokenized in place
    Returns:
        list: a list of pairs (token, food) where food is a token with entity type PRODUCT and token is the head of the head of food in the dependency tree (intended to be the phrase which describes the food)
    """
    # Every entity and every noun chunk becomes a single token;
    # overlapping candidates are dropped first, because they cannot be merged together
    merge_candidates = spacy.util.filter_spans([*doc.ents, *doc.noun_chunks])
    with doc.retokenize() as retokenizer:
        for candidate in merge_candidates:
            retokenizer.merge(candidate)

    food_deps = {"pobj", "ROOT"}
    head_deps = {"adj", "prep", "ROOT"}
    return [
        (token.head.head, token)
        for token in doc
        if token.ent_type_ == "PRODUCT"
        and token.dep_ in food_deps
        and token.head.dep_ in head_deps
    ]

The main function for extracting a numeric score from the review text. It consumes the dictionary which is returned by the function scrape_google_maps_data().

In [None]:
def calculate_scores(places, model="en_core_web_sm"):
    """Print, for every place, its Google Maps rating next to a score derived from the text of its reviews.

    The model is loaded and retrained with train() on every call. The derived score of a place
    is the average of the values in `scores` of all the words found (as substrings) in the
    phrases returned by extract_food_relations(); it is 0 when no such word is found.
    Args:
        places (dictionary): a dictionary with places and their reviews, structured like the result of scrape_google_maps_data()
        model (str): the name of the model to be loaded into spaCy
    Returns:
        None
    """
    nlp = spacy.load(model)
    print("The model '%s' is loaded into spaCy." % model)
    train(nlp)
    print("The model '%s' is retrained." % model)
    print("Processing %d places." % len(places))

    for name, reviews in places.items():
        # One entry for every (phrase, word) match over all the reviews of the place
        matched_scores = []
        for text in reviews[1]:
            for describing, _food in extract_food_relations(nlp(text)):
                phrase = describing.text.lower()
                matched_scores.extend(
                    score for word, score in scores.items() if word in phrase
                )

        if matched_scores:
            calculated_score = sum(matched_scores) / len(matched_scores)
        else:
            calculated_score = 0
        print("place: %s - Google Maps raiting %s, calculated raiting %s" % (name, reviews[0], calculated_score))

Using the Google Maps API requires setting up an account and generating a special key, which is to be included in each request. Enabling the key requires adding a credit card for billing (even when staying within the free quota).
The key included here is no longer active, so for testing there is a commented-out call using some hard-coded data located in one of the cells above.

In [None]:
import os

# A working key can be supplied through the GOOGLE_MAPS_API_KEY environment variable,
# so that it never has to be written into this file. The literal fallback is the
# sample key, which is reported to be no longer active.
api_key = os.environ.get('GOOGLE_MAPS_API_KEY', 'AIzaSyBsjuNsm0h04JJ_d7i3I1gYsMsYcslzQf8')
# Coordinates around which to search, formatted as "latitude,longitude"
location = '51.50732630711219,-0.12773362936048624'
place_type = 'restaurant'

places = scrape_google_maps_data(api_key, location, place_type)
calculate_scores(places)
# Alternative for testing without a working key:
# calculate_scores(example_reviews)