In [9]:
!pip install python-dotenv requests



In [10]:
import dotenv
import os
import json
import requests
import csv
from itertools import chain
from pprint import pprint

dotenv.load_dotenv()

True

An example of a review object (shown as a Python dict):

```python
{'authorAttribution': {'displayName': 'Henk Vyncke',
                       'photoUri': 'https://lh3.googleusercontent.com/a-/ALV-UjWCd4R3VchPubvMkIA1_mlgDEPMEWrIbv9mprhaMZAbDjChvTtS=s128-c0x00000000-cc-rp-mo-ba4',
                       'uri': 'https://www.google.com/maps/contrib/103733968569168024105/reviews'},
 'name': 'places/ChIJpXSgrsDbfkcRzf_5kCMmrZI/reviews/ChdDSUhNMG9nS0VJQ0FnSUNUdnYyMnBBRRAB',
 'originalText': {'languageCode': 'en',
                  'text': 'Huge "eating palace" which is lacking the fine '
                          'accents of the italian food although the service '
                          'was top. I would recommend for lunch, less for '
                          'dinner'},
 'publishTime': '2024-05-15T09:37:18Z',
 'rating': 4,
 'relativePublishTimeDescription': 'in the last week',
 'text': {'languageCode': 'en',
          'text': 'Huge "eating palace" which is lacking the fine accents of '
                  'the italian food although the service was top. I would '
                  'recommend for lunch, less for dinner'}}
```


In [11]:
# Field mask used for Place Details requests.
# Use ('*',) for all the details, or pick any of:
#   'name', 'id', 'types', 'nationalPhoneNumber', 'internationalPhoneNumber',
#   'formattedAddress', 'addressComponents', 'plusCode', 'location', 'viewport',
#   'rating', 'googleMapsUri', 'websiteUri', 'regularOpeningHours',
#   'utcOffsetMinutes', 'adrFormatAddress', 'businessStatus', 'priceLevel',
#   'userRatingCount', 'iconMaskBaseUri', 'iconBackgroundColor', 'displayName',
#   'primaryTypeDisplayName', 'takeout', 'delivery', 'dineIn', 'curbsidePickup',
#   'servesBreakfast', 'servesLunch', 'servesDinner', 'servesBeer', 'servesWine',
#   'servesVegetarianFood', 'currentOpeningHours', 'primaryType',
#   'shortFormattedAddress', 'editorialSummary', 'reviews', 'photos',
#   'outdoorSeating', 'menuForChildren', 'servesDessert', 'servesCoffee',
#   'goodForChildren', 'restroom', 'goodForGroups', 'paymentOptions',
#   'parkingOptions', 'accessibilityOptions'
# The trailing comma matters: ('*') is just the string '*', not a tuple.
PLACE_DETAILS_FIELDS = ('*',)

# Root folder for everything this notebook writes (place JSON files and CSVs).
DATA_PATH = 'data_v2/'

# Maps a (dotted, flattened) key of a place dict to its column name in places.csv.
# NOTE(review): 'reviewsCount' and 'reservable' are not in the field list above,
# so these columns may always be empty - confirm against the API response.
PLACES_CSV_FIELDS = {
    'id': 'place_id',
    'displayName.text': 'place_name',
    'types': 'place_types',
    'formattedAddress': 'place_address',
    'rating': 'place_average_ratings',
    'userRatingCount': 'place_ratings_count',
    'reviewsCount': 'place_reviews_count',
    'location': 'place_location',
    'websiteUri': 'place_website',
    'internationalPhoneNumber': 'place_phone_number',
    'priceLevel': 'place_price_level',
    'primaryType': 'place_primary_type',
    'delivery': 'has_delivery',
    'dineIn': 'has_dine_in',
    'reservable': 'is_reservable',
    'servesBreakfast': 'serves_breakfast',
    'servesLunch': 'serves_lunch',
    'servesDinner': 'serves_dinner',
    'servesBeer': 'serves_beer',
    'servesWine': 'serves_wine',
    'regularOpeningHours.weekdayDescriptions': 'places_opening_hours',
}

# Maps a (dotted, flattened) key of a review dict to its column name in reviews.csv.
REVIEWS_CSV_FIELDS = {
    'text.text': 'review',
    'rating': 'review_rating',
    'publishTime': 'review_publish_time',
    'relativePublishTimeDescription': 'review_publish_time_str'
}


# Create the data folder if it doesn't exist. exist_ok avoids the race between
# checking for the folder and creating it.
os.makedirs(os.path.join(DATA_PATH, 'places'), exist_ok=True)

## Fetch places

In [12]:
def fetch_places(data, fields: tuple = ('places.id',), timeout: float = 30):
    '''
    this function fetches the places near a point

    data is a dict, sent as the JSON body of the searchNearby request
    fields is used to build the X-Goog-FieldMask header, default is ('places.id',);
        either an iterable of field names or an already comma-joined string
    timeout is passed to requests (in seconds) so a stalled connection cannot hang the notebook

    returns a dict (the decoded JSON response)

    raises Exception if the response status is not 200
    '''
    # Joining a plain string would split it into single characters.
    field_mask = fields if isinstance(fields, str) else ",".join(fields)
    headers = {
            "Content-Type": "application/json",
            "X-Goog-Api-Key": os.getenv("GMAPS_API_KEY"),
            "X-Goog-FieldMask": field_mask
    }
    res = requests.post(
        "https://places.googleapis.com/v1/places:searchNearby",
        json=data,
        headers=headers,
        timeout=timeout,
    )
    if res.status_code != 200:
        raise Exception(f"Error while fetching places with status {res.status_code}: {res.text}")
    return res.json()

In [13]:
def fetch_place_details(place_id, fields: tuple = PLACE_DETAILS_FIELDS, timeout: float = 30):
    '''
    this function fetches the details of a place

    place_id is appended to the places endpoint URL
    fields is used to build the X-Goog-FieldMask header, default is PLACE_DETAILS_FIELDS;
        either an iterable of field names or an already comma-joined string
    timeout is passed to requests (in seconds) so a stalled connection cannot hang the notebook

    returns a dict (the decoded JSON response)

    raises Exception if the response status is not 200
    '''
    # Joining a plain string would split it into single characters.
    field_mask = fields if isinstance(fields, str) else ",".join(fields)
    headers = {
            "Content-Type": "application/json",
            "X-Goog-Api-Key": os.getenv("GMAPS_API_KEY"),
            "X-Goog-FieldMask": field_mask
    }
    res = requests.get(
        "https://places.googleapis.com/v1/places/"+place_id,
        headers=headers,
        timeout=timeout,
    )
    if res.status_code != 200:
        raise Exception(f"Error while fetching place details with status {res.status_code}: {res.text}")
    return res.json()

In [14]:
def save_place(place, prefix=''):
    '''
    Dump a place dict as JSON inside the 'places' folder under DATA_PATH.

    place is a dict and must contain an 'id' key
    prefix is prepended to the file name, which is prefix+place['id']+'.json'

    Example:
     - bar-ChIJ-y2_lD7afkcRfuWr-pW2IEU.json

    returns the path of the written file
    '''
    file_name = ''.join((prefix, place['id'], '.json'))
    target = os.path.join(DATA_PATH, 'places', file_name)
    with open(target, 'w') as out_file:
        json.dump(place, fp=out_file)
    return target

In [50]:
def get_places(*place_params, detail_fields: tuple = PLACE_DETAILS_FIELDS, save_prefix='', update_existing=False, filter_func=None):
    '''
    this function gets a variable number of place parameters, fetches all places nearby,
    then iterates the unique place ids, fetching, saving and yielding each of them

    yield (place_id, place, save_path)

    place is None if its file already exists and update_existing is False
    place is an Exception if it failed to fetch
    save_path is None if the place was skipped (already exists) or failed to fetch
    save_path is a ValueError("Filtered out") if filter_func rejected the place (nothing is saved)
    save_path is an Exception if it failed to save

    if update_existing is False, places whose file already exists are not fetched again;
    they are still yielded, as (place_id, None, None)

    save_prefix is used to prefix the save path

    detail_fields is used to fetch the place details, default is all

    filter_func, when given, receives the place dict; a falsy result skips saving

    errors raised by the nearby search itself (fetch_places) are not caught here
    '''
    # A dict de-duplicates the ids while keeping first-seen order, so repeated
    # runs process (and log) places in the same order.
    unique_ids = {}
    for pparams in place_params:
        places = fetch_places(pparams)
        for found in places.get('places', []):
            unique_ids.setdefault(found['id'], None)

    for place_id in unique_ids:
        if not update_existing and os.path.exists(os.path.join(DATA_PATH, 'places', save_prefix+place_id+'.json')):
            yield (place_id, None, None)
            continue

        try:
            place = fetch_place_details(place_id, detail_fields)
        except Exception as e:
            yield (place_id, e, None)
            continue

        if filter_func and not filter_func(place):
            yield (place_id, place, ValueError("Filtered out"))
            continue

        try:
            save_path = save_place(place, save_prefix)
            yield (place_id, place, save_path)
        except Exception as e:
            yield (place_id, place, e)


In [52]:
# Some points in Padova, as (latitude, longitude) pairs.
# Each point is used below as the centre of one nearby-search circle.
# NOTE(review): the 19th and 20th entries are much closer to each other than the
# other neighbouring points are - confirm whether one of them is a leftover.
_padova_centers = [
    (45.432080517437015, 11.836634274862599),
    (45.43189031693807, 11.86389813937401),
    (45.43091349677315, 11.88782288754863),
    (45.43246871650895, 11.915365804105923),
    (45.43148119364456, 11.94012631262187),
    (45.43146125556705, 11.969341736033988),
    (45.41271980759355, 11.967083311747375),
    (45.39553968521662, 11.965942341976351),
    (45.378357157970584, 11.967304932229434),
    (45.37896317339665, 11.940063561396899),
    (45.378391245179294, 11.911989485739133),
    (45.37898277267307, 11.889476486536427),
    (45.38015518299662, 11.864461923558604),
    (45.37917422394929, 11.839169376285948),
    (45.39733000587545, 11.837212786821507),
    (45.414510476057394, 11.838870748421774),
    (45.41451501634149, 11.863900879788856),
    (45.41412398624116, 11.886983869128223),
    (45.41333649866577, 11.91423835937508),
    (45.41236094605545, 11.912568969806468),
    (45.41351957291172, 11.940104133710923),
    (45.39614254228134, 11.942307796862078),
    (45.39654731576001, 11.913113854043583),
    (45.39616285944836, 11.889759484314053),
    (45.39674948309286, 11.863347545128061)
]


def _nearby_search_params(place_type, centers=_padova_centers, radius=1300):
    '''
    Build one nearby-search request body per centre for a single place type.

    place_type is the only entry of "includedTypes"
    centers is an iterable of (latitude, longitude) pairs, default is _padova_centers
    radius is the radius of the search circle around each centre

    returns a list of dicts, one per centre
    '''
    return [
        {
            "includedTypes": [place_type],
            # "maxResultCount": 20,
            "locationRestriction": {
                "circle": {
                    "center": {
                        "latitude": lat,
                        "longitude": long
                    },
                    "radius": radius
                },
            }
        }
        for (lat, long) in centers
    ]


# Same centres and radius for both searches; only the place type differs.
restaurant_params = _nearby_search_params("restaurant")
bar_params = _nearby_search_params("bar")


# Both generators are lazy: nothing is fetched until the loop below consumes them.
restaurants = get_places(*restaurant_params, save_prefix='restaurant-')
bars = get_places(*bar_params, save_prefix='bar-')

new_ids = 0
total_ids = 0  # stays 0 when no place id is returned at all
for total_ids, (place_id, place, save_path) in enumerate(chain(restaurants, bars), start=1):
    if place is None:
        # it already exists on disk, nothing to report
        pass
    elif isinstance(place, Exception):
        print("Error while fetching place details with id:", place_id, "error:", place)
    elif isinstance(save_path, str):
        print("New place found and saved with id:", place_id, "at", save_path)
        new_ids += 1
    elif isinstance(save_path, Exception):
        print("New place found with id:", place_id, "but COULD NOT SAVE IT, error:", save_path)
    else:
        print("New place found with id:", place_id, "but COULD NOT SAVE IT")


print("New places found:", new_ids, "/", total_ids)
        


New place found and saved with id: ChIJk5tGn1nQfkcRxYFTz_EP10k at data_v2/places/restaurant-ChIJk5tGn1nQfkcRxYFTz_EP10k.json
New place found and saved with id: ChIJo7j-GxvbfkcRyD0XcYNKRT0 at data_v2/places/restaurant-ChIJo7j-GxvbfkcRyD0XcYNKRT0.json
New place found and saved with id: ChIJ--NFldPafkcRyloMacuXomc at data_v2/places/restaurant-ChIJ--NFldPafkcRyloMacuXomc.json
New place found and saved with id: ChIJV8vVJYHafkcRC_L5A4vzhlg at data_v2/places/restaurant-ChIJV8vVJYHafkcRC_L5A4vzhlg.json
New place found and saved with id: ChIJHZbEvGLFfkcRHz7q7VnBtxY at data_v2/places/restaurant-ChIJHZbEvGLFfkcRHz7q7VnBtxY.json
New place found and saved with id: ChIJGYYYWCPZfkcRU0YMBAvNFng at data_v2/places/restaurant-ChIJGYYYWCPZfkcRU0YMBAvNFng.json
New place found and saved with id: ChIJLQY7-yPFfkcR587y39idso0 at data_v2/places/restaurant-ChIJLQY7-yPFfkcR587y39idso0.json
New place found and saved with id: ChIJc34OU1HafkcRNil1puEkTg8 at data_v2/places/restaurant-ChIJc34OU1HafkcRNil1puEkTg8.json


## Read Places and Reviews

In [16]:
def read_places(prefix='', limit=None):
    '''
    This function reads all the places in the places folder
    prefix is used to filter the files
    limit is used to limit the number of files read

    yield (file name, place_dict)

    place_dict is None if the entry is not a file or its name doesn't start with prefix
    place_dict is an Exception if the file failed to open or to parse

    entries are visited in file-name order, so limit always selects the same files
    '''
    dirpath = os.path.join(DATA_PATH, 'places')
    # The context manager closes the directory handle; sorting removes the
    # arbitrary order in which os.scandir reports entries.
    with os.scandir(dirpath) as scan:
        entries = sorted(scan, key=lambda entry: entry.name)

    read_count = 0
    for p in entries:
        if not p.is_file() or not p.name.startswith(prefix):
            yield (p.name, None)
            continue

        read_count += 1
        if limit is not None and read_count > limit:
            break

        # open() is inside the try so that an unreadable file is reported the
        # same way as an unparsable one, and the file is closed before yielding.
        try:
            with open(p.path) as f:
                place = json.load(f)
        except Exception as e:
            yield (p.name, e)
        else:
            yield (p.name, place)

In [19]:
def read_reviews(places_iter, limit=None):
    '''
    This function reads all the reviews of the given places iterator

    places_iter yields (path, place) pairs; entries whose place is not a dict
    (None for skipped files, an Exception for failed loads) are ignored
    limit is used to limit the total number of reviews read, None means no limit

    yield (place_id, review_dict)

    iteration stops as soon as limit reviews were yielded, without consuming
    the rest of places_iter
    '''
    if limit is not None and limit <= 0:
        return

    yielded = 0
    for _path, place in places_iter:
        if not isinstance(place, dict):
            continue
        for review in place.get('reviews', []):
            yield (place['id'], review)
            yielded += 1
            if limit is not None and yielded >= limit:
                # return (not break) so the outer loop stops as well
                return

In [20]:
# Walk every stored review once; after the loop i holds how many were seen.
i = 0
for i, r in enumerate(read_reviews(read_places()), start=1):
    pass
print("Total number of reviews:", i)

Total number of reviews: 1312


### Save CSV datasets

In [21]:
# Some utils

# Flatten nested dicts before writing the CSV, so that nested fields can be
# addressed with dotted names such as `text.languageCode`.
# Drawback: it makes reading the CSV back technically more difficult.
# TODO: reconsider this

def flatten_dict(d, parent_key='', sep='.'):
    '''
    Collapse a nested dictionary into a single-level one.

    Keys of nested dicts are joined to their parent's key with sep
    (e.g. {'a': {'b': 1}} -> {'a.b': 1}); parent_key, when non-empty, is
    prepended to every key the same way.
    Values that are not dicts (lists included) are copied as they are.
    A nested dict that is empty contributes no key at all.

    returns a new dict
    '''
    flat = {}
    for key, value in d.items():
        path = f'{parent_key}{sep}{key}' if parent_key else key
        if isinstance(value, dict):
            flat.update(flatten_dict(value, path, sep=sep))
        else:
            flat[path] = value
    return flat

def flatten_and_filter_dict(review, fields:dict):
    '''
    Pick the requested fields out of a (possibly nested) dict and rename them.

    review is the source dict
    fields maps a dotted source key (e.g. 'text.text') to the output key

    A field is looked up in the flattened view of review first. If it is not
    there, the dotted path is followed through the nested dict instead, so a
    field whose value is itself a dict (e.g. 'location') is returned whole
    instead of silently coming back empty.
    Fields that cannot be found either way get ''.

    returns a dict keyed by the output keys, in the order of fields
    '''
    flat_review = flatten_dict(review)

    def lookup(field_name):
        if field_name in flat_review:
            return flat_review[field_name]
        # Not a leaf: walk the nested structure along the dotted path.
        node = review
        for part in str(field_name).split('.'):
            if not isinstance(node, dict) or part not in node:
                return ''
            node = node[part]
        return node

    return {new_field_name: lookup(field_name) for field_name, new_field_name in fields.items()}


#### Save Places info

In [24]:
def save_place_csv(places_iter, file_path=os.path.join(DATA_PATH, 'places.csv'), fields: dict = PLACES_CSV_FIELDS):
    '''
    places_iter is an iterator of (path, place_dict)
    file_path is the path to save the csv
    fields maps the place keys to include to their csv column names

    each place id is written at most once; entries whose place is not a dict
    (None or an Exception coming from a failed read) are skipped

    returns the file path
    '''
    saved_ids = set()
    # newline='' is required by the csv module (otherwise rows get an extra
    # blank line on some platforms); utf-8 keeps non-ASCII names portable.
    with open(file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=list(fields.values()))
        writer.writeheader()
        for _path, place in places_iter:
            if not isinstance(place, dict):
                continue

            place_id = place['id']
            if place_id in saved_ids:
                continue
            saved_ids.add(place_id)

            writer.writerow(flatten_and_filter_dict(place, fields))
    return file_path


#### Save Reviews

In [23]:
def save_reviews_csv(reviews_iter, file_path=os.path.join(DATA_PATH, 'reviews.csv'), fields: dict = REVIEWS_CSV_FIELDS):
    '''
    reviews_iter is an iterator of (place_id, review_dict)
    file_path is the path to save the csv
    fields maps the review keys to include to their csv column names
    a 'place_id' column is always added as the first column

    the review dicts received from reviews_iter are not modified

    returns the file path
    '''
    fields = {'place_id': 'place_id', **fields}
    # newline='' is required by the csv module (otherwise rows get an extra
    # blank line on some platforms); utf-8 keeps non-ASCII review text portable.
    with open(file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=list(fields.values()))
        writer.writeheader()
        for place_id, review in reviews_iter:
            # Work on a copy so the caller's review dict is left untouched.
            review_with_id = {**review, 'place_id': place_id}
            writer.writerow(flatten_and_filter_dict(review_with_id, fields))
    return file_path

In [63]:
# A place is kept when one of these appears in its primaryType as a whole,
# underscore-delimited word ("bar" matches "bar" and "wine_bar", but not
# "barber_shop"). A type such as "cafeteria" therefore has to be listed
# explicitly if it is wanted.
ALLOWED_PRIMARY_TYPES = [
    "restaurant", "bar", "cafe", "meal", "coffee_shop", "sandwich", "pizza", "steak"
]

# Minimum number of ratings / of stored reviews a place needs to be kept.
MIN_USER_RATING_COUNT = 50
MIN_REVIEWS = 5


# Ids already seen by place_filter. This makes the filter stateful: a second
# pass over the same places rejects all of them unless this set is reset.
__saved_places = set()
def place_filter(item):
    '''
    Decide whether a (path, place) pair belongs in the final dataset.

    item is a (path, place) pair; place is expected to be a dict, anything
    else (None or an Exception from a failed read) is rejected

    A place is rejected when:
     - its id was already seen by a previous call
     - its "userRatingCount" is below MIN_USER_RATING_COUNT (missing counts as 0)
     - it has fewer than MIN_REVIEWS entries in "reviews"
     - its "primaryType" contains none of ALLOWED_PRIMARY_TYPES as a whole word

    returns True to keep the place, False otherwise
    '''
    _, place = item

    if not isinstance(place, dict):
        return False

    if place['id'] in __saved_places:
        return False
    # Recorded before the remaining checks, so a rejected id stays rejected.
    __saved_places.add(place['id'])

    if place.get("userRatingCount", 0) < MIN_USER_RATING_COUNT:
        return False

    if len(place.get("reviews", [])) < MIN_REVIEWS:
        return False

    primaryType: str = place.get("primaryType", "")

    # Pad both sides with '_' so that only whole words can match.
    padded_type = f"_{primaryType}_"
    return any(f"_{allowed}_" in padded_type for allowed in ALLOWED_PRIMARY_TYPES)

def review_filter(item):
    '''
    Placeholder filter for reviews: every (place_id, review) pair is accepted.

    item is a (place_id, review) pair

    returns True
    '''
    # Unpacking is kept so that an item that is not a pair still raises.
    _place_id, _review = item
    return True


# Load everything from disk once, then filter places and reviews in memory.
all_places = list(read_places())
filtered_places = list(filter(place_filter, all_places))

all_reviews = list(read_reviews(filtered_places))
filtered_reviews = list(filter(review_filter, all_reviews))

print("All places:", len(all_places))
print("Filtered places:", len(filtered_places))
print("All reviews:", len(all_reviews))
print("Filtered reviews:", len(filtered_reviews))

# Remove previous exports. Guarded, because on a fresh run the files do not
# exist yet and an unconditional os.remove would raise FileNotFoundError.
for _csv_name in ('places.csv', 'reviews.csv'):
    _csv_path = os.path.join(DATA_PATH, _csv_name)
    if os.path.exists(_csv_path):
        os.remove(_csv_path)

save_place_csv(filtered_places)
save_reviews_csv(filtered_reviews)

All places: 796
Filtered places: 513
All reviews: 2565
Filtered reviews: 2565


'data_v2/reviews.csv'

#### Save final dataset