# Tripadvisor data

## Description

This notebook retrieves Tripadvisor geodata via the site's map API:

https://www.tripadvisor.com/data/1.0/maps/alsoShow/boundingBox.

**API required parameters**:

- bounding box coordinates
- number of results to display for each object category (default value - 20)
- rc - unknown parameter, left empty by default

**Output parameters**:

- geographical coordinates (`lat`, `lon`, projection: `EPSG:4326`)
- `id`
- `name`
- `entityType` - main type of object (hotel, restaurant, attraction)
- `bubbleRating` - place rating, [0, 50], step - 5
- `numReviews` - number of reviews on Tripadvisor
- `detailUrl` - relative URL of the entity; gives access to the review texts

Other parameters differ depending on the `entityType` field value.

## Required imports

In [1]:
import pandas as pd
import requests
from tqdm import tqdm
import numpy as np
import geopandas as gpd
import time
import json
from collections import defaultdict

# Base URL of the Tripadvisor map endpoint queried by trip_request
tripadvisor_map = 'https://www.tripadvisor.com/data/1.0/maps/alsoShow/boundingBox'

# Functions

## Geocoding

In [21]:
import geocoder

def gcode_osm(address):
    """
    Geocodes `address` and returns the `.json` payload of the geocoder result.

    Despite the `_osm` suffix, the lookup is performed with `geocoder.arcgis`.
    If the lookup raises, the exception is printed and `None` is returned.
    """
    try:
        return geocoder.arcgis(address).json
    except Exception as err:
        print(err)
    return None
        

def getLocation(response):
    """
    Extracts the `(lat, lon)` pair from a geocoder response.

    Reads the `lat` and `lng` keys of `response`. If they cannot be read
    (for example when `response` is `None`), the error is printed and
    `(None, None)` is returned.
    """
    try:
        coordinates = (response['lat'], response['lng'])
    except Exception as err:
        print(err)
        coordinates = (None, None)
    return coordinates

## Grid Generation

In [22]:
from math import cos, radians, ceil

def lon_step(lat, margin):
    """
    Converts the distance `margin` into a longitude offset at latitude `lat`.

    `lat` is given in degrees. The length of one degree of longitude is taken
    as `cos(lat) * 111.3 * 1000`, and `margin` is divided by that length.
    """
    degree_length = cos(radians(lat)) * 111.3 * 1000
    return margin / degree_length
    

# Side of one grid cell; passed to lon_step as `margin` and used to derive LAT_STEP.
# NOTE(review): presumably metres, given the `* 1000` factors below - confirm.
CELL_SIDE = 500    
# Latitude extent of one grid cell in degrees (111.1 is presumably km per degree of latitude)
LAT_STEP = CELL_SIDE / (111.1 * 1000)

# Latitude offset (2500 distance units each way) used by getBBox when the response carries no bbox
DEG_CONST_LAT = 2500 / (111.1 * 1000)

def getBBox(response):
    """
    Returns the city bounding box as a dict with keys `miny`, `minx`, `maxy`, `maxx`.

    The box is read from `response['bbox']`, whose `northeast` and `southwest`
    entries are indexed as `[lat, lon]`. When that structure is missing or
    malformed, the box is built around the point returned by `getLocation`:
    `DEG_CONST_LAT` degrees north/south and `lon_step(lat, 2500)` degrees
    east/west of it.

    Only lookup failures (KeyError, IndexError, TypeError) trigger the fallback;
    other exceptions such as KeyboardInterrupt propagate to the caller.
    The fallback itself fails if `response` carries no usable location, so
    callers should check `getLocation(response)` first.
    """
    try:
        bbox = response['bbox']
        northeast, southwest = bbox['northeast'], bbox['southwest']
        maxx, maxy = northeast[1], northeast[0]
        minx, miny = southwest[1], southwest[0]
    except (KeyError, IndexError, TypeError):
        # No usable bbox in the response: derive one from the city point
        lat, lon = getLocation(response)

        DEG_CONST_LON = lon_step(lat, 2500)

        maxx, minx = lon + DEG_CONST_LON, lon - DEG_CONST_LON
        maxy, miny = lat + DEG_CONST_LAT, lat - DEG_CONST_LAT
    return {'miny' : miny, 'minx' : minx, 'maxy' : maxy, 'maxx' : maxx}

def createGrid(bbox, latitude_step, longitude_step):
    """
    Builds a regular grid of cells covering `bbox`.

    `bbox` is a dict with keys `miny`, `minx`, `maxy`, `maxx`. Rows start at
    `miny` and columns at `minx`, advancing by `latitude_step` and
    `longitude_step`; the ranges run up to `max + step`, so the grid extends
    past the upper edges of the box.

    Returns a DataFrame with one row per cell and columns `miny`, `minx`,
    `maxy`, `maxx` and `id` (the row number as a string). Cells are ordered
    row by row. An inverted box yields an empty DataFrame with the same columns.
    """
    row_starts = np.arange(bbox['miny'], bbox['maxy'] + latitude_step, latitude_step)
    col_starts = np.arange(bbox['minx'], bbox['maxx'] + longitude_step, longitude_step)

    # Collect all cells first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    cells = [
        {'miny' : lat, 'minx' : lon, 'maxy' : lat + latitude_step, 'maxx' : lon + longitude_step}
        for lat in row_starts
        for lon in col_starts
    ]
    grid = pd.DataFrame(cells, columns = ['miny', 'minx', 'maxy', 'maxx'])
    grid['id'] = [str(n) for n in range(len(grid))]
    return grid

## Tripadvisor API requests

In [23]:
def trip_request(miny, minx, maxy, maxx, restaurant_count = 300, hotel_count = 0,
                 attraction_count = 0, timeout = 30):
    """
    Sends a bounding-box request to the Tripadvisor map endpoint and returns the decoded JSON.

    Parameters:
    - miny, minx, maxy, maxx - bounding box sent as minLat, minLng, maxLat, maxLng
    - restaurant_count, hotel_count, attraction_count - number of results requested
      per category (defaults keep the original behaviour: restaurants only, 300)
    - timeout - seconds to wait for the server before giving up

    Raises `requests.exceptions.RequestException` on a timeout, connection
    failure or HTTP error status, and `ValueError` if the body is not valid JSON.
    """
    params = {
        'minLat' : miny,
        'minLng' : minx,
        'maxLat' : maxy,
        'maxLng' : maxx,
        'hotelCount' : hotel_count,
        'attractionCount' : attraction_count,
        'restaurantCount' : restaurant_count,
        'rc' : ''
    }

    # Without a timeout a stalled connection would block the whole harvesting run
    response = requests.get(tripadvisor_map, params = params, timeout = timeout)
    # Surface HTTP errors explicitly instead of parsing an error page as data
    response.raise_for_status()

    return response.json()

def basic_parce(obj):
    """
    Parses one result object. Returns the list of fields common to all entity types.

    The returned list is ordered as:
    uid, name, entityType, rating, number_of_reviews, helpful_reviews,
    date_publish, date_upload, detail_url, lat, lon.

    The three thumbnail-derived fields (helpful_reviews, date_publish,
    date_upload) are all `None` when `obj` has no usable `thumbnail` entry or
    the entry lacks any of the expected keys. A missing mandatory key raises KeyError.
    """
    uid = obj['id']
    name = obj['name']
    entityType = obj['entityType']
    rating = obj['bubbleRating']
    number_of_reviews = obj['numReviews']

    geo = obj['geoPoint']
    lat, lon = geo['latitude'], geo['longitude']
    detail_url = obj['detailUrl']

    try:
        thumbnail = obj['thumbnail']
        helpful_reviews = thumbnail['helpful_votes']
        date_publish = thumbnail['published_date']
        date_upload = thumbnail['uploaded_date']
    except (KeyError, TypeError):
        # Thumbnail absent, None, or incomplete: keep the three fields consistent
        helpful_reviews = date_publish = date_upload = None

    return [uid, name, entityType, rating, number_of_reviews, helpful_reviews, date_publish, date_upload, detail_url, lat, lon]

def selective_parce(obj, major_cat):
    """
    Parses one result object. Returns the list of fields specific to its entity type.

    - 'restaurants': [cuisine names joined with '-*-', priceString]
    - 'attractions': [category, openHours]
    - anything else (treated as hotels): [accommodationCategory, offers, popularity],
      where popularity is `popIndexText` with spaces, 'Moscowhotels' and '#'
      removed and 'of' replaced by '/'
    """
    if major_cat == 'restaurants':
        cuisine_names = [cuisine['name'] for cuisine in obj['cuisines']]
        return ['-*-'.join(cuisine_names), obj['priceString']]

    if major_cat == 'attractions':
        return [obj['category'], obj['openHours']]

    # Every other category falls through to the hotel layout
    popularity = obj['popIndexText']
    for old, new in ((' ', ''), ('of', '/'), ('Moscowhotels', ''), ('#', '')):
        popularity = popularity.replace(old, new)
    return [obj['accommodationCategory'], obj['offers'], popularity]

In [24]:
# Column names shared by every entity type, in the order returned by basic_parce
common_names = [
    'uid', 'name', 'entityType', 'rating', 'number_of_reviews',
    'helpful_reviews', 'date_publish', 'date_upload', 'detail_url',
    'lat', 'lon',
]

# Full column list per category: the common columns followed by the
# category-specific ones returned by selective_parce
rules_of_naming = {
    category: common_names + specific_names
    for category, specific_names in (
        ('restaurants', ['cuisines', 'price']),
        ('attractions', ['category', 'openHours']),
        ('hotels', ['accommodationCategory', 'offers', 'popularity']),
    )
}

# Prepare the list of cities for mining

In [35]:
# Load every sheet of the workbook (sheet_name=None); the result maps sheet name -> DataFrame,
# one sheet per region
cities_total = pd.read_excel('D:/data_projects/japanFood_worldwide/cities_list.xlsx', sheet_name= None)
cities_total.keys()

odict_keys(['Europe', 'Africa', 'NA', 'SA', 'Asia', 'Oceania', 'Russia', 'China'])

In [36]:
def create_addresses(df):
    """
    Normalises the `city` and `country` columns and adds a geocodable `address` column.

    Steps (the frame is modified in place and also returned):
    - column names are lower-cased and stripped
    - `country`: text before ' - ' is kept, lower-cased, the suffix
      '(islamic republic of)' is removed, and the result is stripped
    - `city`: lower-cased, text from the first '(' onwards is dropped,
      digits are removed, and the result is stripped
    - `address`: '<city>, <country>'
    """
    df.columns = [name.lower().strip() for name in df.columns]

    # regex=True is passed explicitly: pandas 2.0 changed the default of
    # Series.str.replace to literal matching, which silently disables these patterns.
    country = df['country'].str.split(' - ').str[0].str.lower()
    df['country'] = country.str.replace(r'\s*\(islamic republic of\)', '', regex = True).str.strip()

    city = df['city'].str.lower().str.split('(').str[0]
    df['city'] = city.str.replace(r'\d+', '', regex = True).str.strip()

    df['address'] = df['city'] + ', ' + df['country']
    return df

In [59]:
# Geocode every city of every region sheet and collect, per region, a list of
# records with the city's attributes, its location and its bounding box.
total_cities = defaultdict(list)
for region in cities_total:
    region_list = []
    region_df = create_addresses(cities_total[region])

    # total= lets tqdm show progress against the number of cities in the sheet
    for _, city in tqdm(region_df.iterrows(), total = len(region_df)):
        response = gcode_osm(city['address'])
        lat, lon = getLocation(response)
        # Cities that could not be geocoded are skipped entirely
        if lat is None or lon is None:
            continue
        bbox = getBBox(response)

        region_list.append({
            'region' : region,
            'city' : city['city'],
            'country' : city['country'],
            'population' : city['population'],
            'lat' : lat,
            'lon' : lon,
            'bbox' : bbox
        })

        # Pause between successful lookups to avoid hammering the geocoding service
        time.sleep(1)
    total_cities[region] = region_list

101it [02:41,  1.65s/it]
134it [03:30,  1.54s/it]
121it [04:01,  2.35s/it]
85it [02:15,  1.60s/it]
280it [07:30,  1.63s/it]
30it [00:47,  1.58s/it]
36it [00:56,  1.57s/it]
117it [03:06,  1.57s/it]


In [70]:
# Persist the geocoded cities (region -> list of city records) so the geocoding step need not be repeated
with open ('total_cities.json', 'w') as outfile:
    json.dump(total_cities, outfile)

# Working with Tripadvisor API

In [7]:
# Reload the geocoded cities saved to total_cities.json
with open('total_cities.json') as infile:
    total_cities = json.load(infile)

In [72]:
# Show the region names available in the loaded data
total_cities.keys()

dict_keys(['Europe', 'Africa', 'NA', 'SA', 'Asia', 'Oceania', 'Russia', 'China'])

## CHOOSE REGION

In [37]:
# Load the city table with geometries; later cells read its `city`, `country`,
# `region`, `population` and `geometry` columns
cities = gpd.read_file('bbox_final.gpkg')
cities.head()

Unnamed: 0,bbox,city,country,lat,lon,population,region,geometry
0,"{""miny"": 41.271320000000046, ""minx"": 19.774170...",tirana,albania,41.32232,19.82517,418495,Europe,"POLYGON ((19.87617000000007 41.27132000000005,..."
1,"{""miny"": 42.46814000000004, ""minx"": 1.48904000...",andorra la vella,andorra,42.50514,1.52604,22205,Europe,"POLYGON ((1.563040000000023 42.46814000000004,..."
2,"{""miny"": 48.05263000000006, ""minx"": 16.2184200...",wien,austria,48.20263,16.36842,1867582,Europe,"POLYGON ((16.51842000000007 48.05263000000006,..."
3,"{""miny"": 53.76675000000006, ""minx"": 27.4284300...",minsk,belarus,53.90375,27.56543,1974819,Europe,"POLYGON ((27.70243000000005 53.76675000000006,..."
4,"{""miny"": 51.09312000000008, ""minx"": 4.26869000...",antwerpen,belgium,51.22212,4.39769,498473,Europe,"POLYGON ((4.526690000000068 51.09312000000008,..."


In [38]:
# Append the per-geometry bounds as extra columns; get_bbox later reads
# `minx`, `miny`, `maxx`, `maxy` from each row (presumably the columns added here - confirm)
cities = pd.concat([cities, cities.geometry.bounds], axis = 1)

In [39]:
# Region to harvest; used to filter `cities` and to name the final CSV of processed cities
reg = 'Europe'

In [40]:
# Keep only the cities of the selected region
cities_list = cities[cities.region == reg]

In [21]:
def exports(request_list, df_list, address, prelimnary = True,
            base_dir = 'D:/data_projects/japanFood_worldwide/raw_data'):
    """
    Exports the collected data for one city to a JSON file and a CSV file.

    Parameters:
    - request_list - list of `[cell id, raw response]` pairs
    - df_list - list of DataFrames with parsed restaurant rows
    - address - label used in both file names
    - prelimnary - True writes files with the 'part' suffix, False with 'full'
    - base_dir - directory containing the `requests` and `data_tables` folders
      (the default is the original hard-coded location)

    Writes `<base_dir>/requests/<address>_<suffix>.json` (cell id -> raw response)
    and `<base_dir>/data_tables/<address>_<suffix>_v2.csv` with the restaurant
    columns from `rules_of_naming`. An empty `df_list` produces a header-only CSV
    instead of failing in `pd.concat`.
    """
    suffix = 'part' if prelimnary else 'full'

    # Map each cell id to its raw response directly; no intermediate DataFrame needed
    requests_by_id = {cell_id: response for cell_id, response in request_list}
    with open(f'{base_dir}/requests/{address}_{suffix}.json', 'w') as outfile:
        json.dump(requests_by_id, outfile)

    columns = rules_of_naming['restaurants']
    if df_list:
        data = pd.concat(df_list, sort = True)
    else:
        # pd.concat raises on an empty list; export an empty table instead
        data = pd.DataFrame(columns = columns)

    # NOTE(review): this blanks every column of rows whose cuisines equal '[]';
    # kept as in the original - confirm whether only `cuisines` was meant to be cleared.
    data.loc[data['cuisines'] == '[]'] = None

    data = data[columns]
    data.to_csv(f'{base_dir}/data_tables/{address}_{suffix}_v2.csv', index = False)

In [19]:
def get_bbox(row):
    """
    Builds a bounding-box dict from the `miny`, `minx`, `maxy`, `maxx` attributes of `row`
    """
    return {edge: getattr(row, edge) for edge in ('miny', 'minx', 'maxy', 'maxx')}

In [None]:
# Harvest restaurants for every city of the selected region: build a grid over the
# city's bounding box, query Tripadvisor for each cell, parse the results and export
# them (a partial export every 100 cells, a full export at the end of the city).
cities_location = []
for _, city in cities_list.iterrows():
    address = f"{city['city']}, {city['country']}"
    # The centroid of the city geometry is used as the reference point
    centroid = city.geometry.centroid
    lat, lon = centroid.y, centroid.x
    longitude_step = lon_step(lat, CELL_SIDE)
    bbox = get_bbox(city)
    grid = createGrid(bbox, LAT_STEP, longitude_step)

    cities_location.append([city['city'], city['country'], city['region'], city['population'], lat, lon])

    df_list = []
    request_list = []
    # A dedicated index name avoids shadowing the outer loop variable
    for cell_idx, row in grid.iterrows():
        try:
            ta_result = trip_request(row.miny, row.minx, row.maxy, row.maxx)
        except Exception as e:
            # Skip parsing for a failed cell: reusing the previous cell's response
            # would duplicate its data (or fail with NameError on the first cell)
            print(e)
        else:
            request_list.append([row.id, ta_result])
            for major_cat in ['restaurants']:
                objects = ta_result.get(major_cat) if isinstance(ta_result, dict) else None
                if objects is None:
                    # Unexpected response shape: report it and move on instead of aborting the run
                    print(f'No {major_cat} in response for cell {row.id}')
                    continue
                table = []
                for obj in objects:
                    try:
                        current_datum = basic_parce(obj) + selective_parce(obj, major_cat)
                        table.append(current_datum)
                    except Exception as e:
                        print(e)
                df_list.append(pd.DataFrame(table, columns = rules_of_naming[major_cat]))

        # Periodic checkpoint so a crash does not lose the whole city
        if cell_idx % 100 == 0 and cell_idx != 0:
            time.sleep(3)
            if df_list:
                print(f'Prelimnary export for {address.title()}')
                exports(request_list, df_list, address, prelimnary = True)

    if df_list:
        exports(request_list, df_list, address, prelimnary = False)
        print(f'{address.title()} ✅')
    else:
        print(f'No data collected for {address.title()}, export skipped')


pd.DataFrame(
    cities_location, columns = ['city', 'country', 'region', 'population', 'lat', 'lon']).to_csv(f'D:/data_projects/japanFood_worldwide/gecoded_cities_{reg}.csv', index = False)