# Tripadvisor data

## Description

This notebook retrieves Tripadvisor geodata via the site's API:

https://www.tripadvisor.com/data/1.0/maps/alsoShow/boundingBox.

**API required parameters**:

- bounding box coordinates
- number of results to display for each object category (default value - 20)
- rc - unknown parameter, left empty by default

**Output parameters**:

- geographical coordinates (`lat`, `lon`, projection: `EPSG:4326`)
- `id`
- `name`
- `entityType` - main type of object (hotel, restaurant, attraction)
- `bubbleRating` - place rating, [0, 50], step - 5
- `numReviews` - number of reviews on Tripadvisor
- `detailUrl` - relative URL of the entity; gives access to the review texts

Other parameters differ depending on the `entityType` field value.

## Required imports

In [54]:
import pandas as pd
import requests
from tqdm import tqdm
import numpy as np
import geopandas as gpd
import time
import json

# Tripadvisor map endpoint queried with a bounding box (used by the API request cell).
tripadvisor_map = 'https://www.tripadvisor.com/data/1.0/maps/alsoShow/boundingBox'

# Functions

## Geocoding and grid generation

In [82]:
import geocoder
from math import cos, radians, ceil



def gcode_osm(address):
    """
    Geocode `address` through `geocoder.osm` (Nominatim).

    Returns the `.json` attribute of the geocoder result. If anything goes
    wrong the exception is printed and `None` is returned instead.
    """
    try:
        return geocoder.osm(address).json
    except Exception as err:
        print(err)
        return None
        

def getLocation(response):
    """
    Extract the point location from a geocoding response.

    `response` is expected to be a mapping with `'lat'` and `'lng'` keys.
    Returns a `(lat, lon)` tuple; when the response is `None`, is not
    subscriptable, or lacks either key, `(None, None)` is returned.
    """
    try:
        # Both lookups happen before any assignment, so a partial response
        # yields (None, None) rather than a half-filled pair.
        lat, lon = response['lat'], response['lng']
    except (KeyError, TypeError):
        # Only lookup failures are treated as "no location"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        lat = lon = None
    return lat, lon

def lon_step(lat, margin):
    """
    Convert the distance `margin` into degrees of longitude at latitude `lat`.

    `lat` is given in degrees. One degree of longitude is taken as
    cos(lat) * 111.3 * 1000 length units (presumably metres, i.e. 111.3 km
    per degree at the equator), so `margin` should use the same unit.
    """
    units_per_degree = cos(radians(lat)) * 111.3 * 1000
    return margin / units_per_degree
    

# Side of one grid cell (presumably metres, given the km -> m factor below).
CELL_SIDE = 500

# Length of one degree of latitude used throughout: 111.1 * 1000.
_UNITS_PER_DEGREE_LAT = 111.1 * 1000

# Height of one grid cell expressed in degrees of latitude.
LAT_STEP = CELL_SIDE / _UNITS_PER_DEGREE_LAT

# Latitude half-extent (5000 units) of the fallback bounding box.
DEG_CONST_LAT = 5000 / _UNITS_PER_DEGREE_LAT

def getBBox(response):
    """
    Return the city bounding box as a dict with `miny`, `minx`, `maxy`, `maxx`.

    The box is read from `response['bbox']`, whose `'northeast'` and
    `'southwest'` entries are indexed as `[lat, lon]` pairs. When the response
    has no usable `bbox`, a fallback box is built around the point returned by
    `getLocation`: 5000 units in each direction, converted to degrees with
    `DEG_CONST_LAT` and `lon_step`.
    """
    try:
        corners = response['bbox']
        north_east = corners['northeast']
        south_west = corners['southwest']
        maxx, maxy = north_east[1], north_east[0]
        minx, miny = south_west[1], south_west[0]
    except (KeyError, TypeError, IndexError):
        # Only a missing or malformed bbox triggers the fallback; unrelated
        # errors are no longer silently swallowed.
        lat, lon = getLocation(response)

        DEG_CONST_LON = lon_step(lat, 5000)

        maxx, minx = lon + DEG_CONST_LON, lon - DEG_CONST_LON
        maxy, miny = lat + DEG_CONST_LAT, lat - DEG_CONST_LAT
    return {'miny': miny, 'minx': minx, 'maxy': maxy, 'maxx': maxx}

def createGrid(bbox, latitude_step, longitude_step):
    """
    Cover a bounding box with a regular grid of rectangular cells.

    `bbox` is a dict with `miny`, `minx`, `maxy`, `maxx`. Cell origins are
    generated with `np.arange` from the minimum up to (maximum + step), so the
    grid extends past the upper edges of the box.

    Returns a DataFrame with one row per cell (rows iterate latitude first,
    then longitude) holding `miny`, `minx`, `maxy`, `maxx` and a string `id`
    that numbers the cells from '0'.
    """
    row_origins = np.arange(bbox['miny'], bbox['maxy'] + latitude_step, latitude_step)
    col_origins = np.arange(bbox['minx'], bbox['maxx'] + longitude_step, longitude_step)

    # Collect the cells in a plain list and build the frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    cells = [
        {
            'miny': south,
            'minx': west,
            'maxy': south + latitude_step,
            'maxx': west + longitude_step,
        }
        for south in row_origins
        for west in col_origins
    ]
    grid = pd.DataFrame(cells, columns=['miny', 'minx', 'maxy', 'maxx'])
    grid['id'] = [str(cell_no) for cell_no in range(len(grid))]
    return grid

## Tripadvisor API requests

In [71]:
def trip_request(miny, minx, maxy, maxx,
                 hotel_count = 0, attraction_count = 0, restaurant_count = 300,
                 timeout = 30):
    """
    Query the Tripadvisor map endpoint for one bounding box.

    Parameters:
        miny, minx, maxy, maxx: bounding box, sent as minLat / minLng /
            maxLat / maxLng.
        hotel_count, attraction_count, restaurant_count: number of results
            requested per category. The defaults (0, 0, 300) reproduce the
            previously hard-coded restaurant-only request.
        timeout: seconds passed to `requests.get`, so a stalled connection
            raises instead of blocking forever.

    Returns the decoded JSON body of the response.

    Raises whatever `requests.get` or `.json()` raise (connection errors,
    timeouts, invalid JSON); callers are expected to handle these.
    """
    params = {
        'minLat' : miny,
        'minLng' : minx,
        'maxLat' : maxy,
        'maxLng' : maxx,
        'hotelCount' : hotel_count,
        'attractionCount' : attraction_count,
        'restaurantCount' : restaurant_count,
        'rc' : ''  # purpose unknown; the API is called with it left empty
    }

    ta_result = requests.get(tripadvisor_map, params = params, timeout = timeout).json()

    return ta_result

In [72]:
def basic_parce(obj):
    """
    Extract the fields shared by every entity type from one API object.

    Returns a list in this order:
    [id, name, entityType, bubbleRating, numReviews, helpful_votes,
     published_date, uploaded_date, detailUrl, latitude, longitude].

    The three thumbnail-derived values (helpful_votes, published_date,
    uploaded_date) are all `None` when `obj` has no usable `thumbnail`.

    Raises KeyError when one of the mandatory keys is missing.
    """
    uid = obj['id']
    name = obj['name']
    entityType = obj['entityType']
    rating = obj['bubbleRating']
    number_of_reviews = obj['numReviews']

    geo = obj['geoPoint']
    lat, lon = geo['latitude'], geo['longitude']
    detail_url = obj['detailUrl']
    try:
        thumbnail = obj['thumbnail']
        helpful_reviews, date_publish, date_upload = thumbnail['helpful_votes'], thumbnail['published_date'], thumbnail['uploaded_date']
    except (KeyError, TypeError):
        # Thumbnail missing, null, or lacking one of the keys: treat the
        # whole group as absent. Other errors are not suppressed.
        helpful_reviews = date_publish = date_upload = None

    return [uid, name, entityType, rating, number_of_reviews, helpful_reviews, date_publish, date_upload, detail_url, lat, lon]

In [73]:
def selective_parce(obj, major_cat):
    """
    Extract the category-specific fields of one API object.

    - 'restaurants': [cuisine names joined with '|', priceString]
    - 'attractions': [category, openHours]
    - anything else (treated as hotels):
      [accommodationCategory, offers, cleaned popIndexText]

    Raises KeyError when a key required for the category is missing.
    """
    if major_cat == 'restaurants':
        cuisine_names = [entry['name'] for entry in obj['cuisines']]
        return ['|'.join(cuisine_names), obj['priceString']]

    if major_cat == 'attractions':
        return [obj['category'], obj['openHours']]

    # Hotels: strip spaces, turn 'of' into '/', then drop the
    # 'Moscowhotels' tail and the '#' sign from the rank text.
    rank_text = obj['popIndexText']
    for old, new in ((' ', ''), ('of', '/'), ('Moscowhotels', ''), ('#', '')):
        rank_text = rank_text.replace(old, new)

    return [obj['accommodationCategory'], obj['offers'], rank_text]

In [74]:
# Column names for the values returned by basic_parce, in the same order.
common_names = [
    'uid',
    'name',
    'entityType',
    'rating',
    'number_of_reviews',
    'helpful_reviews',
    'date_publish',
    'date_upload',
    'detail_url',
    'lat',
    'lon',
]

# Full column list per category: the shared columns followed by the
# category-specific ones, matching the order returned by selective_parce.
rules_of_naming = {
    'restaurants': [*common_names, 'cuisines', 'price'],
    'attractions': [*common_names, 'category', 'openHours'],
    'hotels': [*common_names, 'accommodationCategory', 'offers', 'popularity'],
}

# Working with Tripadvisor API

In [97]:
# sheet_name=None makes pandas load every sheet and return a mapping of
# sheet name -> DataFrame (one sheet per region).
cities_total = pd.read_excel('D:/data_projects/japanFood_worldwide/cities_list.xlsx', sheet_name= None)

In [99]:
# Show the sheet names that can be used as a region key.
cities_total.keys()

odict_keys(['Europe', 'Africa', 'NA', 'SA', 'Asia', 'Oceania', 'Russia', 'China'])

## CHOOSE REGION

In [None]:
# Region to process; must be set to one of the keys of `cities_total`
# before the following cells are run (it is also used in the output file name).
reg = ''

In [109]:
# Pick the sheet of the chosen region and normalise it for geocoding.
cities_list = cities_total[reg]
cities_list.columns = [col.lower() for col in cities_list.columns]
# Country: keep the part before ' - ', trimmed and lower-cased.
cities_list['country'] = cities_list.country.str.split(' - ').str[0].str.strip().str.lower()
# City: lower-case, drop anything from '(' onwards, then remove digits.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# a literal string by default, which would leave the digits in place.
cities_list['city'] = cities_list.city.str.lower().str.split('(').str[0].str.replace(r'\d+', '', regex=True).str.strip()
# Query string passed to the geocoder: "<city>, <country>".
cities_list['address'] = cities_list.city + ', ' + cities_list.country
cities_list.head()

Unnamed: 0,region,country,city,population,address
0,Asia,china,guangzhou,45600000,"guangzhou, china"
1,Asia,china,shanghai,35900000,"shanghai, china"
2,Asia,china,beijing,20400000,"beijing, china"
3,Asia,china,tianjin,13200000,"tianjin, china"
4,Asia,china,xiamen,9900000,"xiamen, china"


In [114]:
def exports(request_list, df_list, address, prelimnary = True):
    """
    Write the data collected for one city to disk.

    Parameters:
        request_list: list of `[cell id, raw API response]` pairs.
        df_list: list of DataFrames with the parsed objects of each cell.
        address: city address string, used as the file name stem.
        prelimnary: True writes files with the `part` suffix (intermediate
            dump), False writes files with the `full` suffix.

    Two files are produced: a JSON file mapping cell id -> raw response and a
    CSV file with the restaurant columns of the concatenated tables.
    """
    suffix = 'part' if prelimnary else 'full'

    df_requests = pd.DataFrame(request_list, columns = ['id', 'request'])

    with open(f'D:/data_projects/japanFood_worldwide/raw_data/requests/{address}_{suffix}.json', 'w') as outfile:
        responses_by_cell = dict(zip(df_requests['id'], df_requests['request']))
        json.dump(responses_by_cell, outfile)

    data = pd.concat(df_list, sort = True)
    # Rows whose cuisines value is the literal string '[]' are blanked entirely.
    empty_cuisines = data['cuisines'] == '[]'
    data.loc[empty_cuisines] = None

    # Restore the canonical restaurant column order before saving.
    restaurant_columns = rules_of_naming['restaurants']
    data = data[restaurant_columns]
    data.to_csv(f'D:/data_projects/japanFood_worldwide/raw_data/data_tables/{address}_{suffix}.csv', index = None)

In [85]:
# Crawl every city of the selected region:
#   1. geocode the city and build a grid of cells over its bounding box;
#   2. query Tripadvisor once per cell and parse the returned objects;
#   3. dump intermediate results every 100 cells and the full result at the end.
# Cities that cannot be geocoded are collected in `bad_cities`.
bad_cities = []
cities_location = []
for _, city in cities_list.iterrows():
    address = city['address']

    response = gcode_osm(address)
    lat, lon = getLocation(response) if response else (None, None)
    if lat is None or lon is None:
        # Covers both a failed geocoding call and a response without
        # coordinates (which would otherwise crash in lon_step).
        print(f'{address.title()} was not geocoded')
        bad_cities.append(address)
        continue

    longitude_step = lon_step(lat, CELL_SIDE)
    bbox = getBBox(response)
    grid = createGrid(bbox, LAT_STEP, longitude_step)

    cities_location.append([city['city'], city['country'], city['region'], city['population'], lat, lon])

    df_list = []
    request_list = []
    # `cell_no` is the grid's row index; a separate name keeps it from
    # clobbering the outer loop variable.
    for cell_no, row in grid.iterrows():
        try:
            ta_result = trip_request(row.miny, row.minx, row.maxy, row.maxx)
        except Exception as e:
            # Skip parsing for this cell: reusing the previous cell's response
            # would duplicate its objects (or fail on the very first cell).
            print(e)
        else:
            request_list.append([row.id, ta_result])
            # Only restaurants are requested by trip_request's defaults;
            # extend this list together with the request counts if hotels or
            # attractions are needed.
            for major_cat in ['restaurants']:
                table = []
                # A response without the category key yields an empty table.
                for obj in ta_result.get(major_cat) or []:
                    try:
                        table.append(basic_parce(obj) + selective_parce(obj, major_cat))
                    except Exception as e:
                        print(e)
                df_list.append(pd.DataFrame(table, columns = rules_of_naming[major_cat]))

        if cell_no % 100 == 0 and cell_no != 0:
            time.sleep(3)
            if df_list:
                print(f'Prelimnary export for {address.title()}')
                exports(request_list, df_list, address, prelimnary = True)

    if df_list:
        exports(request_list, df_list, address, prelimnary = False)
        print(f'{address.title()} ✅')
    else:
        # Every request failed: there is nothing to concatenate or export.
        print(f'No data received for {address.title()}')

# Keep only the cities that could not be geocoded and save them for a re-run.
cities_list = cities_list[cities_list.address.isin(bad_cities)]

if len(cities_list) > 0:
    cities_list.to_excel('D:/data_projects/japanFood_worldwide/bad_citiees.xlsx', index = None)


pd.DataFrame(
    cities_location, columns = ['city', 'country', 'region', 'population', 'lat', 'lon']).to_csv(f'D:/data_projects/japanFood_worldwide/gecoded_cities_{reg}.csv', index = None)








0it [00:00, ?it/s]






1it [00:00,  2.24it/s]






2it [00:00,  2.27it/s]






3it [00:01,  2.28it/s]






4it [00:01,  2.30it/s]






5it [00:02,  2.30it/s]






6it [00:02,  2.28it/s]






7it [00:03,  2.27it/s]






8it [00:03,  2.29it/s]






9it [00:03,  2.30it/s]






10it [00:04,  2.33it/s]






11it [00:04,  2.06it/s]






12it [00:05,  2.09it/s]






13it [00:05,  2.12it/s]






14it [00:06,  1.98it/s]






15it [00:07,  1.81it/s]






16it [00:07,  1.94it/s]






17it [00:08,  1.81it/s]






18it [00:08,  1.94it/s]






19it [00:09,  2.04it/s]






20it [00:09,  2.08it/s]






21it [00:09,  2.11it/s]






22it [00:10,  2.15it/s]






23it [00:10,  2.20it/s]






24it [00:11,  2.21it/s]






25it [00:11,  2.24it/s]






26it [00:12,  1.98it/s]






27it [00:12,  2.06it/s]






28it [00:13,  2.16it/s]






29it [00:13,  2.20it/s]






30it [00:14,  2.28it/s]






31it [00:14,  2.32it/s]






32it [00:15,  2.01it/s]






33it [00:15,  2.1