# Tripadvisor data

## Description

This notebook is used for getting Tripanvisor geodata via site's API https://www.tripadvisor.com/data/1.0/maps/alsoShow/boundingBox.

**API required parameters**:

- bounding box coordinates
- number of results to display for each object category (default value - 20)
- rc - unknown parameter, left empty by default

**Output parameters**:

- geographical coordinates (`lat`, `lon`, projection: `EPSG:4326`)
- `id`
- `name`
- `entityType` - main type of object (hotel, restaurant, attraction)
- `bubbleRating` - place rating, [0, 50], step - 5
- `numReviews` - number of reviews on Tripadvisor
- `detailUrl` - relative URL of the entity, gives access for rewiews' texts

Other parameters differdepending on `entityType` field value.

In [None]:
https://www.tripadvisor.com/data/1.0/maps/alsoShow/boundingBox?minLat=50.033752729003595&minLng=14.352888981931072&maxLat=50.13420851516278&maxLng=14.533820073239669&hotelCount=30&attractionCount=30&restaurantCount=30&rc=

## Required imports

In [1]:
import pandas as pd
import requests
from tqdm import tqdm
import numpy as np
import geopandas as gpd
import time

tripadvisor_map = 'https://www.tripadvisor.com/data/1.0/maps/alsoShow/boundingBox'

In [5]:
bounds = gpd.read_file('/home/mtopnikov/p_piter/tripadvisor/grid_collection.gpkg').geometry.bounds
bounds['id'] = pd.Series([i for i in range(len(bounds))])

bounds.head()

Unnamed: 0,left,top,right,bottom,id,geometry
0,3272766.0,8457557.0,3273266.0,8457057.0,1,"POLYGON ((29.39976106251813 60.25868251723937,..."
1,3272766.0,8457057.0,3273266.0,8456557.0,2,"POLYGON ((29.39976106251813 60.25645423802023,..."
2,3272766.0,8456557.0,3273266.0,8456057.0,3,"POLYGON ((29.39976106251813 60.25422580712826,..."
3,3272766.0,8456057.0,3273266.0,8455557.0,4,"POLYGON ((29.39976106251813 60.25199722455656,..."
4,3272766.0,8455557.0,3273266.0,8455057.0,5,"POLYGON ((29.39976106251813 60.24976849029816,..."


## Functions definition

In [2]:
def trip_request(miny, minx, maxy, maxx):
    params = {
        'minLat' : miny,
        'minLng' : minx,
        'maxLat' : maxy,
        'maxLng' : maxx,
        'hotelCount' : 100,
        'attractionCount' : 100,
        'restaurantCount' : 100,
        'rc' : ''
    }
    
    ta_result = requests.get(tripadvisor_map, params = params).json()
    
    return ta_result

In [8]:
dic

{'liza': 5, 'misha': {'segzs': 2, 'karta': 3}}

In [9]:
geo = dic['misha']

In [10]:
geo['segzs'], geo['karta']

(2, 3)

In [1]:
def basic_parce(obj):
    uid = obj['id']
    name = obj['name']
    entityType = obj['entityType']
    rating = obj['bubbleRating']
    number_of_reviews = obj['numReviews']
    
    geo = obj['geoPoint']
    lat, lon = geo['latitude'], geo['longitude']
    detail_url = obj['detailUrl']
    try:
        thumbnail = obj['thumbnail']
        helpful_reviews, date_publish, date_upload = thumbnail['helpful_votes'], thumbnail['published_date'], thumbnail['uploaded_date']
    except:
        helpful_reviews = date_publish = date_upload = np.NaN
        
    return [uid, name, entityType, rating, number_of_reviews, helpful_reviews, date_publish, date_upload, detail_url, lat, lon]

In [2]:
def selective_parce(obj, major_cat):
    if major_cat == 'restaurants':
        raw_cuisines, cuisines = obj['cuisines'], []
        for cuis in raw_cuisines:
            cuisines.append(cuis['name'])
        cuisines = '|'.join(cuisines)
        price = obj['priceString']
        
        return [cuisines, price]
    
    elif major_cat == 'attractions':
        category = obj['category']
        openHours = obj['openHours']

        return [category,openHours] 
    else:
        popularity = obj['popIndexText'].replace(' ','').replace('of','/').replace('Moscowhotels','').replace('#','')
        accommodationCategory = obj['accommodationCategory']
        offers = obj['offers']
        
        return [accommodationCategory,offers,popularity]

In [None]:
common_names = ['uid', 'name', 'entityType', 'rating', 'number_of_reviews', 'helpful_reviews', 'date_publish', 'date_upload', 'detail_url', 'lat', 'lon']

rules_of_naming = {
    'restaurants' : common_names + ['cuisines', 'price'],
    'attractions' : common_names + ['category', 'openHours'],
    'hotels' : common_names + ['accommodationCategory', 'offers', 'popularity']
}

In [8]:
def parce_json(obj,major_cat):
    uid = obj['id']
    name = obj['name']
    entityType = obj['entityType']
    rating = obj['bubbleRating']
    number_of_reviews = obj['numReviews']
    lat = obj['geoPoint']['latitude']
    lon = obj['geoPoint']['longitude']
    detail_url = obj['detailUrl']
    try:
        helpful_reviews = obj['thumbnail']['helpful_votes']
        date_publish = obj['thumbnail']['published_date']
        date_upload = obj['thumbnail']['uploaded_date']
    except:
        helpful_reviews = np.NaN
        date_publish = np.NaN
        date_upload = np.NaN
    
    
    if major_cat == 'restaurants':
        q = obj['cuisines']
        cuisines = []
        for cuis in q:
            cuisines.append(cuis['name'])
            cuisines = '|'.join(cuisines)
        price = obj['priceString']
        json_datum = [
            uid, name, entityType, cuisines,
            price, rating, number_of_reviews,
            helpful_reviews, date_publish, date_upload,
            detail_url, lat, lon
                     ]
        return json_datum
    elif major_cat == 'attractions':
        category = obj['category']
        openHours = obj['openHours']
        json_datum = [
            uid,name,entityType,category,
            openHours,rating,number_of_reviews,
            helpful_reviews,date_publish,date_upload,
            detail_url ,lat,lon
                 ]
        return json_datum 
    else:
        popularity = obj['popIndexText'].replace(' ','').replace('of','/').replace('Moscowhotels','').replace('#','')
        accommodationCategory = obj['accommodationCategory']
        offers = obj['offers']
        json_datum = [
            uid,name,entityType,accommodationCategory,
            offers,rating,popularity,number_of_reviews,
            helpful_reviews,date_publish,date_upload,
            detail_url ,lat,lon
                 ]
        return json_datum

In [None]:
df_list = []
request_list = []
for i, row in tqdm(bounds.iterrows()):
    maxx = row.maxx
    maxy = row.maxy
    minx = row.minx
    miny = row.miny
    
    
    try:
        ta_result = trip_request(miny, minx, maxy, maxx) 
        request_list.append([row.id, ta_result])
    except Exception as e:
        print (e)
    for major_cat in ['hotels', 'restaurants', 'attractions']:
        objects = ta_result[major_cat]
        table = []
        for obj in objects:
            try:
                current_datum = basic_parce(obj) + selective_parce(obj, major_cat)
                table.append(current_datum)
            except Exception as e:
                print(e)
        df_list.append(pd.DataFrame(table, columns = rules_of_naming[major_cat]))

  

    
    if i % 100 == 0 and i != 0:
        time.sleep(3)
        pd.DataFrame(request_list,
                     columns = ['id', 'request']
                    ).to_csv('/home/mtopnikov/p_import/tripadvisor_requests_part_v2.csv')
        pd.concat(df_list, sort = True).to_csv('/home/mtopnikov/p_piter/tripadvisor/tripadvisor_data_part_v2.csv')
        
pd.DataFrame(request_list,
             columns = ['id', 'request']
            ).to_csv('/home/mtopnikov/p_piter/tripadvisor/tripadvisor_requests_final_v2.csv')
pd.concat(df_list, sort = True).to_csv('/home/mtopnikov/p_piter/tripadvisor/tripadvisor_data_final_v2.csv')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


6863it [1:01:23,  2.45it/s]

'str' object has no attribute 'append'
'str' object has no attribute 'append'
'str' object has no attribute 'append'


8750it [1:24:25,  1.83it/s]

'str' object has no attribute 'append'


8753it [1:24:27,  2.15it/s]

'str' object has no attribute 'append'


9289it [1:31:13,  2.72it/s]

'str' object has no attribute 'append'


10185it [1:45:00,  2.72it/s]

'str' object has no attribute 'append'


10900it [1:55:52,  2.77it/s]

In [13]:
18000000/2.77//3600/24

75.20833333333333

In [14]:
df = pd.read_csv('/home/mtopnikov/p_piter/tripadvisor/tripadvisor_data_final_v2.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,accommodationCategory,category,cuisines,date_publish,date_upload,detail_url,entityType,helpful_reviews,lat,lon,name,number_of_reviews,offers,openHours,popularity,price,rating,uid
0,0,,,[],,,/Restaurant_Review-g3619189-d3764778-Reviews-C...,restaurant,,60.183681,29.50642,Chance,1,,,,,50,3764778
1,1,,,Russian,,,/Restaurant_Review-g3619189-d3775375-Reviews-K...,restaurant,,60.183681,29.50642,Krasnaya Zvezda,0,,,,,0,3775375
2,0,,,,,,/Hotel_Review-g2345793-d2343491-Reviews-Hotel_...,hotel,,60.195309,29.52972,Hotel Black River,17,,,2/2Sestroretskhotels,,35,2343491
3,0,,,[],2016-03-03T02:44:23-0500,2016-03-03T02:44:23-0500,/Restaurant_Review-g3618978-d3756959-Reviews-C...,restaurant,0.0,60.195309,29.52972,Chernaya Rechka,4,,,,,45,3756959
4,0,,Nature & Parks,,2014-10-02T08:22:17-0400,2014-09-28T07:32:28-0400,/Attraction_Review-g2418659-d6877696-Reviews-L...,attraction,5.0,60.232838,29.532785,Lindulovskaya Grove,26,,,,,50,6877696


In [15]:
df = df[[
    'uid', 'name', 'entityType', 'accommodationCategory', 'category', 'cuisines',
    'price', 'openHours', 'offers', 'rating', 'popularity', 'number_of_reviews',
    'helpful_reviews', 'date_publish', 'date_upload', 'detail_url' ,'lat', 'lon'
        ]]
df.loc[df['cuisines'] == '[]'] = np.NaN
df.to_csv('/mnt/data3/data_providers/tripadvisor/peterburg_tripadvisor_all_entities.csv', index = None)