# Spatial analysis of Tripadvisor data

## Libraries and Settings

In [3]:
# Install library spacy and the German model loaded below (in Terminal, in activated environment)
# pip install spacy
# python -m spacy download de_core_news_sm

# Libraries
import spacy
import os
import requests
import json
import urllib
import fnmatch
import folium
import platform
import pandas as pd
import geopandas as gpd
from IPython.display import clear_output

# Silence every warning for the rest of the session to keep cell output compact
import warnings
warnings.filterwarnings("ignore")

# Show the current working directory (relative file paths are resolved against it)
working_dir = os.getcwd()
print(working_dir)

u:\Lektionen\WPM\spatial_data_analysis\08_Python_Tripadvisor


## Import data

In [4]:
# Read the scraped Tripadvisor listings from the working directory
csv_file = 'tripadvisor.csv'
df = pd.read_csv(csv_file)

# Report the table size as (rows, columns)
print(df.shape)

# Preview the first rows
df.head()

(282, 7)


Unnamed: 0,web-scraper-order,web-scraper-start-url,title_raw,price_raw,ranking_raw,duration_raw,text_raw
0,1713267369-1,https://www.tripadvisor.ch/Attraction_Products...,31. Titlis-Tagesausflug mit privatem Reiseleit...,CHF 496,5.0 von fünf Punkten1,6 - 8 Stunden,31. Titlis-Tagesausflug mit privatem Reiseleit...
1,1713267369-2,https://www.tripadvisor.ch/Attraction_Products...,32. Alpine Majestät: Exklusive Privattour von ...,CHF 699,,Mehr als 6 Stunden,32. Alpine Majestät: Exklusive Privattour von ...
2,1713267369-3,https://www.tripadvisor.ch/Attraction_Products...,33. Privattour zu den atemberaubendsten Inside...,CHF 1’999,5.0 von fünf Punkten47,Mehr als 6 Stunden,33. Privattour zu den atemberaubendsten Inside...
3,1713267369-4,https://www.tripadvisor.ch/Attraction_Products...,34. Vierwaldstättersee Pick and Mix Tour - Bur...,CHF 689,5.0 von fünf Punkten34,8 - 9 Stunden,34. Vierwaldstättersee Pick and Mix Tour - Bur...
4,1713267369-5,https://www.tripadvisor.ch/Attraction_Products...,35. Atemberaubende private Wanderung mit einem...,CHF 899,5.0 von fünf Punkten26,8 - 9 Stunden,35. Atemberaubende private Wanderung mit einem...


## Extract locations

### Show locations in titles

In [5]:
# Preview the raw tour titles; these are the texts scanned for place names below
df.loc[:, 'title_raw']

0      31. Titlis-Tagesausflug mit privatem Reiseleit...
1      32. Alpine Majestät: Exklusive Privattour von ...
2      33. Privattour zu den atemberaubendsten Inside...
3      34. Vierwaldstättersee Pick and Mix Tour - Bur...
4      35. Atemberaubende private Wanderung mit einem...
                             ...                        
277    248. Lugano und Morcote, private Führung ab Ma...
278      249. VIP-Erlebnis zum Comer See und nach Lugano
279    250. Jungfraujoch Top of Europe und Region Pri...
280    251.  Ganztägiger Privatausflug von Zürich nac...
281    252. Ab Zürich: Private Schneewanderung, Berg-...
Name: title_raw, Length: 282, dtype: object

### Extract locations by using NLP methods

In [6]:
# Load the German spaCy pipeline used for named-entity recognition
model_name = "de_core_news_sm"
nlp = spacy.load(model_name)

# Tour titles to scan for place names
texts = df['title_raw']

# Function to extract locations
def extract_locations(texts, pipeline=None):
    """Collect the text of every location entity found in ``texts``.

    Parameters
    ----------
    texts : iterable
        Titles to analyse. Entries that are not strings (e.g. ``None`` or NaN
        from missing CSV cells) and blank strings are skipped instead of
        crashing the pipeline call.
    pipeline : callable, optional
        Callable that takes a string and returns an object with an ``ents``
        attribute whose items expose ``text`` and ``label_``. Defaults to the
        module-level ``nlp`` model.

    Returns
    -------
    list of str
        Entity texts labelled "LOC" or "GPE", in order of appearance.
        Duplicates are kept.
    """
    if pipeline is None:
        # Fall back to the spaCy model loaded at module level
        pipeline = nlp

    wanted_labels = {"LOC", "GPE"}
    locations = []
    for text in texts:
        # Missing values read from CSV arrive as float NaN / None
        if not isinstance(text, str) or not text.strip():
            continue
        doc = pipeline(text)
        locations.extend(ent.text for ent in doc.ents if ent.label_ in wanted_labels)
    return locations

# Run the entity extraction over all titles
location_names = extract_locations(texts)

# One row per extracted location mention
df_locations = pd.DataFrame(location_names, columns=['Location'])

# Keep the first occurrence of each name and renumber the rows
df_locations_unique = df_locations.drop_duplicates().reset_index(drop=True)

# Keep only names of at most 25 characters (the index is not renumbered,
# so gaps mark the rows that were dropped)
is_short_name = df_locations_unique['Location'].str.len() <= 25
df_locations_filtered = df_locations_unique[is_short_name]

# Display the DataFrame
print(df_locations_filtered)


             Location
0              Luzern
1         der Schweiz
3    Rigi Seebodenalp
4          Privattour
5     Schweizer Alpen
..                ...
116      Oeschinensee
117  Alpendörfer-Tour
118         Scheidegg
119         Montreaux
120         Hallstatt

[117 rows x 1 columns]


## Geocode locations

In [7]:
# Define the base URL for the GeoAdmin API
base_url = "https://api3.geo.admin.ch/rest/services/api/SearchServer?"

# Seconds to wait for the API per request; without a timeout a stalled
# connection would block the cell indefinitely
request_timeout = 10

# Initialize a list to store coordinates or NA along with location names
coordinates_list = []

# Loop through each location name in the DataFrame
for location in df_locations_filtered['Location']:
    # Set up search parameters for each location
    # NOTE(review): "origins" is restricted to "address" although the search
    # texts are place names; some hits in the output look doubtful. Confirm
    # against the SearchServer documentation whether another origin fits better.
    parameters = {
        "searchText": location,
        "origins": "address",
        "type": "locations"
    }

    # Stay at None (NA) unless a usable first result is found
    lat, lon = None, None

    try:
        # Send the request to the GeoAdmin API
        response = requests.get(base_url, params=parameters, timeout=request_timeout)

        # Only a successful response is parsed
        if response.status_code == 200:
            data = response.json()
            results = data.get('results') or []
            if results:
                first_result = results[0].get('attrs', {})
                lat = first_result.get('lat', None)
                lon = first_result.get('lon', None)
    except (requests.RequestException, ValueError):
        # Best effort: a network error, timeout or non-JSON body is recorded
        # as NA for this location instead of aborting the whole run
        lat, lon = None, None

    # Exactly one row per location, whether or not the lookup succeeded
    coordinates_list.append([location, lat, lon])

# Convert the list to a DataFrame with specified column names
df_geocoded = pd.DataFrame(coordinates_list, columns=['Location', 'Latitude', 'Longitude'])

# Display the DataFrame
print(df_geocoded)

             Location   Latitude  Longitude
0              Luzern  47.052547   8.282178
1         der Schweiz  47.425415   8.400759
2    Rigi Seebodenalp  47.062836   8.467160
3          Privattour  46.402309   8.943302
4     Schweizer Alpen        NaN        NaN
..                ...        ...        ...
112      Oeschinensee  46.495811   7.708229
113  Alpendörfer-Tour        NaN        NaN
114         Scheidegg  47.300793   9.357228
115         Montreaux  46.495605   8.934582
116         Hallstatt  46.772114   7.372717

[117 rows x 3 columns]


## Plot on map

In [8]:
# Initialise the map with a fixed centre and zoom level
m = folium.Map(location=[47.44, 8.65], zoom_start=8)

# Keep only the locations that were geocoded successfully
df_sub = df_geocoded.dropna()

# One marker per location; the popup shows the location name
for lat, lon, name in zip(df_sub['Latitude'], df_sub['Longitude'], df_sub['Location']):
    marker = folium.Marker(location=(lat, lon), popup=name)
    marker.add_to(m)

# Layer control
folium.LayerControl().add_to(m)

# Plot map
m

### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [9]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: print the OS, timestamp and Python version of this run
rule = '-----------------------------------'
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(rule)
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', timestamp)
print('Python Version:', python_version())
print(rule)

-----------------------------------
NT
Windows | 10
Datetime: 2024-04-16 15:21:07
Python Version: 3.10.11
-----------------------------------
