# Spatial analysis of Tripadvisor data

## Libraries and Settings

In [None]:
# Install library spacy and its language models (in Terminal, in activated environment)
# pip install spacy
# python -m spacy download en_core_web_sm
# python -m spacy download de_core_news_sm   (the model this notebook loads)

# Libraries
import spacy
import os
import requests
import json
import urllib
import fnmatch
import folium
import platform
import pandas as pd
import geopandas as gpd
from IPython.display import clear_output

# Ignore warnings
import warnings
# No category or module is given, so every warning is suppressed from here on
warnings.filterwarnings("ignore")

# Show the current working directory; relative file paths are resolved against it
print(os.getcwd())

## Import data

In [None]:
# Read the data from 'tripadvisor.csv' (relative path, resolved against the
# current working directory)
df = pd.read_csv('tripadvisor.csv')

# Dimensions of the data as (rows, columns)
print(df.shape)

# Preview the first rows (bare expression: rendered as the cell output)
df.head()

## Extract locations

### Show locations in titles

In [None]:
# Display the raw titles (bare expression: rendered as the cell output)
df['title_raw']

### Extract locations by using NLP methods

In [None]:
# Load the German NLP model
# (must be installed first: python -m spacy download de_core_news_sm)
nlp = spacy.load("de_core_news_sm")

# Texts to analyse: the raw titles from the imported data
texts = df['title_raw']

# Function to extract locations
def extract_locations(texts):
    """Collect the location entities found in *texts*.

    Each text is run through the module-level spaCy pipeline ``nlp`` and
    the text of every entity labelled ``LOC`` or ``GPE`` is collected.

    Args:
        texts: Iterable of texts to analyse. Entries that are not strings
            (e.g. ``None`` or the float NaN pandas uses for missing
            values) and blank strings are skipped.

    Returns:
        A list with the entity texts in the order they were found;
        duplicates are kept.
    """
    location_labels = {"LOC", "GPE"}
    locations = []
    for text in texts:
        # The pipeline only accepts text; a missing value would raise here,
        # and a blank string cannot contain any entity.
        if not isinstance(text, str) or not text.strip():
            continue
        doc = nlp(text)
        locations.extend(
            ent.text for ent in doc.ents if ent.label_ in location_labels
        )
    return locations

# Run the entity extraction over all titles
location_names = extract_locations(texts)

# One row per extracted location mention
df_locations = pd.DataFrame(location_names, columns=['Location'])

# Keep every distinct name once and renumber the rows from 0
df_locations_unique = df_locations.drop_duplicates(ignore_index=True)

# Keep only location names of at most 25 characters
df_locations_filtered = df_locations_unique.loc[
    lambda frame: frame['Location'].str.len() <= 25
]

# Display the DataFrame
print(df_locations_filtered)


## Geocode locations

In [None]:
# Define the base URL for the GeoAdmin API
base_url = "https://api3.geo.admin.ch/rest/services/api/SearchServer?"

# Seconds to wait for the API before giving up on a single location;
# without a timeout one stalled request would block the notebook forever
request_timeout = 10


def geocode_location(location, session, timeout=request_timeout):
    """Look up *location* with the GeoAdmin search service.

    Args:
        location: Search text sent to the API.
        session: ``requests.Session`` used to send the request.
        timeout: Seconds to wait for the server.

    Returns:
        A ``(lat, lon)`` tuple taken from the first search result, or
        ``(None, None)`` if the request fails, the status code is not 200,
        the response is not valid JSON, or there are no results.
    """
    # NOTE(review): "origins" is set to "address" although the search texts
    # are place names from review titles -- confirm this is the intended index.
    parameters = {
        "searchText": location,
        "origins": "address",
        "type": "locations",
    }

    try:
        response = session.get(base_url, params=parameters, timeout=timeout)
        if response.status_code != 200:
            return None, None
        results = response.json().get('results')
    except (requests.RequestException, ValueError):
        # Network problem or unparsable body: record the location as not
        # geocoded instead of aborting the whole run
        return None, None

    if not results:
        return None, None

    first_result = results[0].get('attrs', {})
    return first_result.get('lat'), first_result.get('lon')


# Geocode every location; failures are stored as [name, None, None].
# A single session reuses the connection across requests.
with requests.Session() as session:
    coordinates_list = [
        [location, *geocode_location(location, session)]
        for location in df_locations_filtered['Location']
    ]

# Convert the list to a DataFrame with specified column names
df_geocoded = pd.DataFrame(coordinates_list, columns=['Location', 'Latitude', 'Longitude'])

# Display the DataFrame
print(df_geocoded)

## Plot on map

In [None]:
# Initialise the map (centre given as [latitude, longitude])
m = folium.Map(location=[47.44, 8.65], zoom_start=8)

# Add one marker per location that has coordinates; rows with missing
# values are dropped first
df_sub = df_geocoded.dropna()
for lat, lon, name in zip(df_sub['Latitude'], df_sub['Longitude'], df_sub['Location']):
    folium.Marker(location=(lat, lon), popup=name).add_to(m)

# Layer control
folium.LayerControl().add_to(m)

# Plot map (bare expression: rendered as the cell output)
m

### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Rule printed above and below the footer information
footer_rule = '-----------------------------------'
# Moment the footer is produced, formatted as date and time
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Operating system, timestamp and Python version of this run
print(footer_rule)
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', timestamp)
print('Python Version:', python_version())
print(footer_rule)