# Extracting, analyzing and visualizing spatial entities from Tripadvisor text data

## Libraries and Settings

In [None]:
# Libraries
import os
import sys
import spacy
import requests
import folium
from folium import Popup
import pandas as pd
from tqdm import tqdm
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import clear_output

# Ignore warnings
# filterwarnings("ignore") suppresses every warning category for the rest of the session.
# NOTE(review): this also hides warnings that may point to real problems - confirm this is intended.
import warnings
warnings.filterwarnings("ignore")

# Print the current working directory.
# NOTE(review): presumably to check that the relative path 'tripadvisor.csv' used below resolves - confirm.
print(os.getcwd())

## Import data about day trips in Switzerland

Note that the data were collected using the Chrome Web Scraper extension (see file 'webscraper_tripadvisor.json')

In [None]:
# Data from Tripadvisor derived via web scraping.
# The file is read from the current working directory. Later cells use the columns
# 'web-scraper-order', 'title_raw', 'price_raw', 'ranking_raw', 'duration_raw' and 'text_raw'.
df = pd.read_csv('tripadvisor.csv')

# Get the shape of the dataframe as (number of rows, number of columns)
print(df.shape)

# Show the first 10 rows (rendered by the notebook as the cell's last expression)
df.head(10)

## Extract locations by using Named Entity Recognition (NER)

### Show locations in titles

In [None]:
# All titles
print(df['title_raw'])

# Single title, looked up by its scraper ID.
# The ID is specific to one scrape of the data, so guard against it being
# absent instead of failing with an IndexError on an empty selection.
example_id = '1713267369-7'
example_titles = df.loc[df['web-scraper-order'] == example_id, 'title_raw']
if example_titles.empty:
    print('\n', f"No title found for web-scraper-order '{example_id}'")
else:
    print('\n', example_titles.iloc[0])

## Plot wordcloud from 'title_raw'

In [None]:
# Join all trip titles into a single string for the word cloud.
# Missing titles (NaN) are dropped first because str.join() only accepts
# strings and would otherwise raise a TypeError.
text = ' '.join(df['title_raw'].dropna().astype(str))

# Create and generate a word cloud image:
wordcloud = WordCloud(background_color="black").generate(text)

# Display the generated image (axes hidden, since they carry no meaning here):
plt.figure(figsize=(7, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

### Named Entity Recognition (NER) example

Note that when you use a language model in spaCy, such as "de_core_news_sm" (the German model loaded below), it processes the text to perform various Natural Language Processing (NLP) tasks. These tasks typically include tokenization, part-of-speech tagging, dependency parsing, lemmatization, and named entity recognition (NER).

For details see: https://spacy.io/api

In [None]:
# Load the German language model
nlp = spacy.load("de_core_news_sm")

# Three example titles, each with its own identifier
example_ids = ['1', '2', '3']
example_titles = [
    'Titlis-Tagesausflug mit privatem Reiseleiter ab Zürich.',
    'VIP-Erlebnis zum Comer See und nach Lugano.',
    'Private Schneewanderung im Ab Berner Oberland.',
]
df_example = pd.DataFrame({'id': example_ids, 'title_raw': example_titles})

texts = df_example['title_raw']

# Run every title through the pipeline and print each recognized entity
# together with its label
for title in texts:
    for ent in nlp(title).ents:
        print(ent.text, '|', ent.label_)


### Extract locations from Tripadvisor data

In [None]:
# Load the German language model (same model as in the example cell, reloaded here)
nlp = spacy.load("de_core_news_sm")

# Input for the extraction: the ID column and the title column of the scraped data
texts = df[['web-scraper-order', 'title_raw']]

# Function to extract locations, their labels, and web-scraper-order from original data
def extract_locations(texts):
    """Extract location entities from trip titles.

    Relies on the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    texts : pandas.DataFrame
        Must contain the columns 'web-scraper-order' (trip ID) and
        'title_raw' (title text).

    Returns
    -------
    list of tuple
        One ``(web-scraper-order, entity text, entity label)`` tuple per
        entity labelled "LOC" or "GPE", in row order.
    """
    # NOTE(review): the German models appear to emit only "LOC" for places;
    # "GPE" is kept so the function also works with models that use it.
    location_labels = {"LOC", "GPE"}

    # Pair every usable title with its ID. Missing (NaN) or otherwise
    # non-string titles cannot be processed by the pipeline and are skipped;
    # blank titles cannot contain entities.
    titles_with_ids = [
        (title, order_id)
        for order_id, title in zip(texts['web-scraper-order'], texts['title_raw'])
        if isinstance(title, str) and title.strip()
    ]

    locations = []
    # nlp.pipe processes the titles in batches and, with as_tuples=True,
    # hands back each document together with the ID it was paired with.
    for doc, order_id in nlp.pipe(titles_with_ids, as_tuples=True):
        locations.extend(
            (order_id, ent.text, ent.label_)
            for ent in doc.ents
            if ent.label_ in location_labels
        )
    return locations

# Run the extraction over all titles
locations_with_labels_ids = extract_locations(texts)

# Tabulate the (ID, location name, entity label) triples
location_columns = ['web-scraper-order', 'location', 'entity_type']
df_locations = pd.DataFrame(locations_with_labels_ids, columns=location_columns)

# Keep every (ID, location, label) combination only once
df_locations_unique = df_locations.drop_duplicates().reset_index(drop=True)

# Keep only location names of at most 25 characters
name_lengths = df_locations_unique['location'].map(len)
df_locations_filtered = df_locations_unique[name_lengths <= 25]

# Show the result
print(df_locations_filtered)

## Geocode locations

In [None]:
# Define the base URL for the GeoAdmin API
base_url = "https://api3.geo.admin.ch/rest/services/api/SearchServer?"

# Give up on a single request after this many seconds instead of waiting forever
request_timeout = 10


def geocode_location(location, session, cache):
    """Look up one location name with the GeoAdmin search service.

    Parameters
    ----------
    location : str
        Location name to search for.
    session : requests.Session
        Session used to send the request (reuses the HTTP connection).
    cache : dict
        Maps location names that were already answered to their
        ``(lat, lon)`` tuple. Updated in place.

    Returns
    -------
    tuple
        ``(lat, lon)`` of the first search hit, or ``(None, None)`` if the
        search returned nothing or the request failed.
    """
    if location in cache:
        return cache[location]

    # Set up search parameters for the location
    parameters = {
        "searchText": location,
        "origins": "address",
        "type": "locations"
    }

    try:
        response = session.get(base_url, params=parameters, timeout=request_timeout)
    except requests.RequestException as err:
        # A network problem for one location must not abort the whole run.
        # The failure is not cached, so a later row with the same name retries.
        print(f"Geocoding request failed for '{location}': {err}")
        return None, None

    if response.status_code != 200:
        return None, None

    try:
        results = response.json().get('results')
    except ValueError:
        # The response body was not valid JSON
        return None, None

    coordinates = (None, None)
    if results:
        first_result = results[0].get('attrs', {})
        coordinates = (first_result.get('lat'), first_result.get('lon'))

    # Only answered searches are cached
    cache[location] = coordinates
    return coordinates


# Initialize a list to store IDs, coordinates, or NA along with location names
coordinates_list = []

# The same location name occurs for many trips; each name is requested only once
geocode_cache = {}

# Loop through each location and ID in the DataFrame (with progress bar)
with requests.Session() as session:
    for idx, row in tqdm(df_locations_filtered.iterrows(),
                         total=df_locations_filtered.shape[0],
                         bar_format='{l_bar}{bar}'):
        lat, lon = geocode_location(row['location'], session, geocode_cache)
        coordinates_list.append([row['web-scraper-order'],
                                 row['location'], lat, lon])

# Convert the list to a DataFrame with specified column names
df_geocoded = pd.DataFrame(coordinates_list, columns=['web-scraper-order',
                                                      'location',
                                                      'latitude',
                                                      'longitude'])

# Display the DataFrame
df_geocoded


## Merge the original DataFrame with the geocoded DataFrame

In [None]:
# Columns of the scraped data that are carried into the merged table
trip_columns = ['web-scraper-order',
                'title_raw',
                'price_raw',
                'ranking_raw',
                'duration_raw',
                'text_raw']

# Left join on the trip ID: every trip is kept, and a trip with several
# geocoded locations appears once per location
merged_df_orig = df[trip_columns].merge(df_geocoded,
                                        on='web-scraper-order',
                                        how='left')
merged_df_orig

## Filter day trips with price information 'per adult'

In [None]:
# Check price 'per adult': 1 if the description contains the German phrase
# 'pro Erwachsenem', else 0.
# na=False counts a missing description as "no match"; without it the result
# would contain NaN and the conversion to int would fail.
# regex=False matches the phrase literally.
merged_df_orig['price_per_adult'] = (
    merged_df_orig['text_raw']
    .str.contains('pro Erwachsenem', regex=False, na=False)
    .astype(int)
)

# Filter only prices with information 'per adult'
merged_df = merged_df_orig[merged_df_orig['price_per_adult'] == 1].reset_index(drop=True)

# Show remaining number of rows
print(merged_df.shape)

# Display the DataFrame
merged_df.head()

## Analyze prices of day trips

In [None]:
# Extract the first number from the raw price text.
# Thousands groups may be separated by a typographic (’) or a plain (')
# apostrophe, and there may be more than one group (e.g. 1’234’567).
price_text = merged_df['price_raw'].str.strip()
price_text = price_text.str.extract(r"(\d+(?:[’']\d+)*)", expand=False)
price_text = price_text.str.replace(r"[’']", "", regex=True)

# Rows without a parseable price become NaN instead of raising an error
merged_df['price'] = pd.to_numeric(price_text, errors='coerce')

# Pivot table to get the average price for each location
pivot_table = merged_df.pivot_table(index='location',
                                    values='price',
                                    aggfunc='mean').reset_index()

# Show locations from most to least expensive (a bare expression in the
# middle of a cell is not displayed, so print it explicitly)
print(pivot_table.sort_values(by='price', ascending=False))

# Histogram with average price of trips
plt.figure(figsize=(6, 4))
plt.hist(pivot_table['price'])
plt.title('Avg. price of trips')
plt.xlabel('Price per adult (CHF)')
plt.ylabel('Frequency')
plt.grid()
plt.show()

# Filter most expensive trips (price of at least 1500)
most_expensive_trips = merged_df[merged_df['price'] >= 1500].sort_values(by='price', ascending=False)
most_expensive_trips[['web-scraper-order', 'title_raw', 'price', 'location', 'duration_raw', 'ranking_raw']].head()

## Analyze duration of day trips


In [None]:
# Extract numbers from the string
merged_df['max_duration'] = merged_df['duration_raw'].str.strip()
merged_df['max_duration'] = merged_df['max_duration'].str.extractall(r'(\d+)').astype(int).groupby(level=0).max()

# Pivot table to get the average duration for each trip
pivot_table = merged_df.pivot_table(index='location', 
                                    values='max_duration', 
                                    aggfunc='mean').reset_index()

sorted_pivot = pivot_table.sort_values(by='max_duration', ascending=False)

# Histogram with average max. duration of trips
plt.figure(figsize=(6, 4))
plt.hist(pivot_table['max_duration'])
plt.title('Avg. maximum duration of day trips')
plt.xlabel('Avg. maximum duration (hours)')
plt.ylabel('Frequency')
plt.xlim(0, 12)
plt.grid()
plt.show()


## Analyze rankings of day trips

In [None]:
# Extract numbers from the string
merged_df['ranking'] = merged_df['ranking_raw'].str.strip()
merged_df['ranking'] = merged_df['ranking'].str[:3].astype(float)

# Pivot table to get the average ranking for each trip
pivot_table = merged_df.pivot_table(index='location', 
                                    values='ranking', 
                                    aggfunc='mean').reset_index()

pivot_table.sort_values(by='ranking', ascending=False)

# Histogram with avg. ranking of trips
plt.figure(figsize=(6, 4))
plt.hist(pivot_table['ranking'])
plt.title('Avg. ranking of day trips')
plt.xlabel('Ranking (1 = worst ... 5 = best)')
plt.ylabel('Frequency')
plt.xlim(1, 5)
plt.grid()
plt.show()

merged_df.head()


## Plot locations on map

In [None]:
# Initialisierung der Map
m = folium.Map(location=[47.44, 8.65], zoom_start=8)

# Add lat/lon of addresses
df_sub = merged_df.dropna().drop_duplicates().reset_index(drop=True)

for i in range(0, len(df_sub)):
    popup_text = (
        f"Location: {df_sub.iloc[i]['location']}, "
        f"Ranking: {df_sub.iloc[i]['ranking']}, "
        f"Price per adult: {df_sub.iloc[i]['price']}, "
        f"Duration in hours: {df_sub.iloc[i]['max_duration']}"
    )
    popup = folium.Popup(popup_text, max_width=500)
    folium.Marker(location=(df_sub.iloc[i]['latitude'], df_sub.iloc[i]['longitude']), popup=popup).add_to(m)
    
# Layer control
folium.LayerControl().add_to(m)

# Plot map
m

### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: OS name, platform and release, current timestamp and Python
# version, framed by two separator lines
separator = '-----------------------------------'
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

footer_rows = [
    (separator,),
    (os.name.upper(),),
    (platform.system(), '|', platform.release()),
    ('Datetime:', timestamp),
    ('Python Version:', python_version()),
    (separator,),
]

# print() joins the parts of each row with a single space
for row in footer_rows:
    print(*row)