<a href="https://colab.research.google.com/github/phenningsson/travels_of_hunt/blob/main/hunt_travels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Install the required packages
# spaCy for Namned Entity Recognition and pre-processing of the text
# textblob for sentiment analysis
# geopy for getting coordinates of extracted locations
# Note: in order for the below code to work, you need to download the txt file
# of Eleonora Hunt's travelogue "My Trip Around the World", available here
# from Project Gutenberg: https://www.gutenberg.org/ebooks/33079
# (also remove the Gutenberg disclaimers from the beginning and end of the text file
# since we are interested in the text from the book itself)
# Many thanks to Project Gutenberg for providing books for free publicly online
!pip install spacy
!pip install textblob
!pip install geopy
!python -m spacy download en_core_web_lg


Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [11]:
# The below code in this cell extracts all the locations found
# by spaCys NER. While interesting, it is far too many locations
# and falls outside the scope of this project

import spacy
import pandas as pd
import re
from geopy.geocoders import Nominatim
import time

# Load spaCy's large model (better Namned Entity Recognition accuracy)
nlp = spacy.load('en_core_web_lg')

# Initialize geolocator using Nominatim
geolocator = Nominatim(user_agent="ph223ed@student.lnu.se")

# Get latitude and longitude coordinates of location, throw error message if not found
def get_coordinates(location_name):
    try:
        # Add delay to the Nominatim API calls
        time.sleep(1)
        location = geolocator.geocode(location_name)
        if location:
            return (location.latitude, location.longitude)
        return None
    except Exception as e:
        print(f"Error getting coordinates for {location_name}: {e}")
        return None

# Normalize name of location (some are spelled in all caps in the text)
def normalize_location_name(location):
    # Convert to string and remove whitespaces in the beginning and end
    location = str(location).strip()

    # Convert to title case
    location = location.title()

    # Remove 'The' from beginning if present
    if location.startswith('The '):
        location = location[4:]

    # Remove periods and redundant whitespaces
    location = re.sub(r'\.', '', location)
    location = re.sub(r'\s+', ' ', location)

    return location.strip()

# Extract locations with context (text that mentions/describes the location)
def extract_locations_with_context(text):
    doc = nlp(text)
    locations = []

    for ent in doc.ents:
        if ent.label_ in ['GPE', 'LOC']:  # GPE for cities/countries, LOC for locations
            # Get the sentence containing the location for context
            sent = [sent for sent in doc.sents if ent.start >= sent.start and ent.end <= sent.end][0]

            # Normalize location name
            location_name = normalize_location_name(ent.text)

            # Get coordinates
            coordinates = get_coordinates(location_name)

            locations.append({
                'original_name': ent.text,
                'normalized_name': location_name,
                'entity_type': ent.label_,
                'context': sent.text,
                'coordinates': coordinates
            })

    return locations

# Read text file
with open('hunt_travel.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Get the locations and store in a data frame
locations = extract_locations_with_context(text)
df = pd.DataFrame(locations)

# Split coordinates column into separate latitude and longitude columns
df['latitude'] = df['coordinates'].apply(lambda x: x[0] if x else None)
df['longitude'] = df['coordinates'].apply(lambda x: x[1] if x else None)

# Drop the coordinates column
df = df.drop('coordinates', axis=1)

# Sort by normalized name (but keep duplicates since we want all mentions of a given location)
df = df.sort_values('normalized_name')

# Print the total of all location mentions
# Print sample of locations along with their extracted coordinates
print(f"Found {len(df)} location mentions")
print("\nSample of locations with coordinates:")
print(df[['normalized_name', 'latitude', 'longitude']].head())

# Save name of location and its coordinates to a CSV file
df.to_csv('extracted_locations_with_coordinates.csv', index=False)



Found 502 location mentions

Sample of locations with coordinates:
    normalized_name   latitude  longitude
325           Abdin  13.566941  29.637362
335       Abyssinia  10.211670  38.652120
248            Aden  12.789585  45.028504
247            Aden  12.789585  45.028504
253            Aden  12.789585  45.028504


In [12]:
# The below code in this cell is used specifically for this project, and
# does the same as the code above but only looks at the main stops from the text

import spacy
import pandas as pd
import re
from geopy.geocoders import Nominatim
import time

# Load spaCy's large model (better Namned Entity Recognition accuracy)
nlp = spacy.load('en_core_web_lg')

# Define the main stops in chronological order with the same spelling as they occur in the text
main_stops = [
    'Chicago',
    'Glaciers',
    'Vancouver',
    'Yokohama',
    'Niko',
    'Tokio',
    'Kobe',
    'Nagasaki',
    'Shanghai',
    'Hongkong',
    'Canton',
    'Saigon',
    'Singapore',
    'Kandy',
    'Calcutta',
    'Benares',
    'Aigra',
    'Delhi',
    'Jeypore',
    'Bombay',
    'Aden',
    'Ismalia',
    'Cairo',
    'Luxor',
    'Brindisi',
    'Marseilles',
    'Paris',
    'London',
    'New York'
]

# Initialize geolocator using Nominatim
geolocator = Nominatim(user_agent="ph223ed@student.lnu.se")

# Get latitude and longitude coordinates of location, throws error message if not found
def get_coordinates(location_name):
    try:
          # Add delay to the Nominatim API calls to respect API limits
        time.sleep(1)
        location = geolocator.geocode(location_name)
        if location:
            return (location.latitude, location.longitude)
        return None
    except Exception as e:
        print(f"Error getting coordinates for {location_name}: {e}")
        return None

def extract_locations_with_context(text):
    doc = nlp(text)
    locations = []

    # Process each sentence
    for sent in doc.sents:
        # Check if any main stop is mentioned in the sentence
        for location in main_stops:
            if location.lower() in sent.text.lower():
                # Get surrounding sentences for context
                sent_index = list(doc.sents).index(sent)
                start_index = max(0, sent_index - 1)
                end_index = min(len(list(doc.sents)), sent_index + 2)

                # Get context (2 sentences, one before and one after location mention)
                context_sents = list(doc.sents)[start_index:end_index]
                context = ' '.join([s.text.strip() for s in context_sents])

                # Get coordinates
                coordinates = get_coordinates(location)

                locations.append({
                    'original_name': location,
                    'normalized_name': location,  # We're using original spellings
                    'context': context,
                    'coordinates': coordinates
                })

    return locations

# Read the text file
with open('hunt_travel.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Get locations and store in a data frame
locations = extract_locations_with_context(text)
df = pd.DataFrame(locations)

# Split coordinates into separate latitude and longitude columns
df['latitude'] = df['coordinates'].apply(lambda x: x[0] if x else None)
df['longitude'] = df['coordinates'].apply(lambda x: x[1] if x else None)

# Drop the coordinates column
df = df.drop('coordinates', axis=1)

# Sort by the order in main_stops (to maintain chronological order)
df['sort_order'] = df['original_name'].map({name: i for i, name in enumerate(main_stops)})
df = df.sort_values('sort_order')
df = df.drop('sort_order', axis=1)

# Print the total number of location mentions
# Print sample of locations along with their extracted coordinates
print(f"Found {len(df)} location mentions")
print("\nSample of locations with coordinates:")
print(df[['original_name', 'normalized_name', 'latitude', 'longitude']].head())

# # Save name of location and its coordinates to a CSV file
df.to_csv('ext_loc_cord.csv', index=False)



Found 195 location mentions

Sample of locations with coordinates:
    original_name normalized_name   latitude  longitude
0         Chicago         Chicago  41.875562 -87.624421
79        Chicago         Chicago  41.875562 -87.624421
80        Chicago         Chicago  41.875562 -87.624421
127       Chicago         Chicago  41.875562 -87.624421
169       Chicago         Chicago  41.875562 -87.624421


In [13]:
import pandas as pd
from textblob import TextBlob

# Read the CSV file created in the above cell containing locations and their coordinates
df = pd.read_csv('/content/ext_loc_cord.csv')

# Calculate sentiment scores
def get_sentiment(text):
    blob = TextBlob(str(text))
    return blob.sentiment.polarity

# Add sentiment scores to data frame
df['sentiment_score'] = df['context'].apply(get_sentiment)

# Save all entries with their individual sentiment scores to new CSV file
detailed_sentiments = df[['original_name', 'normalized_name', 'context', 'sentiment_score', 'latitude', 'longitude']]
detailed_sentiments.to_csv('ind_loc_sentiments.csv', index=False)

# Calculate average sentiment for locations with multiple mentions,
# also calculates standard deviation of sentiment score and the location mention count
location_sentiments = df.groupby('normalized_name').agg({
    'sentiment_score': ['mean', 'count', 'std'],
    'latitude': 'first',
    'longitude': 'first'
}).round(3)

# Clean up the column names
location_sentiments.columns = ['avg_sentiment', 'mention_count', 'sentiment_std', 'latitude', 'longitude']
location_sentiments = location_sentiments.reset_index()

# Sort by average sentiment
location_sentiments = location_sentiments.sort_values('avg_sentiment', ascending=False)

# Save average sentiments to new CSV file
location_sentiments.to_csv('avg_loc_sentiments.csv', index=False)

# Calculate overall sentiment statistics; average sentiment, most positive and negative
overall_avg_sentiment = df['sentiment_score'].mean()
most_positive = location_sentiments.iloc[0]
most_negative = location_sentiments.iloc[-1]

# Print sentiment statistics
print("\nSentiment Analysis Summary:")
print(f"Average sentiment across all mentions: {overall_avg_sentiment:.3f}")
print(f"Most positive location: {most_positive['normalized_name']} ({most_positive['avg_sentiment']:.3f})")
print(f"Most negative location: {most_negative['normalized_name']} ({most_negative['avg_sentiment']:.3f})")


Sentiment Analysis Summary:
Average sentiment across all mentions: 0.155
Most positive location: Yokohama (0.334)
Most negative location: Brindisi (-0.123)


In [14]:
import pandas as pd

# Create a dictionary mapping locations to their chronological order
chronological_order = {
    'Chicago': 1,
    'Glaciers': 2,
    'Vancouver': 3,
    'Yokohama': 4,
    'Niko': 5,
    'Tokio': 6,
    'Kobe': 7,
    'Nagasaki': 8,
    'Shanghai': 9,
    'Hongkong': 10,
    'Canton': 11,
    'Saigon': 12,
    'Singapore': 13,
    'Kandy': 14,
    'Calcutta': 15,
    'Benares': 16,
    'Aigra': 17,
    'Delhi': 18,
    'Jeypore': 19,
    'Bombay': 20,
    'Aden': 21,
    'Ismalia': 22,
    'Cairo': 23,
    'Luxor': 24,
    'Brindisi': 25,
    'Marseilles': 26,
    'Paris': 27,
    'London': 28,
    'New York': 29
}

# Read the CSV file created in the above cell
df = pd.read_csv('avg_loc_sentiments.csv')

# Add chronological order column, used for mapping the order of stops in QGIS
df['chronological_order'] = df['normalized_name'].map(chronological_order)

# Sort by chronological order
df = df.sort_values('chronological_order')

# Save to new CSV file
df.to_csv('avg_loc_sentiments_ordered.csv', index=False)

# Print first few rows to verify
print(df[['normalized_name', 'chronological_order', 'avg_sentiment']].head())

   normalized_name  chronological_order  avg_sentiment
5          Chicago                    1          0.212
27        Glaciers                    2         -0.012
18       Vancouver                    3          0.126
0         Yokohama                    4          0.334
21            Niko                    5          0.094


In [15]:
import pandas as pd

# Create a dictionary with arrival and departure times
# Based on explicit or approximate arrival and departure times in the text
# For simplicity's sake, only takes into account first arrival and departure time
# (For instance, Cairo is being visisted twice, the second arrival and departure date is disregarded in the below code)
time_data = {
    'Chicago': {'arrival': '1895-08-18T00:00:00Z', 'departure': '1895-08-19T00:00:00Z'},
    'Glaciers': {'arrival': '1895-08-22T00:00:00Z', 'departure': '1895-08-23T00:00:00Z'},
    'Vancouver': {'arrival': '1895-08-24T00:00:00Z', 'departure': '1895-08-26T00:00:00Z'},
    'Yokohama': {'arrival': '1895-09-09T00:00:00Z', 'departure': '1895-09-10T00:00:00Z'},
    'Niko': {'arrival': '1895-09-11T00:00:00Z', 'departure': '1895-09-12T00:00:00Z'},
    'Tokio': {'arrival': '1895-09-13T00:00:00Z', 'departure': '1895-09-14T00:00:00Z'},
    'Kobe': {'arrival': '1895-09-17T00:00:00Z', 'departure': '1895-09-18T00:00:00Z'},
    'Nagasaki': {'arrival': '1895-10-01T00:00:00Z', 'departure': '1895-10-02T00:00:00Z'},
    'Shanghai': {'arrival': '1895-10-04T00:00:00Z', 'departure': '1895-10-07T00:00:00Z'},
    'Hongkong': {'arrival': '1895-10-08T00:00:00Z', 'departure': '1895-10-09T00:00:00Z'},
    'Canton': {'arrival': '1895-10-10T00:00:00Z', 'departure': '1895-10-11T00:00:00Z'},
    'Saigon': {'arrival': '1895-10-19T00:00:00Z', 'departure': '1895-10-20T00:00:00Z'},
    'Singapore': {'arrival': '1895-10-22T00:00:00Z', 'departure': '1895-10-23T00:00:00Z'},
    'Kandy': {'arrival': '1895-10-30T00:00:00Z', 'departure': '1895-11-01T00:00:00Z'},
    'Calcutta': {'arrival': '1895-11-05T00:00:00Z', 'departure': '1895-11-17T00:00:00Z'},
    'Benares': {'arrival': '1895-11-18T00:00:00Z', 'departure': '1895-11-19T00:00:00Z'},
    'Aigra': {'arrival': '1895-11-20T00:00:00Z', 'departure': '1895-11-21T00:00:00Z'},
    'Delhi': {'arrival': '1895-11-22T00:00:00Z', 'departure': '1895-11-23T00:00:00Z'},
    'Jeypore': {'arrival': '1895-11-28T00:00:00Z', 'departure': '1895-12-02T00:00:00Z'},
    'Bombay': {'arrival': '1895-12-04T00:00:00Z', 'departure': '1895-12-16T00:00:00Z'},
    'Aden': {'arrival': '1895-12-17T11:00:00Z', 'departure': '1895-12-17T23:00:00Z'},
    'Ismalia': {'arrival': '1895-12-21T12:30:00Z', 'departure': '1895-12-21T18:00:00Z'},
    'Cairo': {'arrival': '1895-12-22T18:00:00Z', 'departure': '1895-12-24T00:00:00Z'},
    'Luxor': {'arrival': '1896-01-01T00:00:00Z', 'departure': '1896-01-18T00:00:00Z'},
    'Brindisi': {'arrival': '1896-02-01T00:00:00Z', 'departure': '1896-02-02T00:00:00Z'},
    'Marseilles': {'arrival': '1896-02-07T00:00:00Z', 'departure': '1896-02-08T00:00:00Z'},
    'Paris': {'arrival': '1896-02-09T00:00:00Z', 'departure': '1896-02-15T00:00:00Z'},
    'London': {'arrival': '1896-02-16T00:00:00Z', 'departure': '1896-02-28T00:00:00Z'},
    'New York': {'arrival': '1896-03-10T00:00:00Z', 'departure': '1896-03-11T00:00:00Z'}
}

# Read CSV file created in the above cell (does not contain arrival departure time)
df = pd.read_csv('avg_loc_sentiments_ordered.csv')

# Add arrival and departure time columns to the dataset
df['arrival_time'] = df['normalized_name'].map(lambda x: time_data[x]['arrival'] if x in time_data else None)
df['departure_time'] = df['normalized_name'].map(lambda x: time_data[x]['departure'] if x in time_data else None)

# Save updated dataset to new CSV file
df.to_csv('loc_with_times.csv', index=False)