<a href="https://colab.research.google.com/github/nonoumasy/War-and-Peace-locations/blob/master/War_and_Peace_locations_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project Goals:

Extract Place Entities (Cities, Locations, etc) from any document(novel, news articles, etc) and plot them on a Map.

User Flow:

- Enter document(pdf, text file, doc, etc) 
- Extract Locations
- Find Long and Lat values for location entities extracted 
- Create WordCloud of Locations
- Plot those coordinates on a map




In [11]:
import pandas as pd 
import folium
from folium import plugins
import string
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
from collections import Counter
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
#load cities
cities_df = pd.read_csv('https://raw.githubusercontent.com/datasets/world-cities/master/data/world-cities.csv')
#geo_df = geo_df[['name','country']]

In [3]:
cities_df.sample(20)

Unnamed: 0,name,country,subcountry,geonameid
6325,Manresa,Spain,Catalonia,3117533
21267,Rutherford,United States,New Jersey,5103580
13983,Ciudad de Huajuapan de León,Mexico,Oaxaca,3527023
709,Kālia,Bangladesh,Khulna,1185272
21718,Atwater,United States,California,5325187
22725,Hà Đông,Vietnam,Ha Nội,1581364
10393,Gubbi,India,Karnataka,1270824
12097,Safi,Jordan,Karak,250267
2109,Canguçu,Brazil,Rio Grande do Sul,3467512
17994,Abqaiq,Saudi Arabia,Eastern Province,107312


In [0]:
#known_locations |= set(geo_df['name'])y=''
known_locations = set(cities_df['name'])

#stop words
stop_words = set(stopwords.words('english'))

#punctuation
punct_table = str.maketrans('', '', string.punctuation)
def punct_strip(text):
    return text.translate(punct_table)

def get_words(text):
    tokens = word_tokenize(text)
    stripped = (punct_strip(t) for t in tokens)
    words = [w for w in stripped if not w in stop_words]
    return words

loc_counter = Counter()

with open('/content/tolstoy.txt', 'r') as f:
    lines = [f.readline() for _ in range(200000)]
    for line in lines:
        words = get_words(line)
        loc_counter.update(w for w in words if w in known_locations)

In [32]:
loc_counter

Counter({'Alliance': 4,
         'Along': 4,
         'Amstetten': 1,
         'Anna': 293,
         'Asia': 1,
         'Baden': 2,
         'Bear': 1,
         'Berlin': 7,
         'Bismarck': 2,
         'Bordeaux': 3,
         'Bremen': 1,
         'Bucharest': 2,
         'Central': 2,
         'Columbus': 1,
         'Constantine': 3,
         'Copenhagen': 1,
         'Date': 1,
         'David': 5,
         'Dover': 1,
         'Dresden': 3,
         'Elizabeth': 2,
         'Erfurt': 6,
         'Essen': 1,
         'Fairbanks': 1,
         'Frederick': 4,
         'Genoa': 3,
         'George': 31,
         'Hamburg': 2,
         'Helena': 3,
         'Hercules': 2,
         'Ho': 3,
         'Imperial': 13,
         'Jena': 5,
         'Jerusalem': 4,
         'Kari': 1,
         'Kiev': 17,
         'Kursk': 1,
         'Liberty': 1,
         'Lichtenfels': 1,
         'Linz': 1,
         'Lodi': 1,
         'Lombard': 1,
         'London': 1,
         'Lucca': 2,
        

In [0]:
#get lontitude and latitude for cities
geolocator = Nominatim(user_agent="xray", timeout=5)

def lat_lon(city):
    loc = geolocator.geocode(city)
    if loc is None:
        raise AttributeError(f'city not found -- {city}')
    return {'city': city, 
            'lat': loc.latitude,
            'lon': loc.longitude}

coordinates = []
for city in loc_counter:
    try:
        coordinates.append(lat_lon(city))
    except AttributeError:
        pass
    
loc_df = pd.DataFrame(coordinates)

In [24]:
loc_df.city

0            Date
1           David
2           Genoa
3           Lucca
4            Anna
5          Vienna
6            Mary
7          Moscow
8           Paris
9          George
10      Elizabeth
11       Imperial
12            Mon
13        Liberty
14        Semënov
15       Hercules
16          Dover
17             Of
18           Most
19     Providence
20      Stralsund
21         Naples
22    Marlborough
23             Ho
24      Frederick
25           York
26        Lombard
27          Along
28            Ulm
29          Traun
30           Linz
31      Amstetten
32     Copenhagen
33    Lichtenfels
34         Berlin
35         London
36         Toulon
37           Kiev
38          March
39           Kari
40        Potsdam
41            Man
42    Constantine
43           Much
44          Essen
45          Young
46           Nice
47          Kursk
48          Mason
Name: city, dtype: object

In [25]:
from folium.plugins import FastMarkerCluster
# plot cities on map

m = folium.Map(
    location= [55, 36],
    zoom_start=5,
    min_zoom=5,
    no_wrap=True,
    tiles = "Stamen Watercolor",
    crs='EPSG3857'
    )

loc_list = loc_df[['lat', 'lon']].values.tolist()

for i in range(len(loc_df)):
    folium.CircleMarker(location=loc_list[i], 
                        radius=10,
                        fill=True,
                        color='#40B8AF',
                        popup=loc_df.city[i],
                        ).add_to(m)

folium.plugins.Fullscreen(position='bottomright').add_to(m)

#m.save('tolstoy.html')
m

In [0]:
loc_df

Unnamed: 0,city,lat,lon
0,Genoa,44.40726,8.933862
1,Lucca,43.842838,10.502876
2,Anna,10.416667,77.666667
3,Vienna,48.208354,16.372504
4,Mary,37.594136,61.839767
5,Moscow,55.750446,37.617494
6,Paris,48.85661,2.351499
7,George,-33.964444,22.459722
8,Elizabeth,40.663992,-74.210701
9,Imperial,33.030549,-115.359567


In [0]:
geolocator = Nominatim(user_agent="xray", timeout=5)
geolocator.geocode('brno')

Location(Brno, okres Brno-město, Jihomoravský kraj, Jihovýchod, Česko, (49.1922443, 16.6113382, 0.0))

<_io.TextIOWrapper name='/Users/nonoumasy/Downloads/data/tolstoy.txt' mode='rt' encoding='UTF-8'>

In [0]:
with open('/Users/nonoumasy/Downloads/data/tolstoy.txt', 'rt') as f:
    lines = [f.readline() for _ in range(20000)]
    for line in lines:
        words = get_words(line)
        words.

AttributeError: 'generator' object has no attribute 'str'

In [0]:
loc_counter

Counter({'Genoa': 3,
         'Lucca': 2,
         'Anna': 238,
         'Vienna': 38,
         'Mary': 153,
         'Moscow': 87,
         'Paris': 9,
         'George': 7,
         'Elizabeth': 2,
         'Imperial': 7,
         'Mon': 5,
         'Liberty': 1,
         'Semënov': 4,
         'Hercules': 1,
         'Dover': 1,
         'Of': 20,
         'Most': 6,
         'Providence': 1,
         'Stralsund': 1,
         'Naples': 1,
         'Marlborough': 1,
         'Ho': 2,
         'Frederick': 3,
         'York': 1,
         'Lombard': 1,
         'Along': 3,
         'Ulm': 12,
         'Traun': 1,
         'Linz': 1,
         'Amstetten': 1,
         'Copenhagen': 1,
         'Lichtenfels': 1,
         'Berlin': 3,
         'London': 1,
         'Toulon': 4,
         'Kiev': 6,
         'March': 6,
         'Kari': 1,
         'Potsdam': 3,
         'Man': 1,
         'Constantine': 1,
         'Much': 3,
         'Essen': 1,
         'Young': 3,
         'Nice': 1,
   

In [0]:
words

['fine', 'speeches', 'rude', 'savage', 'way', 'throws']

In [0]:
geo_df

Unnamed: 0,name,country
0,les Escaldes,Andorra
1,Andorra la Vella,Andorra
2,Umm al Qaywayn,United Arab Emirates
3,Ras al-Khaimah,United Arab Emirates
4,Khawr Fakkān,United Arab Emirates
5,Dubai,United Arab Emirates
6,Dibba Al-Fujairah,United Arab Emirates
7,Dibba Al-Hisn,United Arab Emirates
8,Sharjah,United Arab Emirates
9,Ar Ruways,United Arab Emirates
