# Mapping places mentioned in a poem - an application of Natural Language Processing


Created on Wed Mar 14 15:00:15 2018

@author: Natacha Chenevoy

This script:
- Extracts the raw text from The Waste Land by T. S. Eliot from an html format
- Performs Part of Speech tagging to extract proper nouns
- Queries the names in Google Maps to get the corresponding lat/lon
(provided the proper noun corresponds to a place name)
- Displays the places on an interactive map using Bokeh


### Extracts the raw text from The Waste Land by T. S. Eliot from an html format 


In [None]:
import requests
import nltk
from bs4 import BeautifulSoup
nltk.download("punkt")

# Read HTML page. A timeout stops the cell hanging forever on a dead
# connection, and raise_for_status() prevents an HTTP error page from being
# tokenised as if it were the poem.
url = "http://www.gutenberg.org/files/1321/1321-h/1321-h.htm"
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text
raw = BeautifulSoup(html, "lxml").get_text()  # Without tags

# Cut down text: keep what lies between the end of the first occurrence of
# the start marker and the beginning of the last occurrence of the end marker.
start = "il miglior fabbro"
end = "Line 415 aetherial] aethereal"
start_index = raw.find(start)
end_pos = raw.rfind(end)
start_pos = start_index + len(start)
# str.find / str.rfind return -1 when the marker is missing, which would
# otherwise silently produce a wrong slice instead of an error.
if start_index == -1 or end_pos < start_pos:
    raise ValueError(
        "Could not locate the start/end markers in the downloaded page; "
        "the page layout may have changed."
    )
raw = raw[start_pos:end_pos]

# Tokenising
tokens = nltk.word_tokenize(raw)


text = nltk.Text(tokens)
# Number of tokens. Printed explicitly: a bare expression in the middle of a
# cell is evaluated and discarded, so the count was never shown.
print(len(text))

# 20 most common words
fdist = nltk.FreqDist(text)
print(fdist.most_common(20))
fdist.plot(50, cumulative=True)


# 20 most common word lengths (fdist is rebound to the length distribution)
fdist = nltk.FreqDist(len(w) for w in text)
print(fdist.most_common(20))


# All unique words over 10 letters long
sorted_words = sorted(set(text))  # Duplicates collapsed by set.
long_words = [w for w in sorted_words if len(w) > 10]
print(long_words)


### Perform Part of Speech tagging to extract proper nouns

In [None]:
# Part of Speech tagging: yields (token, tag) pairs
nltk.download('averaged_perceptron_tagger')
tagged = nltk.pos_tag(text)

# Keep the tokens tagged as proper nouns ("NNP"), discarding false positives:
# tokens that are not purely alphabetic and tokens written entirely in
# upper case.
proper_nouns = [
    word
    for word, pos in tagged
    if pos == "NNP" and word.isalpha() and not word.isupper()
]

### Query Google Maps for lat/lon of places in poem

In [None]:
import time

# Replace the value below with your personal API key:
mykey = ""

GOOGLE_MAPS_API_URL = 'https://maps.googleapis.com/maps/api/geocode/json'

import pandas as pd

# Collect one dict per geocoded noun and build the DataFrame once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# it copied the whole frame on every call.
rows = []

for noun in proper_nouns:

    params = {
        'address': noun,
        'key': mykey,
    }

    # Do the request and get the response data. The timeout keeps one stalled
    # request from blocking the whole loop indefinitely.
    req = requests.get(GOOGLE_MAPS_API_URL, params=params, timeout=30)
    res = req.json()

    # Nouns with no geocoding result are skipped.
    if res['results']:

        # Use the first result
        result = res['results'][0]
        location = result['geometry']['location']

        geodata = {
            'lat': location['lat'],
            'lng': location['lng'],
            'address': result['formatted_address'],
            'name': noun,
        }

        print('{address}. (lat, lng) = ({lat}, {lng})'.format(**geodata))
        # 221B Baker Street, London, Greater London NW1 6XE, UK. (lat, lng) = (51.5237038, -0.1585531)
        rows.append(geodata)
        # Wait for 5 seconds (presumably to stay under the API rate limit)
        time.sleep(5)

# Explicit columns keep the CSV header present even when nothing was geocoded,
# so the file can still be read back by column name.
df = pd.DataFrame(rows, columns=['lat', 'lng', 'address', 'name'])
df.to_csv("places.csv")

### Create interactive map with Bokeh

In [7]:
# Need to install pyproj (using conda or pip)


from pyproj import Proj, transform #  for projection from lat/long to mercator
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.tile_providers import CARTODBPOSITRON
from bokeh.models import (
  GMapPlot, GMapOptions, ColumnDataSource, Circle, Range1d, PanTool, WheelZoomTool, BoxSelectTool, HoverTool
)

import os
import pandas as pd

# Directory holding places.csv on the original author's machine. Only switch
# to it when it exists; on any other machine places.csv is read from the
# current working directory instead of crashing on a missing path.
data_dir = "/Users/natachachenevoy/Documents/AdvancedProgrammingSkills/NLP"
if os.path.isdir(data_dir):
    os.chdir(data_dir)

# open list of places found on google API
# NOTE(review): DataFrame.to_csv writes UTF-8 by default, so reading with
# cp1252 may garble non-ASCII addresses -- confirm how places.csv was saved.
places = pd.read_csv("places.csv", encoding="cp1252")

# defining the tools
hover = HoverTool(tooltips=[
    ("name", "@name"),
    ("address", "@address"),
    ("(lat, long)", "(@lat, @long)"),
])

# Each tool needs its own name: previously the PanTool was assigned to
# `wheel_zoom`, overwriting the WheelZoomTool so it never reached the figure.
wheel_zoom = WheelZoomTool()
pan = PanTool()


bound = 20000000  # meters

#### Figure ####
# setting the active tools
fig = figure(tools=[hover, wheel_zoom, pan], x_range=(-bound, bound), y_range=(-bound, bound))
fig.axis.visible = False
fig.add_tile(CARTODBPOSITRON)
fig.title.text = "Map of places in poem"

# Project lon/lat (EPSG:4326) to web mercator (EPSG:3857), the coordinate
# system of the tile layer.
inProj = Proj(init='epsg:4326')
outProj = Proj(init='epsg:3857')

# transform() accepts whole sequences, so all points are projected in one
# call instead of once per row; list inputs give list outputs.
list_lng, list_lat = transform(inProj, outProj, places.lng.tolist(), places.lat.tolist())


# 'X'/'Y' are the projected plotting coordinates; 'lat'/'long' keep the
# original degrees for the hover tooltip.
d = {'name': places.name, 'address': places.address, 'X': list_lng, 'Y': list_lat, 'lat': places.lat, 'long': places.lng}
source = ColumnDataSource(d)

circle = fig.circle(x='X', y='Y', alpha=0.9, source=source)

output_notebook()
show(fig)