In [1]:
import json
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


# Part 1 - Scrape Wikipedia Page of Paris Monuments

In [2]:
# Download the French Wikipedia page listing the monuments and sites of Paris.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on a 4xx/5xx answer instead of letting
# the following cells parse an error page as if it were the article.
wiki_monuments = requests.get(
    "https://fr.wikipedia.org/wiki/Monuments_et_sites_de_Paris",
    timeout=30,
)
wiki_monuments.raise_for_status()

In [3]:
# Turn the raw page source into a searchable parse tree.
html = wiki_monuments.text
soup = BeautifulSoup(markup=html, features='html.parser')

In [4]:
# Monument categories: the text of every <h3> heading directly under the article body
categories = [
    heading.get_text()
    for heading in soup.select("#mw-content-text > div.mw-parser-output > h3")
]


In [5]:
# Monument names --> every <ul> directly under the article body yields one
# newline-separated string holding the monuments of one category
monuments = [
    listing.get_text()
    for listing in soup.select("#mw-content-text > div.mw-parser-output > ul")
]

In [6]:
# Keep only the first 10 categories so the geocoding step later on does not
# use too many API calls
monuments, categories = monuments[:10], categories[:10]

In [7]:
# Assemble the scraped lists into a DataFrame: one row per category at this stage
paris_dict = dict(monument=monuments, category=categories)
paris = pd.DataFrame(data=paris_dict)

In [8]:
# Let long cell values (up to 500 characters) be displayed without truncation.
pd.set_option("display.max_colwidth", 500)

In [9]:
# Preview the first two categories with their raw, newline-separated monument text.
paris.head(n=2)

Unnamed: 0,monument,category
0,Arc de triomphe du Carrousel\nArc de triomphe de l'Étoile\nArche de la Défense\nPorte Saint-Denis\nPorte Saint-Martin,Arcs et arches[modifier | modifier le code]
1,"Colonne Médicis\nColonne de Juillet, place de la Bastille\nColonne Vendôme, place Vendôme\nObélisque de Louxor, place de la Concorde\nBarrière du Trône, avenue du Trône",Colonnes et obélisques[modifier | modifier le code]


In [10]:
# Turn each category's text blob into a list of monument names; the names were
# separated by "\n" in the scraped HTML text.
paris["monument"] = paris["monument"].map(lambda names: names.split("\n"))

In [11]:
# Same preview as before, now showing one list of monument names per category.
paris.head(n=2)

Unnamed: 0,monument,category
0,"[Arc de triomphe du Carrousel, Arc de triomphe de l'Étoile, Arche de la Défense, Porte Saint-Denis, Porte Saint-Martin]",Arcs et arches[modifier | modifier le code]
1,"[Colonne Médicis, Colonne de Juillet, place de la Bastille, Colonne Vendôme, place Vendôme, Obélisque de Louxor, place de la Concorde, Barrière du Trône, avenue du Trône]",Colonnes et obélisques[modifier | modifier le code]


In [12]:
# explode() gives every element of the "monument" lists its own row,
# repeating the category value alongside it
paris = paris.explode(column="monument")


In [13]:
# Rebuild a clean 0..n-1 row index after the explode step; drop=True discards
# the old index instead of keeping it as an extra column.
# NOTE(review): the original author reports that explode's own index argument
# did not work for them; recent pandas versions appear to offer
# `ignore_index=True` on explode, which would make this cell unnecessary --
# confirm against the installed pandas version.
paris = paris.reset_index(drop= True)

In [14]:
# Strip the Wikipedia edit-link residue ("[modifier | modifier le code]") from
# the category names: split at "[modifier" and keep the part before it.
# The backslash escapes "[" because the pattern is treated as a regex.
paris["category"] = paris["category"].str.split(r'\[modifier').str.get(0)

In [16]:
# The resulting, fairly clean DataFrame: one monument per row with its category.
# The monument string could be split further (name vs. location), but that is
# not needed for now.
paris.head(n=20)

Unnamed: 0,monument,category
0,Arc de triomphe du Carrousel,Arcs et arches
1,Arc de triomphe de l'Étoile,Arcs et arches
2,Arche de la Défense,Arcs et arches
3,Porte Saint-Denis,Arcs et arches
4,Porte Saint-Martin,Arcs et arches
5,Colonne Médicis,Colonnes et obélisques
6,"Colonne de Juillet, place de la Bastille",Colonnes et obélisques
7,"Colonne Vendôme, place Vendôme",Colonnes et obélisques
8,"Obélisque de Louxor, place de la Concorde",Colonnes et obélisques
9,"Barrière du Trône, avenue du Trône",Colonnes et obélisques


# Part 2 - Geolocation API

In [17]:
# defining API settings
# Read the Google Geocoding key from the GOOGLE_MAPS_API_KEY environment
# variable so the real key never has to be pasted into (and shared with) the
# notebook; the historical placeholder stays as the fallback when it is unset.
api_key = os.environ.get('GOOGLE_MAPS_API_KEY', 'your key')
# Module-level placeholder; the helper functions below take their own
# `address` argument.
address = ""
# Endpoint prefix: the address and the "&key=..." part are appended to it.
base_url = 'https://maps.googleapis.com/maps/api/geocode/json?address='


In [18]:
# function that builds the geocoding API URL for a given address
def give_url(address):
    """Return the geocoding request URL for ``address``.

    The address and the API key are percent-encoded before being appended to
    ``base_url``. Spaces still become ``+``, and characters that are
    meaningful inside a query string (``&``, ``#``, ``=``, ``+`` ...) are now
    escaped instead of silently corrupting the request.

    Parameters
    ----------
    address : str
        Free-text address or place name.

    Returns
    -------
    str
        ``base_url`` + encoded address + ``'&key='`` + encoded ``api_key``.

    Raises
    ------
    TypeError
        If ``address`` is not a string (e.g. a missing value).
    """
    # Local import keeps this cell runnable on its own.
    from urllib.parse import quote_plus

    # Explicit guard: string concatenation used to raise TypeError for
    # non-string input, and the geocoding helpers rely on that exception.
    if not isinstance(address, str):
        raise TypeError("address must be a str, got %s" % type(address).__name__)

    return base_url + quote_plus(address) + '&key=' + quote_plus(api_key)

In [19]:
# function that returns the longitude of the input address
def get_longitude(address, timeout=10):
    """Geocode ``address`` and return its longitude.

    Builds the request URL with ``give_url``, performs one HTTP GET and reads
    ``results[0]['geometry']['location']['lng']`` from the JSON answer.

    Parameters
    ----------
    address : str
        Free-text address or place name to look up.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 10),
        so one stalled request cannot block the whole notebook.

    Returns
    -------
    float
        Longitude of the first result, or ``np.nan`` when the address is not
        a string or the answer holds no usable first result.

    Raises
    ------
    Network errors (including timeouts) and invalid JSON are not swallowed;
    they propagate to the caller.
    """
    try:
        response = requests.get(give_url(address), timeout=timeout)
        payload = json.loads(response.text)
        location = payload['results'][0]['geometry']['location']
        return float(location['lng'])
    # error handling: TypeError -> non-string address or non-numeric value,
    # IndexError -> empty result list, KeyError -> payload without the
    # expected keys. `np.nan` is used because the `np.NaN` alias no longer
    # exists in NumPy 2.x.
    except (TypeError, IndexError, KeyError):
        return np.nan

In [20]:
# function that returns the latitude of the input address
def get_latitude(address, timeout=10):
    """Geocode ``address`` and return its latitude.

    Builds the request URL with ``give_url``, performs one HTTP GET and reads
    ``results[0]['geometry']['location']['lat']`` from the JSON answer.

    Parameters
    ----------
    address : str
        Free-text address or place name to look up.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 10),
        so one stalled request cannot block the whole notebook.

    Returns
    -------
    float
        Latitude of the first result, or ``np.nan`` when the address is not
        a string or the answer holds no usable first result.

    Raises
    ------
    Network errors (including timeouts) and invalid JSON are not swallowed;
    they propagate to the caller.
    """
    try:
        response = requests.get(give_url(address), timeout=timeout)
        payload = json.loads(response.text)
        location = payload['results'][0]['geometry']['location']
        return float(location['lat'])
    # error handling: TypeError -> non-string address or non-numeric value,
    # IndexError -> empty result list, KeyError -> payload without the
    # expected keys. `np.nan` is used because the `np.NaN` alias no longer
    # exists in NumPy 2.x.
    except (TypeError, IndexError, KeyError):
        return np.nan

In [21]:
# Geocode every monument name and store the coordinates as two new columns.
# Each helper issues its own HTTP request, so every monument is looked up
# twice (once per coordinate).
paris["longitude"] = paris["monument"].map(get_longitude)
paris["latitude"] = paris["monument"].map(get_latitude)

In [22]:
# Check the first rows now that longitude and latitude have been added.
paris.head(n=5)

Unnamed: 0,monument,category,longitude,latitude
0,Arc de triomphe du Carrousel,Arcs et arches,2.332895,48.861713
1,Arc de triomphe de l'Étoile,Arcs et arches,2.295028,48.873792
2,Arche de la Défense,Arcs et arches,2.236112,48.892598
3,Porte Saint-Denis,Arcs et arches,2.352695,48.869805
4,Porte Saint-Martin,Arcs et arches,2.356311,48.869065


# Part 3 - Plotting into a map 

In [23]:
import plotly.express as px
# plotly express builds ready-to-plot figures with very little code
# the javascript cell below increases the output size shown in the notebook

In [24]:
%%javascript
// Raise the output-area auto-scroll threshold to 9999 so that large outputs
// (such as the map below) are shown in full rather than in a scrolling box.
// NOTE(review): relies on a global `IPython` object being provided by the
// notebook front-end -- confirm it exists in the front-end in use.
IPython.OutputArea.auto_scroll_threshold = 9999; 




<IPython.core.display.Javascript object>

In [27]:

# Scatter the geocoded monuments on a tile map: one colour per category, the
# monument name shown on hover.
# NOTE(review): the "stamen-toner" base map depends on an external tile
# provider -- confirm the tiles still load with the installed plotly version.
fig = px.scatter_mapbox(
    paris,
    lat="latitude",
    lon="longitude",
    color="category",
    hover_name="monument",
    title="Map of Paris Monuments",
    width=1000,
    height=600,
    zoom=11,
    # explicit centre so the initial view is framed on central Paris
    center=dict(lat=48.8530, lon=2.330),
    mapbox_style="stamen-toner",
)
fig.show()