In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import requests
from bs4 import BeautifulSoup

In [2]:
def prison_data_getter():
    '''Scrape the federal prison list on Wikipedia and save it to ``prisons.csv``.

    Every titled link found in the first four tables of the list page becomes
    one row (``name`` = the link's ``title`` attribute, ``link`` = absolute
    URL). For each row the MediaWiki API is asked for the page's coordinates;
    when the API returns some, the first entry is stored as ``lat``/``lon``.
    Rows without coordinates keep empty ``lat``/``lon`` cells so they can be
    filled in by hand in the csv afterwards.

    Returns:
        None. The result is written to ``prisons.csv`` in the working directory.

    Raises:
        requests.HTTPError: if Wikipedia answers any request with an error status.
        requests.Timeout: if a request takes longer than the timeout.
    '''

    url = "https://en.wikipedia.org/wiki/List_of_United_States_federal_prisons"
    wiki = 'https://en.wikipedia.org'
    api = 'https://en.wikipedia.org/w/api.php'
    timeout = 30  # seconds; without a timeout a stalled connection hangs the cell forever

    # One session reuses the connection across the many per-prison API calls.
    with requests.Session() as session:
        # Wikimedia's User-Agent policy asks clients to identify themselves;
        # replace this with something that includes your own contact details.
        session.headers['User-Agent'] = 'federal-prisons-notebook/1.0 (python-requests)'

        res = session.get(url, timeout=timeout)
        res.raise_for_status()  # fail loudly instead of parsing an error page
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(res.text, 'html.parser')

        # Only the first four tables on the page are scanned.
        tables = soup.find_all('table')[:4]

        # Anchors lacking a title or href (e.g. footnote markers) are skipped;
        # indexing them directly would raise KeyError.
        prisons = [{'name': a['title'],
                    'link': wiki + a['href']}
                   for table in tables
                   for a in table.find_all('a', title=True, href=True)]
        print(prisons[:5])

        for prison in prisons:
            params = {
                "action": "query",
                "format": "json",
                "prop": "coordinates",
                "titles": prison['name'],
                "formatversion": "2"
            }

            r = session.get(api, params=params, timeout=timeout)
            r.raise_for_status()

            # Be tolerant of responses that carry no page entry at all.
            pages = r.json().get('query', {}).get('pages', [])
            coordinates = pages[0].get('coordinates') if pages else None

            if coordinates:
                prison['lat'] = coordinates[0]['lat']
                prison['lon'] = coordinates[0]['lon']

    # Listing the columns guarantees lat/lon exist in the csv even when no
    # page returned coordinates, so the manual-edit step always has the cells.
    df = pd.DataFrame(prisons, columns=['name', 'link', 'lat', 'lon']).set_index('name')

    df.to_csv('prisons.csv')
    
def prison_geojson_maker():
    '''Turn ``prisons.csv`` into a GeoJSON file with each location saved as a point.

    Reads ``prisons.csv`` from the working directory, indexes it by ``name``,
    builds one point per row from the ``lon``/``lat`` columns, and writes the
    result to ``../data/federal_prisons.geojson``.

    Rows whose ``lat``/``lon`` are still empty are not filtered out here, so
    fill them in by hand in the csv before calling this function.

    Returns:
        None. The GeoJSON file is written as a side effect.
    '''
    df = pd.read_csv('prisons.csv').set_index('name')
    # Use the authority string; the old {'init': 'epsg:4326'} dict form is
    # deprecated and emits a FutureWarning with current pyproj/geopandas.
    gdf = gpd.GeoDataFrame(df,
        crs='EPSG:4326',
        geometry=gpd.points_from_xy(df.lon, df.lat))

    gdf.to_file("../data/federal_prisons.geojson", driver='GeoJSON')
    
    
# To generate the database, uncomment the lines below. After running the first line you
# need to open the csv file and manually add the lat and lon for the 4 or 5 prisons that,
# for whatever reason, don't download properly; they are easy to look up on Google.


# prison_data_getter()
# prison_geojson_maker()


In [8]:
# Load the prison points from the GeoJSON file that prison_geojson_maker() writes.
prisons = gpd.read_file("../data/federal_prisons.geojson")

-76.927