# Scraper institutes

This notebook is a scraper used to retrieve general information about detention centers in Italy (such as name, code, address, and geographic coordinates). To do so, it uses the information published on the webpage https://www.giustizia.it/giustizia/page/it/istituti_penitenziari. Because the information is loaded via JavaScript, we first use Playwright to render the page and save its HTML, and then we use BeautifulSoup and RegEx to parse the information.

In [1]:
import datetime
import html
import re
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

In [None]:
# "Hey, open up a browser"
# playwright = await async_playwright().start()
# browser = await playwright.chromium.launch(headless=False)
playwright = await async_playwright().start()
browser = await playwright.firefox.launch()
context = await browser.new_context(viewport={'width': 1280, 'height': 800})

# Create a new browser window
page = await context.new_page()

# Tell it to go to this page
print("Going to page...")
await page.goto("https://www.giustizia.it/giustizia/page/it/istituti_penitenziari", timeout=60000)

html_content = await page.content()
print("Got html content of the page, closing the browser and parsing it with BeautifulSoup")
await browser.close()

# Parse the saved page with BeautifulSoup and RegEx.
# The parser is named explicitly so the result does not depend on which
# optional parsers (lxml, html5lib) happen to be installed.
doc = BeautifulSoup(html_content, "html.parser")

# Each institute is described by a run of `marker.<field> = ...;` assignments
# that starts at `marker.codiceIstituto` and ends at the following `push`.
marks = re.findall(r"marker\.codiceIstituto.*?push", str(doc), re.DOTALL)

# Output column -> pattern whose first group captures the value.
# The dict order is the column order of the resulting DataFrame.
# NOTE: "longitude" is spelled in English while the other keys are Italian;
# it is kept as-is so the output column name does not change.
field_patterns = {
    "id_istituto": re.compile(r"marker\.mii\s*=\s*\'(.*?)\'"),
    "codice_istituto": re.compile(r"marker\.codiceIstituto\s*=\s*\"(.*?)\";"),
    "nome_istituto": re.compile(r"marker\.title\s*=\s*\"(.*?)\";"),
    "tipo_istituto": re.compile(r"marker\.tipo\s*=\s*\"(.*?)\";"),
    "latitudine": re.compile(r"marker\.lat\s*=\s*(.*?);"),
    "longitude": re.compile(r"marker\.lon\s*=\s*(.*?);"),
    "id_provveditorato": re.compile(r"marker\.provv\s*=\s*{id:\s*(\d+),"),
    "nome_provveditorato": re.compile(r"marker\.provv\s*=\s*{id:\s*\d+,\s*name:\s*\"(.*?)\"}"),
    "indirizzo": re.compile(r"marker\.indirizzo\s*=\s*\"(.*?)\";"),
    "telefono": re.compile(r"marker\.telefono\s*=\s*\"(.*?)\";"),
}

institutes_info = []
n = 0        # number of markers successfully scraped so far
skipped = 0  # number of markers dropped because a field was missing

for mark in marks:
    matches = {column: pattern.search(mark) for column, pattern in field_patterns.items()}
    missing = [column for column, match in matches.items() if match is None]
    if missing:
        # A marker lacking any field is still skipped, but no longer silently.
        skipped += 1
        print(f"Skipping marker, could not find: {', '.join(missing)}")
        continue

    institute_info = {column: match.group(1) for column, match in matches.items()}
    # Titles are HTML-escaped in the page source.
    institute_info["nome_istituto"] = html.unescape(institute_info["nome_istituto"])

    print(institute_info["codice_istituto"])
    print(institute_info["nome_istituto"])

    institutes_info.append(institute_info)
    print(f"mark scraped number {n}")
    n += 1
    print("####")

if not marks:
    print("Warning: no institute markers found in the page source")
elif skipped:
    print(f"Warning: skipped {skipped} of {len(marks)} markers with missing fields")

print("Job finished, saving data to df")
# Explicit columns keep the expected schema even when nothing was scraped,
# so the cleaning cells below do not fail with a KeyError.
df = pd.DataFrame(institutes_info, columns=list(field_patterns))

In [None]:
# Show the first rows of the scraped DataFrame.
df.head()

## Some (very basic) cleaning

In [None]:
# Count the rows per distinct `tipo_istituto` value, to inspect the raw labels.
df['tipo_istituto'].value_counts()

In [6]:
# Clean the institute type labels: trim surrounding whitespace, remove a
# trailing hyphen, then trim again to drop any whitespace the hyphen left behind.
df['tipo_istituto'] = (
    df['tipo_istituto']
    .str.strip()
    .str.replace("-$", "", regex=True)
    .str.strip()
)

In [None]:
# Count the rows per distinct `tipo_istituto` value again to check the labels.
df['tipo_istituto'].value_counts()

In [8]:
# Write the institutes table to CSV, without the DataFrame index.
output_path = Path('../outputs/clean/institutes_info.csv')
# Create the target folder first so the export cannot fail on a missing directory.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)