# Scraper institutes

This notebook is a monthly scraper used to retrieve information about condition in detention centers in Italy. To do so, it uses the id numbers of the various detention centers to navigate to the dedicated webpages with Selenium, store locally the html code of the page and then parse it using BeautifuSoup. The information is then stored in a pandas dataframe and saved as a csv file.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
import datetime
import re

from pathlib import Path
from random import randint
import time

from seleniumwire import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By

In [2]:
def open_browser():
    """
    Opens a new automated browser window with all tell-tales of automated browser disabled
    """
    options = Options()
    options.add_argument("--start-maximized")

    # Enable headless mode
    # options.add_argument    
    options.add_argument("--headless")

    # remove all signs of this being an automated browser
    options.set_preference("dom.webdriver.enabled", False)
    options.set_preference('useAutomationExtension', False)
    options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0")

    # open the browser with the new options
    driver = webdriver.Firefox(options=options)
    return driver

def extract_prison_data(soup, prison_id, prison_all_data):
    # Grab the name using the class titoloIstituto
    name = soup.find('h1', {'class': 'titoloIstituto'}).text.strip()
    institute_type = soup.find('h3', {'class': 'titoloIstituto'}).text.strip()

    # Find the h2 element with the specific text
    h2 = soup.find('h2', text='dati aggiornati al ')
    
    try:
        # Get the next span sibling
        span = h2.find_next_sibling('span')
        # Extract the text from the span element
        date_last_update = span.text.strip()

        # Grab the table with the occupancy data
        table = soup.find('table')
        rows = table.find_all('tr')
        prison_data = [[prison_id, name, institute_type, date_last_update] + [td.text.strip() for td in row.find_all('td')] for row in rows[1:]]

    except:
        prison_data = [[prison_id, name, institute_type]]

    prison_all_data.extend(prison_data)

    return prison_all_data

def extract_staff_data(soup, prison_id, staff_all_data):
     # Get information about Direttore
    h2 = soup.find('h2', text='Direttore')
    try:
        div = h2.find_next('div', {'class': 'listaContenutiComplessi'})
        spans = div.find_all('span', {'class': 'valoreSottocampo'})
        # Get the first name, last name, and role
        first_name = spans[0].text.strip()
        last_name = spans[1].text.strip()
        role = spans[2].text.strip()
    except:
        first_name = 'NA'
        last_name = 'NA'
        role = 'NA'

    # Get information about Responsabile ASL per il carcere
    h2 = soup.find('h2', text='Responsabile ASL per il carcere')
    try:
        div = h2.find_next('div', {'class': 'listaContenutiComplessi'})
        spans = div.find_all('span', {'class': 'valoreSottocampo'})
        # Get the first name, last name, and role
        asl = spans[0].text.strip()
        first_name_asl = spans[1].text.strip()
        last_name_asl = spans[2].text.strip()
    except:
        asl = 'NA'
        first_name_asl = 'NA'
        last_name_asl = 'NA'


    # Parse table information about the prison staff
    table = h2.find_next('table')
    rows = table.find_all('tr')

    # Define the expected headers
    expected_headers = [
        'polizia penitenziaria - effettivi',
        'polizia penitenziaria - previsti',
        'amministrativi - effettivi',
        'amministrativi - previsti',
        'educatori - effettivi',
        'educatori - previsti'
    ]

    # Extract the headers from the first row of the table
    headers = [th.text.strip() for th in rows[0].find_all('th')]

    # Check if the headers match the expected headers
    if headers == expected_headers:
        # If the headers match, parse the table
        staff_data = [[prison_id, first_name, last_name, role, asl, first_name_asl, last_name_asl] + [td.text.strip() for td in row.find_all('td')] for row in rows[1:]]
    else:
        print("Unexpected table headers")

    # Get the date of the last police staff update
    h2 = soup.find('h2', text='personale polizia penitenziaria aggiornato al')
    try:
        date_element = h2.find_next_sibling()
        date_personale_polizia = date_element.text.strip()
    except:
        date_personale_polizia = 'NA'

    # Get the date of the last staff administrative update
    h2 = soup.find('h2', text='personale amministrativo aggiornato al')
    try:
        date_element = h2.find_next_sibling()
        date_personale_amministrativo = date_element.text.strip()
    except:
        date_personale_amministrativo = 'NA'

    # Add date_personale_polizia and date_personale_amministrativo to each row
    for row in staff_data:
        row.append(date_personale_polizia)
        row.append(date_personale_amministrativo)

    print("################")

    staff_all_data.extend(staff_data)

    return staff_all_data


In [3]:
# Collect institutes id numbers
prisons_df = pd.read_csv('../outputs/clean/istituti_penitenziari.csv')
prison_ids = prisons_df['id_istituto'].tolist()

# Get current month
current_month = datetime.datetime.now().strftime("%Y-%m")

# Initialize lists to store data
prison_all_data = []
staff_all_data = []

for prison_id in prison_ids:

    dest = Path(f"../outputs/raw/istituti/{prison_id}_{current_month}.html")

    if dest.exists() : #... load it from file
        print(f"Already have {dest}, loading!")
        page_html = open(dest).read()

    else:
        MAX_RETRIES = 5

        for i in range(MAX_RETRIES):
            try:
                BASE_URL = "https://www.giustizia.it/giustizia/it/dettaglio_scheda.page?s="
                page_url = BASE_URL + prison_id
                print("Fetching " + page_url)
                
                # Open up the browser
                driver = open_browser()
                print("Opening up the browser")

                # Go to the page, get source code
                driver.get(page_url)
                page_html = driver.page_source

                with open(dest, 'w') as f:
                    f.write(page_html)
                    print(f"Page saved in {dest}. Extracting data.")
                break
            except Exception as e:
                print(f"Error occurred: {e}. Retrying in 10 seconds.")
                sleep(10)

        # Close the browser
        driver.quit()

    # Parse the html with BeautifulSoup
    soup = BeautifulSoup(page_html, 'html.parser')

    prison_all_data = extract_prison_data(soup, prison_id, prison_all_data)
    staff_all_data = extract_staff_data(soup, prison_id, staff_all_data)

Already have ../outputs/raw/istituti/MII179988_2024-03.html, loading!
################
Already have ../outputs/raw/istituti/MII172610_2024-03.html, loading!
################
Already have ../outputs/raw/istituti/MII172827_2024-03.html, loading!
################
Already have ../outputs/raw/istituti/MII172320_2024-03.html, loading!
################
Already have ../outputs/raw/istituti/MII173712_2024-03.html, loading!
################
Already have ../outputs/raw/istituti/MII173747_2024-03.html, loading!


  h2 = soup.find('h2', text='dati aggiornati al ')
  h2 = soup.find('h2', text='Direttore')
  h2 = soup.find('h2', text='Responsabile ASL per il carcere')
  h2 = soup.find('h2', text='personale polizia penitenziaria aggiornato al')
  h2 = soup.find('h2', text='personale amministrativo aggiornato al')


################
Already have ../outputs/raw/istituti/MII177436_2024-03.html, loading!
################
Already have ../outputs/raw/istituti/MII178027_2024-03.html, loading!
################
Already have ../outputs/raw/istituti/MII178072_2024-03.html, loading!
################
Already have ../outputs/raw/istituti/MII178659_2024-03.html, loading!
################
Already have ../outputs/raw/istituti/MII179237_2024-03.html, loading!
################
Already have ../outputs/raw/istituti/MII182910_2024-03.html, loading!
################
Already have ../outputs/raw/istituti/MII181346_2024-03.html, loading!
################
Already have ../outputs/raw/istituti/MII181703_2024-03.html, loading!
################
Already have ../outputs/raw/istituti/MII181924_2024-03.html, loading!
################
Already have ../outputs/raw/istituti/MII179733_2024-03.html, loading!
################
Already have ../outputs/raw/istituti/MII173311_2024-03.html, loading!
################
Already have ../outputs/ra

In [4]:
# convert prison_all_data to df
cols = [
    'id_istituto',
    'nome_istituto',
    'tipo_istituto',
    'dati_aggiornati_al',
    'posti_regolamentari',
    'posti_non_disponibili',
    'totale_detenuti'
    ]

df_prison = pd.DataFrame(prison_all_data, columns=cols)



# Convert staff_all_data to df
cols = [
    'id_istituto',
    'direttore_nome',
    'direttore_cognome',
    'direttore_ruolo',
    'asl',
    'responsabile_asl_nome',
    'responsabile_asl_cognome',
    'polizia_penitenziaria_effettivi',
    'polizia penitenziairia_previsti',
    'amministrativi_effettivi',
    'amministrativi - previsti',
    'educatori_effettivi',
    'educatori_previsti',
    'personale_polizia_penitenziaria_aggiornato_al',
    'personale_amministrativo_aggiornato_al']

df_staff = pd.DataFrame(staff_all_data, columns=cols)

# Merge df_prison and df_staff on prison_id
df = pd.merge(df_prison, df_staff, on='id_istituto')

In [5]:
# Add date of extraction
df['data_extracted_on'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

PATH = '../outputs/clean/istituti_penitenziari_data.csv'

# If PATH doesn't exist, create it and save the data
if not Path(PATH).exists():
    df.to_csv(PATH, index=False, encoding='utf-8-sig')
#If it exists, load the data and append the new data
else:
    old_df = pd.read_csv(PATH)
    new_df = pd.concat([old_df, df], axis=0)
    new_df.to_csv(PATH, index=False, encoding='utf-8-sig')

