# Scraper institutes
This notebook is a monthly scraper used to retrieve information about condition in detention centers in Italy. To do so, it uses the id numbers of the various detention centers to navigate to the dedicated webpages with Selenium, store locally the html code of the page and then parse it using BeautifuSoup. The information is then stored in a pandas dataframe and saved as a csv file.

In [1]:
import pandas as pd
import requests
import datetime
from bs4 import BeautifulSoup
import asyncio
from playwright.async_api import async_playwright
from time import sleep
from pathlib import Path

In [2]:
# Get current date
current_month = datetime.datetime.now().strftime("%Y-%m")
current_day = datetime.datetime.now().strftime("%Y-%m-%d")

In [3]:
# Collect institutes id numbers
df_institutes = pd.read_csv(f'../outputs/clean/istituti_info.csv')
prison_ids = df_institutes['id_istituto'].tolist()

In [4]:
# Function to grab the html code of the page
async def get_html(prison_id, current_day):

    dest = Path(f"../outputs/raw/snapshots/{current_day}_{prison_id}.html")

    if dest.exists() : #... load it from file
        print(f"Already have {dest}, loading!")
        page_html = open(dest).read()
    else:
        BASE_URL = "https://www.giustizia.it/giustizia/it/dettaglio_scheda.page?s="
        playwright = await async_playwright().start()
    
        try:
            browser = await playwright.firefox.launch()
            context = await browser.new_context(viewport={'width': 1280, 'height': 800})
            page = await context.new_page()
            await page.goto(f"{BASE_URL}{prison_id}")
            print("Fetching " + prison_id)
            page_html = await page.content()
            
            # Stores html code in dest
            # dest.write_text(page_html)
        finally:
            await browser.close()
            await playwright.stop()
    
    return page_html

In [5]:
# Function to extract institute name and type
def extract_institute_details(soup):
    institute_name = soup.find('h1', {'class': 'titoloIstituto'}).text.strip()
    institute_type = soup.find('h3', {'class': 'titoloIstituto'}).text.strip()
    return institute_name, institute_type

# Function to extract capacity table data
def extract_capacity_data(soup):
    table_capienze = soup.find_all('table')[0]
    rows = table_capienze.find_all('tr')
    if len(rows) > 1:  # Ensure there are rows in the table
        cells = rows[1].find_all('td') 
        posti_regolamentari = int(cells[0].text.strip())
        posti_non_disponibili = int(cells[1].text.strip())
        totale_detenuti = int(cells[2].text.strip())
    else:
        posti_regolamentari = posti_non_disponibili = totale_detenuti = 0
    return posti_regolamentari, posti_non_disponibili, totale_detenuti

# Function to extract updated date
def extract_updated_date(soup, header_text):
    target_span = soup.find('h2', text=header_text)
    if target_span:
        span = target_span.find_next_sibling('span')
        return span.text.strip() if span else 'NA'
    return 'NA'

def extract_personnel_details(soup, header_text):
    target_span = soup.find('h2', text=header_text)
    if target_span:
        try:
            div = target_span.find_next('div', {'class': 'listaContenutiComplessi'})
            spans = div.find_all('span', {'class': 'valoreSottocampo'})
            return spans[0].text.strip(), spans[1].text.strip(), spans[2].text.strip()
        except:
            return 'NA', 'NA', 'NA'
    return 'NA', 'NA', 'NA'

# Function to extract staff table data
def extract_staff_data(soup):
    table_staff = soup.find_all('table')[1]
    cells = table_staff.find_all('td')
    polizia_penitenziaria_effettivi = int(cells[0].text.strip())
    polizia_penitenziaria_previsti = int(cells[1].text.strip())
    amministrativi_effettivi = int(cells[2].text.strip())
    amministrativi_previsti = int(cells[3].text.strip())
    educatori_effettivi = int(cells[4].text.strip())
    educatori_previsti = int(cells[5].text.strip())
    return (polizia_penitenziaria_effettivi, polizia_penitenziaria_previsti,
            amministrativi_effettivi, amministrativi_previsti,
            educatori_effettivi, educatori_previsti)

# Function to extract last update date
def extract_date_of_last_update(soup):
    # Police staff
    target_span= soup.find('h2', text='personale polizia penitenziaria aggiornato al')
    try:
        span = target_span.find_next_sibling('span')
        personale_polizia_aggiornato_al = span.text.strip()
    except:
        personale_polizia_aggiornato_al = 'NA'

    # Administrative staff
    target_span= soup.find('h2', text='personale amministrativo aggiornato al')
    try:
        span = target_span.find_next_sibling('span')
        personale_amministrativo_aggiornato_al = span.text.strip()
    except:
        personale_amministrativo_aggiornato_al = 'NA'

    return personale_polizia_aggiornato_al, personale_amministrativo_aggiornato_al
    

In [6]:
# Function to extract institute data
def get_prison_data(soup, current_day):

    institute_name, institute_type = extract_institute_details(soup)
    posti_regolamentari, posti_non_disponibili, totale_detenuti = extract_capacity_data(soup)
    dati_aggiornati_al = extract_updated_date(soup, 'dati aggiornati al ')
    asl, first_name_asl, last_name_asl = extract_personnel_details(soup, 'Responsabile ASL per il carcere')
    first_name, last_name, role = extract_personnel_details(soup, 'Direttore')
    (polizia_penitenziaria_effettivi, polizia_penitenziaria_previsti, amministrativi_effettivi, amministrativi_previsti, educatori_effettivi, educatori_previsti) = extract_staff_data(soup)
    personale_polizia_aggiornato_al, personale_amministrativo_aggiornato_al = extract_date_of_last_update(soup)

    prison_data = {
            'id': prison_id,
            'nome': institute_name,
            'tipo': institute_type,
            'posti_regolamentari': posti_regolamentari,
            'posti_non_disponibili': posti_non_disponibili,
            'posti_occupati': totale_detenuti,
            'posti_aggiornati_al': dati_aggiornati_al,
            'asl': asl,
            'nome_responsabile_asl': first_name_asl,
            'cognome_responsabile_asl': last_name_asl,
            'nome_direttore': first_name,
            'cognome_direttore': last_name,
            'ruolo_direttore': role,
            'personale_polizia_effettivi': polizia_penitenziaria_effettivi,
            'personale_polizia_previsti': polizia_penitenziaria_previsti,
            'personale_amministrativi_effettivi': amministrativi_effettivi,
            'personale_amministrativi_previsti': amministrativi_previsti,
            'personale_educatori_effettivi': educatori_effettivi,
            'personale_educatori_previsti': educatori_previsti,
            'personale_polizia_aggiornato_a': personale_polizia_aggiornato_al,
            'personale_amministrativo_aggiornato_al': personale_amministrativo_aggiornato_al,
        }
        
    return prison_data

In [7]:
data = []

for prison_id in prison_ids:
    success = False
    for attempt in range(5):
        try:
            html_content = await get_html(prison_id, current_month)
            # Parse the html with BeautifulSoup
            soup = BeautifulSoup(html_content, 'html.parser')
            prison_data = get_prison_data(soup, current_day)

            # Append prison_data to data list
            data.append(prison_data)

            success = True
            break  # Break the retry loop if successful

        except Exception as e:
            print(f"Attempt {attempt+1} failed for prison_id {prison_id}. Error: {e}")
            if attempt < 4:  # If not the last attempt, sleep for 10 seconds before retrying
                sleep(10)

    if not success:
        print(f"Failed to fetch data for prison_id {prison_id} after 5 attempts.")

    # Sleep for 3 seconds before making the next request
    sleep(3)

# Convert prison_data_list to a Pandas DataFrame
data_df = pd.DataFrame(data)

Fetching MII179988


  target_span = soup.find('h2', text=header_text)
  target_span = soup.find('h2', text=header_text)
  target_span= soup.find('h2', text='personale polizia penitenziaria aggiornato al')
  target_span= soup.find('h2', text='personale amministrativo aggiornato al')


Fetching MII172610


Fetching MII172320


Fetching MII173712


Fetching MII173747


Fetching MII177436


Fetching MII178027


Fetching MII178072


Fetching MII178659


Fetching MII179237


Fetching MII182910


Fetching MII181346


Fetching MII181703


Fetching MII181924


Fetching MII179733


Fetching MII173311


Fetching MII173324


Fetching MII177996


Fetching MII179342


Fetching MII179353


Fetching MII180415


Fetching MII181908


Fetching MII159053


Fetching MII172187


Fetching MII173101


Fetching MII173784


Fetching MII172580


Fetching MII176719


Fetching MII180426


Fetching MII181079


Fetching MII179290


Fetching MII180487


Fetching MII179913


Fetching MII180424


Fetching MII159065


Fetching MII173237


Fetching MII174501


Fetching MII181645


Fetching MII176424


Fetching MII181661


Fetching MII181672


Fetching MII179818


Fetching MII181863


Fetching MII181847


Fetching MII181891


Fetching MII182440


Fetching MII173088


Fetching MII157783


Fetching MII173114


Fetching MII173364


Fetching MII159069


Fetching MII173688


Fetching MII173356


Fetching MII176437


Fetching MII173764


Fetching MII181636


Fetching MII172814


Fetching MII176759


Fetching MII176414


Fetching MII176703


Fetching MII177107


Fetching MII177323


Fetching MII176749


Fetching MII176781


Fetching MII176788


Attempt 1 failed for prison_id MII176788. Error: 'NoneType' object has no attribute 'text'


Fetching MII176788


Fetching MII169535


Fetching MII180401


Fetching MII180437


Fetching MII180380


Fetching MII180356


Fetching MII180348


Fetching MII181870


Fetching MII181880


Fetching MII179299


Fetching MII181371


Fetching MII181820


Fetching MII181839


Fetching MII181424


Fetching MII158919


Fetching MII179995


Fetching MII158924


Fetching MII172277


Fetching MII158948


Fetching MII181678


Fetching MII174720


Fetching MII172623


Fetching MII176743


Fetching MII176375


Fetching MII177498


Fetching MII177463


Fetching MII178045


Fetching MII178035


Fetching MII178124


Fetching MII178141


Fetching MII179938


Fetching MII179280


Fetching MII180476


Fetching MII181377


Fetching MII181434


Fetching MII182919


Fetching MII181681


Fetching MII179896


Fetching MII158935


Fetching MII179958


Fetching MII158945


Fetching MII180458


Fetching MII181115


Fetching MII181358


Fetching MII172835


Fetching MII158910


Fetching MII177118


Fetching MII179224


Fetching MII172222


Fetching MII178003


Fetching MII179274


Fetching MII181091


Fetching MII181385


Fetching MII179965


Fetching MII181109


Fetching MII178666


Fetching MII178672


Fetching MII180370


Fetching MII172592


Fetching MII158895


Fetching MII178501


Fetching MII174825


Fetching MII179781


Fetching MII179932


Fetching MII181407


Fetching MII180465


Fetching MII158944


Fetching MII158941


Fetching MII172011


Fetching MII181392


Fetching MII177332


Fetching MII181918


Fetching MII178085


Fetching MII178115


Fetching MII177420


Fetching MII179830


Attempt 1 failed for prison_id MII179830. Error: 'NoneType' object has no attribute 'text'


Fetching MII179830


Fetching MII179945


Attempt 1 failed for prison_id MII179945. Error: 'NoneType' object has no attribute 'text'


Fetching MII179945


Fetching MII177346


Fetching MII181440


Fetching MII159314


Fetching MII173006


Fetching MII173259


Fetching MII173285


Fetching MII173301


Fetching MII173813


Fetching MII173994


Fetching MII176451


Fetching MII176714


Fetching MII178681


Fetching MII179324


Fetching MII179306


Fetching MII179856


Fetching MII180451


Fetching MII181401


Fetching MII181625


Fetching MII172332


Fetching MII179876


Fetching MII172208


Fetching MII177384


Fetching MII181417


Fetching MII173063


Fetching MII158901


Fetching MII173341


Fetching MII173265


Fetching MII172344


Fetching MII181691


Fetching MII160204


Fetching MII181934


Fetching MII172827


Fetching MII176771


Fetching MII159060


Fetching MII177134


Fetching MII178700


Fetching MII178148


Fetching MII152284


Fetching MII176403


Fetching MII182339


Fetching MII180001


Fetching MII174686


Fetching MII179842


Fetching MII172508


Fetching MII179981


Fetching MII179364


Fetching MII178638


Fetching MII181125


In [8]:
data_df

Unnamed: 0,id,nome,tipo,posti_regolamentari,posti_non_disponibili,posti_occupati,posti_aggiornati_al,asl,nome_responsabile_asl,cognome_responsabile_asl,...,cognome_direttore,ruolo_direttore,personale_polizia_effettivi,personale_polizia_previsti,personale_amministrativi_effettivi,personale_amministrativi_previsti,personale_educatori_effettivi,personale_educatori_previsti,personale_polizia_aggiornato_a,personale_amministrativo_aggiornato_al
0,MII179988,Reggio Calabria Arghillà,Casa circondariale,294,1,340,10/08/2024,5 Reggio Calabria,Luciano,Lucania,...,Stendardo,Titolare,133,160,12,32,5,8,31/05/2024,31/05/2024
1,MII172610,Brescia Verziano,Casa di reclusione,71,0,122,10/08/2024,Spedali Civili di Brescia,Luigi,Leone,...,,,77,96,2,0,1,0,31/05/2024,31/05/2024
2,MII172320,Busto Arsizio,Casa circondariale,240,10,436,10/08/2024,Busto Arsizio,Ezia,Iorio,...,Pitaniello,Titolare,180,217,14,21,3,5,31/05/2024,31/05/2024
3,MII173712,Como,Casa circondariale,226,0,438,10/08/2024,Como,Giuseppe,Carrano,...,Rinaldi,Titolare,181,236,15,23,3,6,31/05/2024,31/05/2024
4,MII173747,Cremona,Casa circondariale,394,10,551,10/08/2024,CREMONA,Rossano,Botto,...,Padula,Titolare,169,223,16,23,6,6,31/05/2024,31/05/2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,MII172508,Bologna,Casa circondariale - Rocco D'Amato,498,6,826,10/08/2024,Bologna,Raffaella,Campalastri,...,Casella,Titolare,421,541,27,35,10,10,31/05/2024,31/05/2024
185,MII179981,Ravenna,Casa circondariale,49,0,79,10/08/2024,Romagna,Maurizio,Serra,...,Di Lena,Titolare,66,83,17,14,5,3,31/05/2024,31/05/2024
186,MII179364,Parma,Casa di reclusione,655,25,703,10/08/2024,PARMA,Choroma,FAISSAL,...,,,355,462,2,0,0,0,31/05/2024,31/05/2024
187,MII178638,Modena,Casa circondariale,372,0,537,10/08/2024,MODENA,Stefano,Petrella,...,Sorrentini,Titolare,210,257,22,27,6,6,31/05/2024,31/05/2024


In [9]:
old_data_path = Path('../outputs/clean/istituti_data.csv')

if old_data_path.exists():
  old_data = pd.read_csv('../outputs/clean/istituti_data.csv')
  combined_data = pd.concat([old_data, data_df], ignore_index=True)
  combined_data.drop_duplicates(inplace=True)
else:
  combined_data = data_df
  combined_data.drop_duplicates

combined_data.to_csv('../outputs/clean/istituti_data.csv', index=False)