# Scraper institutes
This notebook is a monthly scraper used to retrieve information about condition in detention centers in Italy. To do so, it uses the id numbers of the various detention centers to navigate to the dedicated webpages with Selenium, store locally the html code of the page and then parse it using BeautifuSoup. The information is then stored in a pandas dataframe and saved as a csv file.

In [8]:
import pandas as pd
import requests
import datetime
from bs4 import BeautifulSoup
import asyncio
from playwright.async_api import async_playwright
from time import sleep
from pathlib import Path

In [9]:
# Collect institutes id numbers
df_institutes = pd.read_csv(f'../outputs/clean/institutes_info.csv')
prison_ids = df_institutes['id_istituto'].tolist()

In [10]:
# Function to grab the html code of the page
async def get_html(prison_id):

    # dest = Path(f"../outputs/raw/snapshots/{current_day}_{prison_id}.html")

    # if dest.exists() : #... load it from file
    #     print(f"Already have {dest}, loading!")
    #     page_html = open(dest).read()
    # else:


    # try:
    await page.goto(f"{BASE_URL}{prison_id}")
    print("Fetching " + f"{BASE_URL}{prison_id}")
    page_html = await page.content()
        
        # Stores html code in dest
        # dest.write_text(page_html)
    # finally:
        # await browser.close()
        # await playwright.stop()
    
    return page_html

In [11]:
# Function to extract institute name and type
def extract_institute_details(soup):
    institute_name = soup.find('h1', {'class': 'titoloIstituto'}).text.strip()
    institute_type = soup.find('h3', {'class': 'titoloIstituto'}).text.strip()
    return institute_name, institute_type

# Function to extract cells data
def extract_cells_data(soup):
    table_cells = soup.find_all('table')[2]
    rows = table_cells.find_all('tr')
    if len(rows) > 1:  # Ensure there are rows in the table
        cells = rows[1].find_all('td') 
        numero_complessivo = int(cells[0].text.strip())
        non_disponibili = int(cells[1].text.strip())
        doccia = int(cells[2].text.strip())
        bidet = int(cells[3].text.strip())
        portatori_di_handicap = int(cells[4].text.strip())
        servizi_igienici_con_porta = int(cells[5].text.strip())
    else:
        numero_complessivo = non_disponibili = doccia = bidet = portatori_di_handicap = servizi_igienici_con_porta = 0
    return numero_complessivo, non_disponibili, doccia, bidet, portatori_di_handicap, servizi_igienici_con_porta


# Function to extract last update date
def extract_date_of_last_update(soup):
    # Police staff
    target_span= soup.find('h2', text='data di aggiornamento spazi detentivi')
    try:
        span = target_span.find_next_sibling('span')
        celle_aggiornate_a = span.text.strip()
    except:
        celle_aggiornate_a = 'NA'
    

In [12]:
# Function to extract institute data
def get_prison_data(soup):


    numero_complessivo, non_disponibili, doccia, bidet, portatori_di_handicap, servizi_igienici_con_porta
    institute_name, institute_type = extract_cells_data(soup)
    dati_aggiornati_al = extract_date_of_last_update(soup, 'dati aggiornati al ')

    prison_data = {
            'numero_complessivo': numero_complessivo,
            'non_disponibili': non_disponibili,
            'doccia': doccia,
            'bidet': bidet,
            'portatori_di_handicap': portatori_di_handicap,
            'servizi_igienici_con_porta': servizi_igienici_con_porta,
            'dati_aggiornati_al': dati_aggiornati_al
        }
        
    return prison_data

In [13]:
data = []

BASE_URL = "https://www.giustizia.it/giustizia/it/dettaglio_scheda.page?s="
# "Hey, open up a browser"
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)
# Create a new browser window
page = await browser.new_page()
print("Opening up the browser...")



for prison_id in prison_ids[:2]:
    success = False
    for attempt in range(5):
        try:
            html_content = await get_html(prison_id)
            # Parse the html with BeautifulSoup
            soup = BeautifulSoup(html_content, 'html.parser')
            prison_data = get_prison_data(soup)

            # Append prison_data to data list
            data.append(prison_data)

            success = True
            break  # Break the retry loop if successful

        except Exception as e:
            print(f"Attempt {attempt+1} failed for prison_id {prison_id}. Error: {e}")
            if attempt < 5:  # If not the last attempt, sleep for 10 seconds before retrying
                print("Reinitializing browser...")
                await browser.close()
                sleep(10)
                browser = await playwright.firefox.launch()
                context = await browser.new_context(viewport={'width': 1280, 'height': 800})
                page = await context.new_page()


    if not success:
        print(f"Failed to fetch data for prison_id {prison_id} after 5 attempts.")
    # Sleep for 5 seconds before making the next request
    sleep(5)

await browser.close()

# Convert prison_data_list to a Pandas DataFrame
data_df = pd.DataFrame(data)

Opening up the browser...
Fetching https://www.giustizia.it/giustizia/it/dettaglio_scheda.page?s=MII179988
Attempt 1 failed for prison_id MII179988. Error: name 'numero_complessivo' is not defined
Reinitializing browser...


CancelledError: 

In [None]:
len(data_df)

In [15]:
old_data_path = Path('../outputs/raw/institutes_raw.csv')

if old_data_path.exists():
  old_data = pd.read_csv('../outputs/raw/institutes_raw.csv')
  combined_data = pd.concat([old_data, data_df], ignore_index=True)
  combined_data.drop_duplicates(inplace=True)
else:
  combined_data = data_df
  combined_data.drop_duplicates(inplace=True)

combined_data.to_csv('../outputs/raw/institutes_raw.csv', index=False)

### Institutes - Totals

Some basic cleaning for the dates

In [None]:
combined_data.head()

In [17]:
# Fixing dates
combined_data['posti_aggiornati_al'] = pd.to_datetime(combined_data['posti_aggiornati_al'], dayfirst=True)

combined_data['posti_aggiornati_al'] = combined_data['posti_aggiornati_al'].dt.strftime('%Y-%m-%d')

combined_data['personale_polizia_aggiornato_a'] = pd.to_datetime(combined_data['personale_polizia_aggiornato_a'], format='%d/%m/%Y', errors='coerce').dt.strftime('%Y-%m-%d')
combined_data['personale_amministrativo_aggiornato_al'] = pd.to_datetime(combined_data['personale_amministrativo_aggiornato_al'], format='%d-%m-%Y', errors='coerce').dt.strftime('%Y-%m-%d')

In [None]:
combined_data.tail()

In [19]:
combined_data.to_csv('../outputs/clean/institutes.csv', index=False)