# Scraper institutes
This notebook is a monthly scraper used to retrieve information about conditions in detention centers in Italy. It uses the ID numbers of the detention centers to navigate to each institute's dedicated webpage with Playwright, grabs the HTML code of the page, and then parses it using BeautifulSoup. The information is then stored in a pandas DataFrame and saved as a CSV file.

In [1]:
import pandas as pd
import requests
import datetime
from bs4 import BeautifulSoup
import asyncio
from playwright.async_api import async_playwright
from time import sleep
from pathlib import Path

In [2]:
# Load the institutes table and keep the `id_istituto` column as a plain list;
# these ids are appended to the detail-page URL when scraping.
df_institutes = pd.read_csv('../outputs/clean/institutes_info.csv')
prison_ids = df_institutes['id_istituto'].to_list()

In [3]:
async def get_html(prison_id):
    """Open the detail page of one institute and return its HTML.

    Relies on two module-level names: ``BASE_URL`` (the URL prefix the id is
    appended to) and ``page`` (the page object used for navigation).

    Args:
        prison_id: Institute id appended to ``BASE_URL``.

    Returns:
        Whatever ``page.content()`` yields after navigating to the URL.
    """
    url = f"{BASE_URL}{prison_id}"
    await page.goto(url)
    print("Fetching " + url)
    return await page.content()

def extract_table_data(h2_text, soup=None):
    """Extract the table that follows a given section heading into a DataFrame.

    The heading is the ``<h2 class="h2 campoComplessoTitolo">`` whose string
    equals *h2_text*; the table is the first ``<table>`` found after it.
    The ``<th>`` cells of the first row become the column names and the
    ``<td>`` cells of every following row become one record. All values are
    the stripped cell texts (strings).

    Args:
        h2_text: Exact text of the heading that precedes the table.
        soup: Parsed page to search. When omitted, the module-level ``soup``
            is used, which keeps existing one-argument calls working.

    Returns:
        A DataFrame with the table contents, or an empty DataFrame (after
        printing a message) when the heading or the table is not found.

    Raises:
        NameError: If *soup* is omitted and no module-level ``soup`` exists.
    """
    if soup is None:
        # Backward compatibility: older callers rely on a global `soup`.
        try:
            soup = globals()['soup']
        except KeyError:
            raise NameError("name 'soup' is not defined") from None

    h2_tag = soup.find('h2', class_='h2 campoComplessoTitolo', string=h2_text)
    if h2_tag is None:
        print(f"<h2> tag with text '{h2_text}' not found.")
        return pd.DataFrame()

    table = h2_tag.find_next('table')
    if table is None:
        print("Table not found after the specified <h2> tag.")
        return pd.DataFrame()

    rows = table.find_all('tr')

    # Column names come from the header cells of the first row (if any).
    headers = []
    if rows:
        headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]

    # Every row after the first contributes one record built from its data cells.
    records = [
        [td.get_text(strip=True) for td in row.find_all('td')]
        for row in rows[1:]
    ]

    return pd.DataFrame(records, columns=headers)

def extract_date(soup, date_text):
    """Return the text of the ``<span>`` that follows a given ``<h2>`` heading.

    Args:
        soup: Parsed page to search.
        date_text: Exact string of the ``<h2>`` that labels the date.

    Returns:
        The stripped text of the heading's next sibling ``<span>``, or the
        string ``'NA'`` when either the heading or the span is missing.
    """
    heading = soup.find('h2', string=date_text)
    if heading is None:
        return 'NA'

    value_span = heading.find_next_sibling('span')
    if value_span is None:
        return 'NA'

    # Only the "not found" cases map to 'NA'; unexpected errors (and
    # KeyboardInterrupt) now propagate instead of being silently swallowed.
    return value_span.text.strip()

def extract_info(soup):
    """Read the institute's name and type from the page headings.

    The name is the stripped text of the ``<h1 class="titoloIstituto">`` and
    the type is the stripped text of the ``<h3 class="titoloIstituto">``.

    Returns:
        A ``(institute_name, institute_type)`` tuple of strings.

    Raises:
        AttributeError: If either heading is missing (``find`` returned None).
    """
    name_tag = soup.find('h1', {'class': 'titoloIstituto'})
    institute_name = name_tag.text.strip()

    type_tag = soup.find('h3', {'class': 'titoloIstituto'})
    institute_type = type_tag.text.strip()

    return institute_name, institute_type

In [None]:
# URL prefix of an institute's detail page; the institute id is appended to it.
BASE_URL = "https://www.giustizia.it/giustizia/it/dettaglio_scheda.page?s="

# Collects one DataFrame per successfully scraped institute.
data = list()

# Heading texts handed to extract_date(); each one is also used as the name of
# the column that stores the extracted date. The trailing space in the first
# entry is intentional here and must be preserved (presumably it mirrors the
# heading text on the page - confirm against the site).
updates = [
    'dati aggiornati al ',
    'personale polizia penitenziaria aggiornato al',
    'personale amministrativo aggiornato al',
    'data di aggiornamento spazi detentivi',
]

playwright = await async_playwright().start()
browser = await playwright.firefox.launch()
context = await browser.new_context(viewport={'width': 1280, 'height': 800})
page = await context.new_page()

for prison_id in prison_ids:
    success = False
    for attempt in range(5):
        try:
            print(f"Attempt number {attempt+1} at fetching data for institute id {prison_id}")
            html_content = await get_html(prison_id)
            soup = BeautifulSoup(html_content, 'html.parser')

            capienza_df = extract_table_data('Capienza e presenze')
            spazi_df = extract_table_data('Stanze di detenzione')
            personale_df = extract_table_data('Personale')

            # Combine the DataFrames
            merged_df = pd.concat([
                capienza_df.reset_index(drop=True), 
                personale_df.reset_index(drop=True),
                spazi_df.reset_index(drop=True)
            ], axis=1)

            institute_name, institute_type = extract_info(soup)

            # Now assign the values to new columns in the DataFrame
            merged_df['nome istituto'] = institute_name
            merged_df['tipo istituto'] = institute_type

            for update in updates:
                merged_df[update] = extract_date(soup, update)

            merged_df['id istituto'] = prison_id
            merged_df.columns = merged_df.columns.str.strip()

            data.append(merged_df)

            success = True  # Mark as successful
            print("Success!")
            print()
            break

        except Exception as e:
            print(f"Attempt {attempt + 1} failed for prison id {prison_id}. Error: {e}")
            if attempt < 4:  # Only reinitialize if not on the last attempt
                print("Reinitializing browser...")
                await browser.close()
                await asyncio.sleep(10)
                browser = await playwright.firefox.launch()
                context = await browser.new_context(viewport={'width': 1280, 'height': 800})
                page = await context.new_page()

    if not success:
        print(f"Failed to fetch data for prison id {prison_id} after 5 attempts.")

    await asyncio.sleep(5)  # Use asyncio.sleep for asynchronous sleep

await browser.close()

# Convert the collected per-institute frames to a single DataFrame.
if not data:
    # pd.concat() on an empty list fails with an opaque "No objects to
    # concatenate"; say what actually went wrong instead.
    raise ValueError("No institute data was scraped; there is nothing to assemble.")
final_df = pd.concat(data, ignore_index=True)

# Reorder the DataFrame: identifying columns first, everything else in its
# original order.
id_columns = ['id istituto', 'nome istituto', 'tipo istituto']
new_column_order = id_columns + [col for col in final_df.columns if col not in id_columns]
final_df = final_df[new_column_order]

# Fixing dates: parse the day-first date strings and store them as YYYY-MM-DD.
# Values that cannot be parsed (including the 'NA' placeholder) are coerced to
# missing. The names are stripped because the column names were stripped too.
for update in (u.strip() for u in updates):
    parsed_dates = pd.to_datetime(final_df[update], dayfirst=True, errors='coerce')
    final_df[update] = parsed_dates.dt.strftime('%Y-%m-%d')

In [None]:
# Append this run's data to the cumulative CSV (or create it on the first run).
old_data_path = Path('../outputs/clean/institutes.csv')

if old_data_path.exists():
    old_data = pd.read_csv(old_data_path)
    combined_data = pd.concat([old_data, final_df], ignore_index=True)
    # NOTE(review): old_data has dtypes inferred by read_csv while final_df
    # holds scraped values; if the same value ends up as e.g. 12 in one and
    # '12' in the other, the rows will not be recognised as duplicates - confirm.
    combined_data = combined_data.drop_duplicates()
    combined_data.to_csv(old_data_path, index=False, encoding='UTF-8-sig')
else:
    # drop_duplicates(inplace=True) returns None, so assigning its result would
    # replace final_df with None and make the to_csv call fail.
    final_df = final_df.drop_duplicates()
    final_df.to_csv(old_data_path, index=False, encoding='UTF-8-sig')