In [1]:
from playwright.async_api import async_playwright
import asyncio
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
import re

In [3]:
# Entry page for the scrape: the "statistiche" section of giustizia.it, whose search form is used below
url = 'https://www.giustizia.it/giustizia/page/it/statistiche'

In [4]:
# "Hey, open up a browser"
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)

# Create a new browser window
page = await browser.new_page()
print("Opening up the browser...")

# Tell it to go to this page
await page.goto(url)
print(f"Going to {url}")

await page.wait_for_timeout(2000)

to_search= 'Detenuti italiani e stranieri presenti e capienze per istituto'

search_input = page.locator('form#searchForm input[aria-label="Cerca"]')

await search_input.fill(to_search)
await page.wait_for_timeout(2000)
await search_input.press('Enter')
print(f"Searching for {to_search}")

# Wait for the results to load
await page.wait_for_selector('ol.resultVivisimo', timeout=5000)

# Data storage
data = []
n = 1


while True:
    content = await page.content()
    await page.wait_for_timeout(5000)
    soup = BeautifulSoup(content, 'html.parser')
    links = soup.find_all('a', href=True)

    # Filter and extract the relevant links
    filtered_links = [
        link for link in links 
        if "contentId" in link['href'] and "Detenuti italiani e stranieri presenti e capienze per istituto" in link.get_text()
    ]

    for link in filtered_links:
            href = link['href']
            text = link.get_text(strip=True)
            content_id = re.search(r"contentId=(\w+)", href).group(1)
            last_update = text.split("aggiornamento al")[-1].strip()
            data.append([content_id, last_update, href])
    print(f'got link from page {n}')
    print(f"Total number of links: {len(data)}")
    print("##################")
    n = n+1

    next_button = await page.query_selector('img[alt="Vai alla pagina successiva"]')  # Select the image by alt text
    await page.wait_for_timeout(5000)

    # Adding a sleep to give the page some time
    await page.wait_for_timeout(5000)  # Wait for 1 second before checking for the button

    if next_button:
        print('next button found')
        # Check if the button is visible and can be clicked
        is_visible = await next_button.is_visible()
        is_enabled = await next_button.is_enabled()  # Check if the button is enabled

        if is_visible and is_enabled:
            print(f"Clicking on page {n}")
            # Click the "Next" button
            await next_button.click()
            await page.wait_for_selector('ol.resultVivisimo', timeout=5000)  # Additional wait time for the new page to load
        else:
            print('button not visible or enabled')
            break  # If the button is not visible or not enabled, break the loop
    else:
        print('button not found')
        break  # If no next button, break the loop

# Finally close the browser after everything is done
await browser.close()

Opening up the browser...
Going to https://www.giustizia.it/giustizia/page/it/statistiche
Searching for Detenuti italiani e stranieri presenti e capienze per istituto
got link from page 1
Total number of links: 15
##################
next button found
Clicking on page 2
got link from page 2
Total number of links: 30
##################
next button found
Clicking on page 3
got link from page 3
Total number of links: 45
##################
next button found
Clicking on page 4
got link from page 4
Total number of links: 60
##################
next button found
Clicking on page 5
got link from page 5
Total number of links: 69
##################
button not found


In [5]:
# Turn the scraped rows into a table and parse the Italian dates
column_names = ["ID", "Ultimo aggiornamento", "Hyperlink"]
df = pd.DataFrame(data, columns=column_names)

# Italian -> English month names, so that the '%B' directive can parse them.
# Each month is registered both capitalised and lower-case.
month_pairs = [
    ("Gennaio", "January"),
    ("Febbraio", "February"),
    ("Marzo", "March"),
    ("Aprile", "April"),
    ("Maggio", "May"),
    ("Giugno", "June"),
    ("Luglio", "July"),
    ("Agosto", "August"),
    ("Settembre", "September"),
    ("Ottobre", "October"),
    ("Novembre", "November"),
    ("Dicembre", "December"),
]
month_mapping = {}
for italian_name, english_name in month_pairs:
    month_mapping[italian_name] = english_name
    month_mapping[italian_name.lower()] = english_name

# Apply every substitution on a working Series, then parse once
update_dates = df['Ultimo aggiornamento']
for italian, english in month_mapping.items():
    update_dates = update_dates.str.replace(italian, english, regex=True)

df['Ultimo aggiornamento'] = pd.to_datetime(update_dates, format='%d %B %Y')

# Most recent bulletin first
df = df.sort_values(by='Ultimo aggiornamento', ascending=False)

df.head()

Unnamed: 0,ID,Ultimo aggiornamento,Hyperlink
0,SST1418233,2024-08-31,https://www.giustizia.it/giustizia/it/mg_1_14_...
1,SST1415275,2024-07-31,https://www.giustizia.it/giustizia/it/mg_1_14_...
2,SST1410601,2024-06-30,https://www.giustizia.it/giustizia/it/mg_1_14_...
3,SST1407300,2024-05-31,https://www.giustizia.it/giustizia/it/mg_1_14_...
4,SST1402560,2024-04-30,https://www.giustizia.it/giustizia/it/mg_1_14_...


In [6]:
# Number of collected bulletins per calendar year, in chronological order
update_years = df['Ultimo aggiornamento'].dt.year
update_years.value_counts().sort_index()

2019    12
2020    12
2021    13
2022    12
2023    12
2024     8
Name: Ultimo aggiornamento, dtype: int64

In [7]:
# Persist the table of scraped links as CSV; index=False leaves the row index out of the file
df.to_csv('../outputs/raw/bulletines_links.csv', index=False, encoding='UTF-8')

In [8]:
# Remove one duplicate (SST365607 = SST360932)
# The ID is named once and reused in the filter so the two cannot drift apart.
duplicate_id = 'SST365607'
df = df[df['ID'] != duplicate_id]

Note: October 2021 is completely missing!

In [9]:
# Persist the table under a second file name; in notebook order this runs after the duplicate-ID filter
df.to_csv('../outputs/raw/monthly_bulletines.csv', index=False, encoding='UTF-8')