### Monthly Scraper (Bulletins)

This scraper is run monthly by GitHub Actions to retrieve the latest published data.

In [1]:
from playwright.async_api import async_playwright
import asyncio
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
import re
from datetime import datetime, timedelta

In [2]:
# Italian month names in calendar order (index 0 is January)
italian_months = (
    "gennaio febbraio marzo aprile maggio giugno "
    "luglio agosto settembre ottobre novembre dicembre"
).split()

# Reference point: the moment this cell runs
today = datetime.now()

# Step back to day 1 of the current month, then one more day to land on
# the final day of the previous month
first_day_of_current_month = today - timedelta(days=today.day - 1)
last_day_of_previous_month = first_day_of_current_month - timedelta(days=1)

# Pull out the pieces of the date; the month name is looked up by hand
# (month numbers are 1-based, the list is 0-based)
day, year = last_day_of_previous_month.day, last_day_of_previous_month.year
month = italian_months[last_day_of_previous_month.month - 1]

# Assemble the "<day> <month name> <year>" string
formatted_date = " ".join([str(day), month, str(year)])
formatted_date

'31 marzo 2025'

In [3]:
# Same date as a YYYY-MM-DD string; the scraping cell appends it to every row
date_value = f"{last_day_of_previous_month:%Y-%m-%d}"
date_value

'2025-03-31'

In [4]:
# Page the scraper opens first; the search form on it is used to find the bulletin
url = 'https://www.giustizia.it/giustizia/page/it/statistiche'

# Title of the bulletin to look for, ending with the Italian-formatted date
to_search = (
    'Detenuti italiani e stranieri presenti e capienze per istituto'
    f' - aggiornamento al {formatted_date}'
)

In [5]:
# # "Hey, open up a browser"
playwright = await async_playwright().start()
browser = await playwright.firefox.launch()
context = await browser.new_context(viewport={'width': 1280, 'height': 800})
page = await context.new_page()

# "Hey, open up a browser"
# playwright = await async_playwright().start()
# browser = await playwright.chromium.launch(headless=False)
# page = await browser.new_page()

print("Opening up the browser...")

# Tell it to go to this page
await page.goto(url, timeout=0)

print(f"Going to {url}")

search_input = page.locator('form#searchForm input[aria-label="Cerca"]')

await search_input.fill(to_search)
await page.wait_for_timeout(2000)
await search_input.press('Enter')
print(f"Searching for {to_search}")

# Wait for the results to load
await page.wait_for_selector('ol.resultVivisimo', timeout=0)

# Locate all the search results
search_results = page.locator('ol.resultVivisimo li a')

# Check if any link matches the search text
links = await search_results.all_text_contents()

if to_search in links:

    data = []

    # Click the link if found
    await page.click(f'li:has-text("{to_search}") >> a')
    print(f"Clicked on the link: {to_search}")

    await page.wait_for_selector('table', timeout=0)  # Adjust the selector to match the table or content you expect to appear
    await page.wait_for_load_state('networkidle', timeout=0)
    content = await page.content()

    target_link = page.url
    print(f"Current page URL: {target_link}")
    match = re.search(r'contentId=([^&]+)', target_link)
    id_value = match.group(1) if match else None


    soup = BeautifulSoup(content, "html.parser")
    rows = soup.find_all("tr")


    for row in rows[2:]:  # Skipping the header rows
        cells = row.find_all("td")
        row_data = [cell.get_text(separator=" ").strip() for cell in cells]

        # Append the Date and ID values to the row_data
        row_data.extend([date_value, id_value])
        data.append(row_data)
    print(f"Scraped page")
    print("#######")

    # Creating df
    df = pd.DataFrame(data)

    # Adding data to the csv file
    bulletines_csv = '../outputs/raw/bulletines_raw.csv'
    df.to_csv(bulletines_csv, mode='a', index=False, header=False)
    print('New data added to outputs/raw/bulletines_raw.csv')
    print('Closing the browser')
    await browser.close()


else:
    # Close the browser if the link is not found
    print(f"Link not found for: {to_search}")
    await browser.close()


Opening up the browser...


Going to https://www.giustizia.it/giustizia/page/it/statistiche


Searching for Detenuti italiani e stranieri presenti e capienze per istituto - aggiornamento al 31 marzo 2025
Link not found for: Detenuti italiani e stranieri presenti e capienze per istituto - aggiornamento al 31 marzo 2025
