## Bulletins Scraper

This notebook uses the previously scraped links to retrieve the data from each monthly bulletin on inmates published by the Italian Ministry of Justice, and stores the raw data in `outputs/raw/bulletines_raw.csv`.

In [14]:
import asyncio
import random
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright


In [15]:
# Load the table of bulletin links; the cell displays its first rows.
links_path = '../outputs/clean/bulletines_links.csv'
links_df = pd.read_csv(links_path)
links_df.head()


Unnamed: 0,ID,Ultimo aggiornamento,Hyperlink
0,SST1418233,2024-08-31,https://www.giustizia.it/giustizia/it/mg_1_14_...
1,SST1415275,2024-07-31,https://www.giustizia.it/giustizia/it/mg_1_14_...
2,SST1410601,2024-06-30,https://www.giustizia.it/giustizia/it/mg_1_14_...
3,SST1407300,2024-05-31,https://www.giustizia.it/giustizia/it/mg_1_14_...
4,SST1402560,2024-04-30,https://www.giustizia.it/giustizia/it/mg_1_14_...


In [16]:
# "Hey, open up a browser"
playwright = await async_playwright().start()
browser = await playwright.firefox.launch()
context = await browser.new_context(viewport={'width': 1280, 'height': 800})
# Create a new browser window
page = await context.new_page()

print("Opening up the browser...")

data = []
headers = [
    'Regione di detenzione',
    'Sigla Provincia',
    'Istituto',
    'Tipo istituto',
    'Capienza Regolamentare',
    'Detenuti presenti - totale',
    'Detenuti presenti - donne',
    'Detenuti presenti - stranieri',
    'Ultimo aggiornamento',
    'ID',]

for index, row in links_df.iterrows():

    url = row['Hyperlink']
    id_value = row['ID']
    date_value = row['Ultimo aggiornamento']

    # Attempt to navigate to the URL. Retry if timeout occurs.
    retry = 0
    while retry < 5:
        try:
            print(f"Going to {url}")
            await page.goto(url, timeout=60000)
            await page.wait_for_timeout(5000)
            sleep(random.randint(1, 5))

            # Scrape the content
            content = await page.content()
            soup = BeautifulSoup(content, "html.parser")
            rows = soup.find_all("tr")


            for row in rows[2:]:  # Skipping the header rows
                cells = row.find_all("td")
                row_data = [cell.get_text(separator=" ").strip() for cell in cells]

                # Append the Date and ID values to the row_data
                row_data.extend([date_value, id_value])
                data.append(row_data)
            print(f"Scraped page id {id_value}")
            print("#######")
            sleep(random.randint(1, 5))
            break

        except Exception as e:
            print(f"Timeout while loading {e}, retrying...")

            # Close current browser, wait and reopen
            print("Reinitializing browser...")
            await browser.close()
            sleep(random.randint(1,5))
            browser = await playwright.firefox.launch()
            context = await browser.new_context(viewport={'width': 1280, 'height': 800})
            page = await context.new_page()

            retry += 1

await browser.close()
await playwright.stop()

# Create a pandas DataFrame
df = pd.DataFrame(data, columns=headers)


Opening up the browser...
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST1418233
Scraped page id SST1418233
#######
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST1415275
Scraped page id SST1415275
#######
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST1410601
Scraped page id SST1410601
#######
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST1407300
Scraped page id SST1407300
#######
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST1402560
Scraped page id SST1402560
#######
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST466343
Scraped page id SST466343
#######
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST462936
Scraped page id SST462936
#######
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST459023
Scraped page id SST459023
#######
Going to https://www.giustizia.it/gi

In [None]:
# Show the last five rows of the scraped table.
df.tail(5)

In [None]:
# Number of rows in the scraped table.
df.shape[0]

In [17]:
# Write the raw scrape to disk without the index, as UTF-8 with a BOM
# (the "UTF-8-sig" codec).
raw_output_path = '../outputs/raw/bulletines_raw.csv'
df.to_csv(raw_output_path, encoding="UTF-8-sig", index=False)