## Bulletins Scraper

This notebook uses the previously scraped links to retrieve the data from each monthly bulletin on inmates published by the Italian Ministry of Justice, and stores the raw data in `outputs/raw/bulletine_raw.csv`.

In [51]:
import pandas as pd
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from time import sleep
import random


In [48]:
# Load the table of bulletin links produced by the earlier scraping step
# and preview its first rows.
links_df = pd.read_csv(filepath_or_buffer='../outputs/raw/bulletines_links.csv')
links_df.head(n=5)


Unnamed: 0,ID,Ultimo aggiornamento,Hyperlink
0,SST1418233,2024-08-31,https://www.giustizia.it/giustizia/it/mg_1_14_...
1,SST1415275,2024-07-31,https://www.giustizia.it/giustizia/it/mg_1_14_...
2,SST1410601,2024-06-30,https://www.giustizia.it/giustizia/it/mg_1_14_...
3,SST1407300,2024-05-31,https://www.giustizia.it/giustizia/it/mg_1_14_...
4,SST1402560,2024-04-30,https://www.giustizia.it/giustizia/it/mg_1_14_...


In [52]:
# "Hey, open up a browser"
playwright = await async_playwright().start()
browser = await playwright.firefox.launch()
context = await browser.new_context(viewport={'width': 1280, 'height': 800})
# Create a new browser window
page = await context.new_page()

print("Opening up the browser...")

data = []
headers = [
    'Regione di detenzione',
    'Sigla Provincia',
    'Istituto',
    'Tipo istituto',
    'Capienza Regolamentare',
    'Detenuti presenti - totale',
    'Detenuti presenti - donne',
    'Detenuti presenti - stranieri',
    'Ultimo aggiornamento',
    'ID',]

for index, row in links_df.iterrows():

    url = row['Hyperlink']
    id_value = row['ID']
    date_value = row['Ultimo aggiornamento']

    await page.goto(url)
    print(f"Going to {url}")
    await page.wait_for_timeout(5000)
    sleep(random.randint(1, 5))

    content = await page.content()
    soup = BeautifulSoup(content, "html.parser")
    rows = soup.find_all("tr")


    for row in rows[2:]:  # Skipping the header rows
        cells = row.find_all("td")
        row_data = [cell.get_text(separator=" ").strip() for cell in cells]

        # Append the Date and ID values to the row_data
        row_data.extend([date_value, id_value])
        data.append(row_data)
    print(f"Scraped page id {id_value}")
    print("#######")
    
    sleep(random.randint(1, 5))
        

await browser.close()

# Create a pandas DataFrame
df = pd.DataFrame(data, columns=headers)


Opening up the browser...
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST1418233
Scraped page id SST1418233
#######
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST1415275
Scraped page id SST1415275
#######
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST1410601
Scraped page id SST1410601
#######
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST1407300
Scraped page id SST1407300
#######
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST1402560
Scraped page id SST1402560
#######
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST466343
Scraped page id SST466343
#######
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST462936
Scraped page id SST462936
#######
Going to https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST459023
Scraped page id SST459023
#######
Going to https://www.giustizia.it/gi

In [45]:
# Preview the last rows of the scraped table.
df.tail(n=5)

Unnamed: 0,Regione di detenzione,Sigla Provincia,Istituto,Tipo istituto,Capienza Regolamentare,Detenuti presenti - totale,Detenuti presenti - donne,Detenuti presenti - stranieri,Ultimo aggiornamento,ID
375,VENETO,VE,"VENEZIA ""GIUDECCA""",CRF,112.0,94,94,45.0,2024-07-31,SST1415275
376,VENETO,VE,"VENEZIA ""SANTA MARIA MAGGIORE""",CC,159.0,248,,149.0,2024-07-31,SST1415275
377,VENETO,VI,VICENZA -,CC,276.0,357,,140.0,2024-07-31,SST1415275
378,VENETO,VR,"VERONA ""MONTORIO""",CC,335.0,576,37,345.0,2024-07-31,SST1415275
379,Totale,51.207,61.133,2.682,19.15,2024-07-31,SST1415275,,,


In [53]:
# Number of scraped rows.
df.shape[0]

12926

In [54]:
# Persist the raw scrape as CSV, without the positional index column.
df.to_csv(
    path_or_buf='../outputs/raw/bulletine_raw.csv',
    index=False,
    encoding="UTF-8-sig",
)