## Prisoners: presence of prisoners compared to capacity per region

In [16]:
import calendar
import locale
import re
from random import randint
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup

### 1. Presence v Capacity
Extract monthly data on the number of prisoners held in the Italian penitentiary system, broken down by region. The data is published every month by the Italian Ministry of Justice.

In [17]:
# Scrape every monthly report page listed in urls.csv and collect, for each
# region, the regulatory capacity and the number of prisoners present.
# Result: `df` with columns Region, Capacity, Present, Date.
urls = pd.read_csv('../data/urls.csv')
links = urls.iloc[:, 1].tolist()

# Seconds to wait for the server before giving up on a page.
REQUEST_TIMEOUT = 30

# Italian month names -> month numbers. Hard-coded so that date parsing does
# not depend on an Italian locale being installed on the machine.
ITALIAN_MONTHS = {
    'gennaio': 1, 'febbraio': 2, 'marzo': 3, 'aprile': 4,
    'maggio': 5, 'giugno': 6, 'luglio': 7, 'agosto': 8,
    'settembre': 9, 'ottobre': 10, 'novembre': 11, 'dicembre': 12,
}

# "<day> <month name> <year>" anywhere inside a longer piece of text.
DATE_PATTERN = re.compile(r'(\d{1,2})\s+([^\W\d_]+)\s+(\d{4})')

# The Italian time locale is still requested for anything else that formats
# dates, but it is optional: many systems do not ship 'it_IT'.
try:
    locale.setlocale(locale.LC_TIME, 'it_IT')
except locale.Error:
    print("Locale 'it_IT' is not available; date parsing does not depend on it")


def parse_italian_date(text):
    """Return the first '<day> <Italian month> <year>' date found in `text`.

    Returns a pandas Timestamp, or pd.NaT when no valid date is present.
    """
    for day, month_name, year in DATE_PATTERN.findall(text):
        month = ITALIAN_MONTHS.get(month_name.lower())
        if month is None:
            continue
        try:
            return pd.Timestamp(year=int(year), month=month, day=int(day))
        except ValueError:
            # Impossible calendar date (e.g. 31 febbraio): keep looking.
            continue
    return pd.NaT


def scrape_page(link):
    """Download one report page and return (date, records).

    `records` is a list of [region, capacity, present, date] rows.
    Raises requests.RequestException on network/HTTP failure and ValueError
    when the page contains no table.
    """
    response = requests.get(link, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    date_tag = soup.find('p', class_='date')
    date = parse_italian_date(date_tag.text) if date_tag is not None else pd.NaT

    table = soup.find('table')
    if table is None:
        raise ValueError("no <table> element found")

    records = []
    # The first two rows are skipped as table headers.
    for row in table.find_all('tr')[2:]:
        cols = row.find_all(['th', 'td'])
        if len(cols) < 4:
            # Not a data row (spacer / malformed): nothing to read.
            continue

        regione = cols[0].text.strip()
        if "totale" in regione.lower():
            # Totals rows would double count the per-region values.
            continue

        # Values are kept exactly as printed on the page (text).
        # NOTE(review): the pages appear to use '.' as a thousands separator
        # (e.g. '6.497'); confirm before converting these columns to numbers.
        capienza_regolamentare = cols[2].text.strip()
        detenuti_presenti = cols[3].text.strip()
        records.append([regione, capienza_regolamentare, detenuti_presenti, date])

    return date, records


data = []
skipped_links = []

for link in links:
    try:
        date, records = scrape_page(link)
    except (requests.RequestException, ValueError) as error:
        # One bad page should not throw away everything scraped so far.
        skipped_links.append(link)
        print(f"Skipped {link}: {error}")
    else:
        data.extend(records)
        print(f"Fetched data about {link}, {date}")

    print("########")
    # Random pause between requests to avoid hammering the server.
    sleep(randint(1,3))

if skipped_links:
    print(f"{len(skipped_links)} page(s) could not be scraped: {skipped_links}")

df = pd.DataFrame(data, columns=['Region', 'Capacity', 'Present', 'Date'])

Fetched data about https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST94315, 2018-01-31 00:00:00
########
Fetched data about https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST99008, 2018-02-28 00:00:00
########
Fetched data about https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST107117, 2018-03-31 00:00:00
########
Fetched data about https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST156266, 2018-04-30 00:00:00
########
Fetched data about https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST119037, 2018-05-31 00:00:00
########
Fetched data about https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST125872, 2018-06-30 00:00:00
########
Fetched data about https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST131301, 2018-07-31 00:00:00
########
Fetched data about https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST135462, 2018-08-31 00:00:00
########
Fetched data about https:/

In [19]:
# Eyeball ten randomly drawn rows as a quick sanity check of the scrape.
df.sample(n=10)

Unnamed: 0,Region,Capacity,Present,Date
234,SICILIA,6.497,6.469,2018-12-31
1358,VALLE D'AOSTA,181.0,114.0,2023-08-31
1044,EMILIA ROMAGNA,2.998,3.291,2022-05-31
127,LIGURIA,1.129,1.455,2018-07-31
591,PIEMONTE,3.938,4.176,2020-06-30
405,FRIULI VENEZIA GIULIA,479.0,666.0,2019-09-30
305,FRIULI VENEZIA GIULIA,480.0,641.0,2019-04-30
360,ABRUZZO,1.645,2.077,2019-07-31
524,EMILIA ROMAGNA,3.002,3.261,2020-03-31
205,FRIULI VENEZIA GIULIA,480.0,630.0,2018-11-30


In [1]:
# Persist the scraped table; index=False leaves the row index out of the file.
output_csv = '../outputs/capacityvpresence.csv'
df.to_csv(output_csv, index=False)

NameError: name 'df' is not defined