# Scraper global

This notebook is a scraper used to retrieve information about the presence of inmates in Italian detention centers. To do so it uses the monthly bulletines published by the Italian Ministry of Justice. It is worth noticing that these bulletines do not include information regarding space availability, so the overcrowding rate is not calculated.

In [38]:
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
from random import randint
import locale
from pathlib import Path
import re

from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    MoveTargetOutOfBoundsException,
    TimeoutException,
    WebDriverException,
)
from selenium.common.exceptions import NoSuchElementException
from sendgrid import SendGridAPIClient
from sendgrid.helpers.mail import (Mail, Attachment, FileContent, FileName, FileType, Disposition)
from selenium.webdriver.firefox.options import Options
from selenium.webdriver import Firefox

### Getting links to monthly bulletins with Selenium

This first part of the scraper uses Selenium to navigate through the Statistiche page of the Ministry of Justice website and retrieve the links to the monthly bulletins.

In [39]:
def open_browser():
    """
    Opens a new automated browser window with all tell-tales of automated browser disabled
    """
    options = Options()
    options.add_argument("--start-maximized")
    options.add_argument("-headless")  # Enable headless mode

    # Remove all signs of this being an automated browser
    options.set_preference("dom.webdriver.enabled", False)
    options.set_preference("useAutomationExtension", False)
    options.set_preference("marionette.enabled", True)

    # Open the browser with the new options
    driver = Firefox(options=options)
    return driver

In [40]:
# Define string to look for
target = f'detenuti italiani e stranieri presenti e capienze per istituto'

In [41]:
# Open the browser
driver = open_browser()
print("Opening the browser")
sleep(4)

# Go to the search page
url = 'https://www.giustizia.it/giustizia/page/it/statistiche'
driver.get(url)
sleep(3)

# Find the Search box
search_box = driver.find_element(
By.XPATH, 
'/html/body/header/div[2]/div[1]/div/div/div/div/div[2]/form/div/span/input[1]'
)

# Search for the target words
search_box.send_keys(target)
search_term = driver.find_element(
By.XPATH, 
'/html/body/header/div[2]/div[1]/div/div/div/div/div[2]/form/div/button'
)
search_term.click()
print(f"Searching for {target}")
sleep(5)

links = []

while True:

    try:
        # Select results
        search_results = driver.find_element(By.CSS_SELECTOR, 'ol.resultVivisimo')
        elems = search_results.find_elements(
        By.TAG_NAME, 'strong')

        # Look for matches with target string
        for elem in elems:

            link_text = elem.text

            if target in link_text.lower():
                print(f'{target} found!')
                link_element = elem.find_element(By.TAG_NAME, 'a')
                link = link_element.get_attribute('href')
                links.append(link)

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)") 
        next_page = driver.find_element(By.XPATH, '//img[contains(@src, "/giustizia/resources/static/img/pager_formBlock/next.png")]')
        next_page.click()
        sleep(randint(2,8))

    except NoSuchElementException:
        print("Reached the last page.")
        print("Closing the browser")
        driver.quit()
        break

Opening the browser
Searching for detenuti italiani e stranieri presenti e capienze per istituto
detenuti italiani e stranieri presenti e capienze per istituto found!
detenuti italiani e stranieri presenti e capienze per istituto found!
detenuti italiani e stranieri presenti e capienze per istituto found!
detenuti italiani e stranieri presenti e capienze per istituto found!
detenuti italiani e stranieri presenti e capienze per istituto found!
detenuti italiani e stranieri presenti e capienze per istituto found!
detenuti italiani e stranieri presenti e capienze per istituto found!
detenuti italiani e stranieri presenti e capienze per istituto found!
detenuti italiani e stranieri presenti e capienze per istituto found!
detenuti italiani e stranieri presenti e capienze per istituto found!
detenuti italiani e stranieri presenti e capienze per istituto found!
detenuti italiani e stranieri presenti e capienze per istituto found!
detenuti italiani e stranieri presenti e capienze per istituto 

### Scraping tables with requests and BeautifulSoup

This second part of the scraper uses requests and BeautifulSoup to extract raw data from the bulletines.

In [52]:
# To convert date in letters to datetime format
locale.setlocale(locale.LC_TIME, 'it_IT.UTF-8')

ident = ("Marco Dalla Stella (md3934@columbia.edu)")

# Initialize list
all_data = []

# Set up columns to store data
columns = [
    'url',
    'dati_aggiornati_al',
    'descrizione',
    'regione_di_detenzione',
    'sigla_provincia',
    'nome_istituto',
    'tipo_istituto',
    'capienza_regolamentare',
    'detenuti_totale',
    'detenuti_donne',
    'detenuti_stranieri'
]


for link in links:
    link_id = re.search(r'Id=(.*)', link).group(1)

    dest = Path("../outputs/raw/bollettini/" + link_id + ".html")

    if dest.exists(): # .... load it from file
        print(f"Already have {dest}, loading!")
        page_html = open(dest).read()

    else: # ... fetch it
        MAX_RETRIES = 5
        for i in range(MAX_RETRIES):
            try:
                print(f"Fetching {dest}")
                # Open the browser
                driver = open_browser()
                print("Opening the browser")

                # Go to the page
                driver.get(link)
                sleep(5)
                page_html = driver.page_source

                with open(dest, 'w') as f:
                    f.write(page_html)
                    print(f"Page saved in {dest}. Extracting data.")

                driver.quit()
                break

            except Exception as e:
                print(f"Error occurred: {e}. Retrying in 10 seconds.")
                sleep(10)

    # Parse it using BeautifulSoup            
    soup = BeautifulSoup(page_html)

    # Fix date if wrong
    date = soup.find("p", class_="date").text
    title = soup.find_all("h2")[1].text.lower()
    date_title = title.split("- ")[1].strip()
    date_title = date_title.replace("aggiornamento al ", "").lower()
    

    if date_title == date:
        date = date
    else:
        print(f"Date fixed from {date} to {date_title}")
        date = date_title

    tables = soup.select("table")
    table = tables[0]
    row_els = table.select("tr")
    
    table_data = []
    
    for row in row_els:
        row_cells = [link, date, title]
        for i, cell in enumerate(row.select("td")):
            if i < 4:
                row_cells.append(cell.text) # Extract only digits from last four columns
            else:
                number = cell.text.replace('.', '')  # Remove dots
                row_cells.append(number)
                        
            table_data.append(row_cells)

    all_data.extend(table_data)

    print("################")

Already have ../outputs/raw/bollettini/SST319895.html, loading!
################
Already have ../outputs/raw/bollettini/SST314850.html, loading!
################
Already have ../outputs/raw/bollettini/SST308223.html, loading!
################
Already have ../outputs/raw/bollettini/SST301069.html, loading!
################
Already have ../outputs/raw/bollettini/SST301066.html, loading!
################
Already have ../outputs/raw/bollettini/SST289764.html, loading!
################
Already have ../outputs/raw/bollettini/SST285903.html, loading!
################
Already have ../outputs/raw/bollettini/SST281553.html, loading!
################
Already have ../outputs/raw/bollettini/SST276692.html, loading!
################
Already have ../outputs/raw/bollettini/SST267794.html, loading!
################
Already have ../outputs/raw/bollettini/SST245520.html, loading!
################
Already have ../outputs/raw/bollettini/SST250612.html, loading!
################
Already have ../outputs/raw/

In [53]:
df = pd.DataFrame(all_data, columns=columns)


In [54]:
df_temp = df.dropna(subset=['regione_di_detenzione'])
df_temp['anno'] = df_temp['dati_aggiornati_al'].str[-4:]
df_temp['mese'] = df_temp['dati_aggiornati_al'].str.extract(r'([^0-9]+)')
df_temp.groupby('anno')['mese'].nunique()

anno
2016     1
2017     1
2018    12
2019    12
2020    12
2021    12
2022    12
2023    12
2024     2
Name: mese, dtype: int64

We have some missing data from 2017, while I could not find the additional data from 2016.

In [55]:
missing_links = [
    'https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST67921',
    'https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST59610',
    'https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST48940',
    'https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST43408',
    'https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST39702',
    'https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST35918',
    'https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST1333314',
    'https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST1328012',
    'https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST1322605',
    'https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST1315644',
    'https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST1309669'
    ]

# Initialize list
missing_data = []

# Set up columns to store data
columns = [
    'url',
    'dati_aggiornati_al',
    'descrizione',
    'regione_di_detenzione',
    'sigla_provincia',
    'nome_istituto',
    'tipo_istituto',
    'capienza_regolamentare',
    'detenuti_totale',
    'detenuti_donne',
    'detenuti_stranieri'
]


for missing_link in missing_links:
    link_id = re.search(r'Id=(.*)', missing_link).group(1)

    dest = Path("../outputs/raw/bollettini/" + link_id + ".html")

    if dest.exists(): # .... load it from file
        print(f"Already have {dest}, loading!")
        page_html = open(dest).read()

    else: # ... fetch it
        MAX_RETRIES = 5
        for i in range(MAX_RETRIES):
            try:
                print(f"Fetching {dest}")
                # Open the browser
                driver = open_browser()
                print("Opening the browser")

                # Go to the page
                driver.get(missing_link)
                sleep(5)
                page_html = driver.page_source

                with open(dest, 'w') as f:
                    f.write(page_html)
                    print(f"Page saved in {dest}. Extracting data.")

                driver.quit()
                break

            except Exception as e:
                print(f"Error occurred: {e}. Retrying in 10 seconds.")
                sleep(10)

    # Parse it using BeautifulSoup            
    soup = BeautifulSoup(page_html)

    # Fix date if wrong
    date = soup.find("p", class_="date").text
    title = soup.find_all("h2")[1].text.lower()
    date_title = title.split("- ")[1].strip()
    date_title = date_title.replace("aggiornamento al ", "").lower()
    

    if date_title == date:
        date = date
    else:
        print(f"Date fixed from {date} to {date_title}")
        date = date_title

    tables = soup.select("table")
    table = tables[0]
    row_els = table.select("tr")
    
    table_data = []
    
    for row in row_els:
        row_cells = [missing_link, date, title]
        for i, cell in enumerate(row.select("td")):
            if i < 4:
                row_cells.append(cell.text) # Extract only digits from last four columns
            else:
                number = cell.text.replace('.', '')  # Remove dots
                row_cells.append(number)
                        
        table_data.append(row_cells)

    missing_data.extend(table_data)

    print("################")

missing_data_df = pd.DataFrame(missing_data, columns=columns)

Already have ../outputs/raw/bollettini/SST67921.html, loading!
################
Already have ../outputs/raw/bollettini/SST59610.html, loading!
################
Already have ../outputs/raw/bollettini/SST48940.html, loading!
################
Already have ../outputs/raw/bollettini/SST43408.html, loading!
################
Already have ../outputs/raw/bollettini/SST39702.html, loading!
################
Already have ../outputs/raw/bollettini/SST35918.html, loading!
################
Already have ../outputs/raw/bollettini/SST1333314.html, loading!
################
Already have ../outputs/raw/bollettini/SST1328012.html, loading!
################
Already have ../outputs/raw/bollettini/SST1322605.html, loading!
################
Already have ../outputs/raw/bollettini/SST1315644.html, loading!
################
Already have ../outputs/raw/bollettini/SST1309669.html, loading!
################


In [56]:
merged_df = pd.concat([df, missing_data_df], ignore_index=True)

In [57]:
merged_df = merged_df.drop_duplicates()

In [58]:
df_temp = merged_df.dropna(subset=['regione_di_detenzione'])
df_temp['anno'] = df_temp['dati_aggiornati_al'].str[-4:]
df_temp['mese'] = df_temp['dati_aggiornati_al'].str.extract(r'([^0-9]+)')
df_temp.groupby('anno')['mese'].nunique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['anno'] = df_temp['dati_aggiornati_al'].str[-4:]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['mese'] = df_temp['dati_aggiornati_al'].str.extract(r'([^0-9]+)')


anno
2016     1
2017    12
2018    12
2019    12
2020    12
2021    12
2022    12
2023    12
2024     2
Name: mese, dtype: int64

In [60]:
merged_df.to_csv('../outputs/raw/bollettini_mensili_raw.csv', index=False, encoding='utf-8-sig')