# Scraper global monthly

This notebook is a monthly scraper that retrieves the monthly bulletins about inmates in detention centers in Italy.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
import datetime
import re

from pathlib import Path
from random import randint
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By

ModuleNotFoundError: No module named 'seleniumwire'

In [6]:
# English -> Italian month names; the bulletin titles on the site are in Italian.
months_eng_ita = {
    "january": "gennaio",
    "february": "febbraio",
    "march": "marzo",
    "april": "aprile",
    "may": "maggio",
    "june": "giugno",
    "july": "luglio",
    "august": "agosto",
    "september": "settembre",
    "october": "ottobre",
    "november": "novembre",
    "december": "dicembre"
}


def last_day_of_previous_month(today):
    """
    Return the last day of the month that precedes `today`.

    `today` can be a `datetime.date` or `datetime.datetime`; the result has the
    same type. Going back one day from the first of the current month always
    lands on the last day of the previous month, including across a year boundary.
    """
    return today.replace(day=1) - datetime.timedelta(days=1)


# Get the current date
current_date = datetime.datetime.now()

# Day, month and year must all come from the same reference date.
reference_date = last_day_of_previous_month(current_date)

day = reference_date.day
# NOTE: strftime("%B") follows the process locale; the lookup below expects English names.
month = reference_date.strftime("%B").lower()
month = months_eng_ita.get(month)

# Use the reference date's year, not today's: when run in January the bulletin
# is dated 31 dicembre of the *previous* year.
year = reference_date.year

target = f'Detenuti italiani e stranieri presenti e capienze per istituto - aggiornamento al {day} {month} {year}'

## Selenium
This section of the notebook uses Selenium to look for any updated data by searching for the target string and the last day of the previous month.

In [26]:
def open_browser():
    """
    Start a headless Firefox session configured to hide signs of automation.

    Returns the Selenium Firefox driver; the caller is responsible for quitting it.
    """
    firefox_options = Options()

    # Command-line switches: request a maximized window and run without a visible UI
    for flag in ("--start-maximized", "--headless"):
        firefox_options.add_argument(flag)

    # Preferences meant to remove the tell-tales of an automated browser,
    # including a fixed desktop Firefox user agent
    stealth_preferences = {
        "dom.webdriver.enabled": False,
        'useAutomationExtension': False,
        "general.useragent.override": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    }
    for pref_name, pref_value in stealth_preferences.items():
        firefox_options.set_preference(pref_name, pref_value)

    # Launch the browser with the options assembled above
    return webdriver.Firefox(options=firefox_options)

In [27]:
# Open up the browser, go to page
driver = open_browser()

# Everything that drives the browser runs inside try/finally so the headless
# Firefox process is always shut down, even when a lookup or click fails.
try:
    url = 'https://www.giustizia.it/giustizia/page/it/statistiche'
    driver.get(url)

    sleep(3)

    # Accept cookies if present. This is best-effort: the banner may not be shown,
    # so any ordinary failure here is ignored (but Ctrl-C is no longer swallowed).
    try:
        cookies = driver.find_element(By.XPATH, "/html/body/div[4]/div/div/div/div[3]/a[1]")
        cookies.click()
    except Exception:
        pass

    sleep(3)

    # Find the Search box
    search_box = driver.find_element(By.NAME, "search")

    # Do the search: the search term is exactly the bulletin title we are looking for
    search_term = target
    search_box.send_keys(search_term)
    search_box.send_keys(Keys.RETURN)

    sleep(3)

    # Find the link to the page we want and click it
    target_link = driver.find_element(By.LINK_TEXT, target)
    target_link.click()

    # URL of the bulletin page we landed on
    current_page_url = driver.current_url
    print(current_page_url)

    # The content id at the end of the URL names the local copy of the page
    link_id_match = re.search(r'Id=(.*)', current_page_url)
    if link_id_match is None:
        raise ValueError(f"Could not find a content id in {current_page_url}")
    link_id = link_id_match.group(1)
    dest = Path("../outputs/raw/bollettini/" + link_id + ".html")

    # The parsing step stores `url` in every row, so it must point at the bulletin
    # page whether the HTML comes from the cache or from the live browser.
    url = current_page_url

    if dest.exists():  # .... load it from file
        print(f"Already have {dest}, loading!")
        with open(dest) as f:
            page_html = f.read()

    else:  # ... fetch it and save it
        page_html = driver.page_source

        # Make sure the output folder exists before writing the first bulletin
        dest.parent.mkdir(parents=True, exist_ok=True)
        with open(dest, 'w') as f:
            f.write(page_html)
        print(f"Page saved in {dest}. Extracting data.")

finally:
    # Close the browser
    driver.quit()

https://www.giustizia.it/giustizia/it/mg_1_14_1.page?contentId=SST462936
Already have ../outputs/raw/bollettini/SST462936.html, loading!


## BeautifulSoup
This section of the notebook uses BeautifulSoup to parse the page html code and extract table information.

In [29]:
# Column layout of the raw CSV: three metadata fields followed by the table cells
columns = [
    'url',
    'dati_aggiornati_al',
    'descrizione',
    'regione_di_detenzione',
    'sigla_provincia',
    'nome_istituto',
    'tipo_istituto',
    'capienza_regolamentare',
    'detenuti_totale',
    'detenuti_donne',
    'detenuti_stranieri'
]

#   ... And parse it with BeautifulSoup
all_data = []

soup = BeautifulSoup(page_html, "html.parser")

# Fix date if wrong: the date in the bulletin title takes precedence over the
# one printed on the page whenever the two differ.
date_el = soup.find("p", class_="date")
date = date_el.text if date_el is not None else None
title = target
date_title = title.split("- ")[1].strip()
date_title = date_title.replace("aggiornamento al ", "").lower()

if date_title != date:
    print(f"Date fixed from {date} to {date_title}")
    date = date_title

# The data is read from the first table on the page
tables = soup.select("table")
if not tables:
    raise ValueError("No <table> element found in the bulletin page")
table = tables[0]
row_els = table.select("tbody tr")

table_data = []

for row in row_els:
    cells = row.select("td")
    if not cells:
        # Rows without <td> cells (e.g. header rows) carry no data
        continue

    row_cells = [url, date, title]
    for i, cell in enumerate(cells):
        if i < 4:
            # First four cells are kept as text
            row_cells.append(cell.text)
        else:
            # Remaining cells are counts: remove dots (presumably thousands separators)
            number = cell.text.replace('.', '')
            row_cells.append(number)

    # Append once per table row (not once per cell), so rows are not duplicated
    table_data.append(row_cells)

all_data.extend(table_data)

# Creating the df
df = pd.DataFrame(table_data, columns=columns)

# Appended without a header: re-running this cell adds the same rows again
df.to_csv('../outputs/raw/data_raw.csv', mode='a', header=False, index=False)