# Scraper institutes
This notebook is a monthly scraper used to retrieve information about conditions in detention centers in Italy. To do so, it uses the id numbers of the various detention centers to navigate to the dedicated webpages with Playwright, stores the HTML code of each page locally and then parses it using BeautifulSoup. The information is then stored in a pandas dataframe and saved as a CSV file.

In [1]:
import pandas as pd
import requests
import datetime
from bs4 import BeautifulSoup
import asyncio
from playwright.async_api import async_playwright
from time import sleep
from pathlib import Path

In [2]:
# Get current date
# Both labels come from a single timestamp so that the month prefix can never
# disagree with the day (two separate now() calls could straddle midnight on
# the last day of a month).
current_day = datetime.datetime.now().strftime("%Y-%m-%d")  # e.g. "2024-06-21"
current_month = current_day[:7]                             # e.g. "2024-06"

In [3]:
# Collect institutes id numbers
# The list of institutes is read from the clean CSV stamped with today's date;
# its 'id_istituto' column drives the rest of the scraper.
institutes_csv = f'../outputs/clean/institutes_info_{current_day}.csv'
df_institutes = pd.read_csv(institutes_csv)
prison_ids = df_institutes.loc[:, 'id_istituto'].tolist()

In [4]:
# Function to grab the html code of the page
async def get_html(prison_id, current_day):
    """Return the HTML of an institute's detail page, caching it on disk.

    The snapshot is stored as
    ``../outputs/raw/snapshots/{current_day}_{prison_id}.html``. If that file
    already exists it is read back and no request is made; otherwise the page
    is rendered with a headless Firefox (Playwright), saved to that path and
    returned.

    Args:
        prison_id: Institute identifier appended to the detail-page URL.
        current_day: Label used only as the snapshot filename prefix. The
            scraping loop in this notebook passes the ``YYYY-MM`` month label
            here, which yields one snapshot per institute per month.

    Returns:
        The page HTML as a string.

    Raises:
        Whatever Playwright raises when the browser cannot be launched or the
        page cannot be loaded; ``OSError`` if the snapshot cannot be read or
        written.
    """
    dest = Path(f"../outputs/raw/snapshots/{current_day}_{prison_id}.html")

    if dest.exists():  # ... load it from file
        print(f"Already have {dest}, loading!")
        # read_text opens and closes the file itself (no leaked handle).
        return dest.read_text(encoding="utf-8")

    BASE_URL = "https://www.giustizia.it/giustizia/it/dettaglio_scheda.page?s="

    # The context manager stops Playwright even if launching the browser fails,
    # so a launch error is reported as-is instead of being masked by a cleanup
    # step touching a browser that was never created.
    async with async_playwright() as playwright:
        browser = await playwright.firefox.launch()
        try:
            context = await browser.new_context(viewport={'width': 1280, 'height': 800})
            page = await context.new_page()
            await page.goto(f"{BASE_URL}{prison_id}")
            print(f"Fetching {prison_id}")
            page_html = await page.content()
        finally:
            await browser.close()

    # Stores html code in dest (creating the snapshot folder on first use).
    # An explicit encoding keeps snapshots readable regardless of the locale.
    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_text(page_html, encoding="utf-8")

    return page_html

In [5]:
# Function to extract institute name and type
def extract_institute_details(soup):
    """Return ``(institute_name, institute_type)`` read from the page headings.

    The name is the stripped text of the ``<h1 class="titoloIstituto">`` element
    and the type is the stripped text of the ``<h3 class="titoloIstituto">``
    element. An ``AttributeError`` is raised if either heading is missing.
    """
    heading_attrs = {'class': 'titoloIstituto'}
    name_tag = soup.find('h1', heading_attrs)
    name = name_tag.text.strip()
    type_tag = soup.find('h3', heading_attrs)
    kind = type_tag.text.strip()
    return name, kind

# Function to extract capacity table data
def extract_capacity_data(soup):
    """Return ``(posti_regolamentari, posti_non_disponibili, totale_detenuti)``.

    The figures are read as integers from the first three cells of the second
    row of the first ``<table>`` on the page. When that table has at most one
    row, all three values are reported as 0.
    """
    capacity_rows = soup.find_all('table')[0].find_all('tr')

    # Nothing below the first row: no figures to read.
    if len(capacity_rows) <= 1:
        return 0, 0, 0

    tds = capacity_rows[1].find_all('td')
    regolamentari, non_disponibili, detenuti = (
        int(tds[position].text.strip()) for position in range(3)
    )
    return regolamentari, non_disponibili, detenuti

# Function to extract updated date
def extract_updated_date(soup, header_text):
    """Return the text of the ``<span>`` that follows a given ``<h2>`` header.

    Args:
        soup: Parsed page (BeautifulSoup object).
        header_text: Exact string content of the ``<h2>`` to look for.

    Returns:
        The stripped text of the first ``<span>`` sibling after the header, or
        ``'NA'`` when either the header or the span is missing.
    """
    # `string=` is the current name of the keyword formerly called `text=`;
    # the old spelling triggers a DeprecationWarning on every call.
    header = soup.find('h2', string=header_text)
    if header is None:
        return 'NA'

    span = header.find_next_sibling('span')
    return span.text.strip() if span is not None else 'NA'

def extract_personnel_details(soup, header_text):
    """Return the first three sub-field values listed under an ``<h2>`` header.

    The values are the stripped texts of the first three
    ``<span class="valoreSottocampo">`` elements inside the first
    ``<div class="listaContenutiComplessi">`` found after the header.

    Args:
        soup: Parsed page (BeautifulSoup object).
        header_text: Exact string content of the ``<h2>`` to look for.

    Returns:
        A 3-tuple of strings, or ``('NA', 'NA', 'NA')`` when the header, the
        container or any of the three values is missing.
    """
    missing = ('NA', 'NA', 'NA')

    # `string=` replaces the deprecated `text=` keyword.
    header = soup.find('h2', string=header_text)
    if header is None:
        return missing

    container = header.find_next('div', {'class': 'listaContenutiComplessi'})
    if container is None:
        return missing

    values = container.find_all('span', {'class': 'valoreSottocampo'})
    # Explicit length check instead of a bare `except:` so that unrelated
    # errors are no longer silently converted into 'NA'.
    if len(values) < 3:
        return missing

    return values[0].text.strip(), values[1].text.strip(), values[2].text.strip()

# Function to extract staff table data
def extract_staff_data(soup):
    """Return the six staffing figures from the second ``<table>`` on the page.

    The first six ``<td>`` cells of that table are converted to integers and
    returned in this order: penitentiary police (actual, planned),
    administrative staff (actual, planned), educators (actual, planned).
    """
    staff_cells = soup.find_all('table')[1].find_all('td')
    (police_actual, police_planned,
     admin_actual, admin_planned,
     educators_actual, educators_planned) = (
        int(staff_cells[position].text.strip()) for position in range(6)
    )
    return (police_actual, police_planned,
            admin_actual, admin_planned,
            educators_actual, educators_planned)

# Function to extract last update date
def extract_date_of_last_update(soup):
    """Return the "updated on" dates of the police and administrative staff data.

    Each date is the stripped text of the first ``<span>`` sibling following
    the corresponding ``<h2>`` header.

    Returns:
        ``(personale_polizia_aggiornato_al,
        personale_amministrativo_aggiornato_al)``; an element is ``'NA'`` when
        its header or its span is missing.
    """
    def _date_after(header_text):
        # `string=` replaces the deprecated `text=` keyword. Missing elements
        # are checked explicitly instead of relying on a bare `except:`.
        header = soup.find('h2', string=header_text)
        if header is None:
            return 'NA'
        span = header.find_next_sibling('span')
        return span.text.strip() if span is not None else 'NA'

    # Police staff
    personale_polizia_aggiornato_al = _date_after(
        'personale polizia penitenziaria aggiornato al')

    # Administrative staff
    personale_amministrativo_aggiornato_al = _date_after(
        'personale amministrativo aggiornato al')

    return personale_polizia_aggiornato_al, personale_amministrativo_aggiornato_al
    

In [6]:
# Function to extract institute data
def get_prison_data(soup, current_day, prison_id=None):
    """Build the record (one output row) for a single institute page.

    Args:
        soup: Parsed institute page (BeautifulSoup object).
        current_day: Extraction date stored under ``'data_extracted_on'``.
        prison_id: Institute identifier stored under ``'prison_id'``. When
            omitted, the module-level ``prison_id`` variable (the scraping
            loop's variable) is used, which is how this function previously
            obtained the id.

    Returns:
        A dict with one entry per output column.

    Raises:
        NameError: If ``prison_id`` is not given and no module-level
            ``prison_id`` exists.
        Any exception raised by the ``extract_*`` helpers when the page does
        not have the expected structure.
    """
    if prison_id is None:
        # Backward compatibility: callers that pass only (soup, current_day)
        # keep working by falling back to the global they implicitly relied on.
        try:
            prison_id = globals()['prison_id']
        except KeyError:
            raise NameError("name 'prison_id' is not defined") from None

    institute_name, institute_type = extract_institute_details(soup)
    posti_regolamentari, posti_non_disponibili, totale_detenuti = extract_capacity_data(soup)
    # The trailing space in the header text is intentional: the lookup is an
    # exact string match.
    dati_aggiornati_al = extract_updated_date(soup, 'dati aggiornati al ')
    asl, first_name_asl, last_name_asl = extract_personnel_details(soup, 'Responsabile ASL per il carcere')
    first_name, last_name, role = extract_personnel_details(soup, 'Direttore')
    (polizia_penitenziaria_effettivi, polizia_penitenziaria_previsti,
     amministrativi_effettivi, amministrativi_previsti,
     educatori_effettivi, educatori_previsti) = extract_staff_data(soup)
    personale_polizia_aggiornato_al, personale_amministrativo_aggiornato_al = extract_date_of_last_update(soup)

    prison_data = {
        'prison_id': prison_id,
        'institute_name': institute_name,
        'institute_type': institute_type,
        'posti_regolamentari': posti_regolamentari,
        'posti_non_disponibili': posti_non_disponibili,
        'totale_detenuti': totale_detenuti,
        'dati_aggiornati_al': dati_aggiornati_al,
        'asl': asl,
        'nome_responsabile_asl': first_name_asl,
        'cognome_responsabile_asl': last_name_asl,
        'nome_direttore': first_name,
        'cognome_direttore': last_name,
        'ruolo_direttore': role,
        'polizia_penitenziaria_effettivi': polizia_penitenziaria_effettivi,
        'polizia_penitenziaria_previsti': polizia_penitenziaria_previsti,
        'amministrativi_effettivi': amministrativi_effettivi,
        'amministrativi_previsti': amministrativi_previsti,
        'educatori_effettivi': educatori_effettivi,
        'educatori_previsti': educatori_previsti,
        # NOTE: this key lacks the final 'l'; it is kept unchanged because it
        # is the column name written to the output data.
        'personale_polizia_aggiornato_a': personale_polizia_aggiornato_al,
        'personale_amministrativo_aggiornato_al': personale_amministrativo_aggiornato_al,
        'data_extracted_on': current_day,
    }

    return prison_data

In [7]:
data = []

for prison_id in prison_ids:
    success = False
    for attempt in range(5):
        try:
            html_content = await get_html(prison_id, current_month)
            # Parse the html with BeautifulSoup
            soup = BeautifulSoup(html_content, 'html.parser')
            prison_data = get_prison_data(soup, current_day)

            # Append prison_data to data list
            data.append(prison_data)

            success = True
            break  # Break the retry loop if successful

        except Exception as e:
            print(f"Attempt {attempt+1} failed for prison_id {prison_id}. Error: {e}")
            if attempt < 4:  # If not the last attempt, sleep for 10 seconds before retrying
                sleep(10)

    if not success:
        print(f"Failed to fetch data for prison_id {prison_id} after 5 attempts.")

    # Sleep for 3 seconds before making the next request
    sleep(3)

# Convert prison_data_list to a Pandas DataFrame
data_df = pd.DataFrame(data)

Already have ../outputs/raw/snapshots/2024-06_MII179988.html, loading!


  target_span = soup.find('h2', text=header_text)
  target_span = soup.find('h2', text=header_text)
  target_span= soup.find('h2', text='personale polizia penitenziaria aggiornato al')
  target_span= soup.find('h2', text='personale amministrativo aggiornato al')


Already have ../outputs/raw/snapshots/2024-06_MII172610.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII172320.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173712.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173747.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII177436.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178027.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178072.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178659.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179237.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII182910.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181346.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181703.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181924.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179733.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173311.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173324.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII177996.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179342.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179353.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII180415.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181908.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII159053.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII172187.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173101.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173784.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII172580.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII176719.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII180426.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181079.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179290.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII180487.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179913.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII180424.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII159065.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173237.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII174501.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181645.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII176424.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181661.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181672.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179818.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181863.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181847.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181891.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII182440.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173088.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII157783.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173114.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173364.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII159069.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173688.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173356.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII176437.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173764.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181636.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII172814.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII176759.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII176414.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII177134.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII176703.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII177107.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII177323.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII176749.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII176781.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179331.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII176788.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII169535.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII180401.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII180437.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII180380.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII180356.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII180348.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181870.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181880.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179299.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181371.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181820.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181839.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181424.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII158919.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179995.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII158924.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII172277.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII158948.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181678.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII174720.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII172623.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII176743.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII176375.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII177498.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII177463.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178045.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178035.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178124.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178141.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179938.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179280.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII180476.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181377.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181434.html, loading!
Attempt 1 failed for prison_id MII181434. Error: 'NoneType' object has no attribute 'text'


Already have ../outputs/raw/snapshots/2024-06_MII181434.html, loading!
Attempt 2 failed for prison_id MII181434. Error: 'NoneType' object has no attribute 'text'


Already have ../outputs/raw/snapshots/2024-06_MII181434.html, loading!
Attempt 3 failed for prison_id MII181434. Error: 'NoneType' object has no attribute 'text'


Already have ../outputs/raw/snapshots/2024-06_MII181434.html, loading!
Attempt 4 failed for prison_id MII181434. Error: 'NoneType' object has no attribute 'text'


Already have ../outputs/raw/snapshots/2024-06_MII181434.html, loading!
Attempt 5 failed for prison_id MII181434. Error: 'NoneType' object has no attribute 'text'
Failed to fetch data for prison_id MII181434 after 5 attempts.


Already have ../outputs/raw/snapshots/2024-06_MII182919.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181681.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179896.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII158935.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179958.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII158945.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII180458.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181115.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181358.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII172835.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII158910.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII177118.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179224.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII172222.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178003.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179274.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181091.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181385.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178148.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179965.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181109.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178666.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178672.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII180370.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII172592.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII158895.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178501.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII174825.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179781.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179932.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181407.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII180465.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII158944.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII158941.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII172011.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181392.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII177332.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181918.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178085.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178115.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII177420.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179830.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179945.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII177346.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181440.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII159314.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173006.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173259.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173285.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173301.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173813.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173994.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII176451.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII176714.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178681.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179324.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179306.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179856.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII180451.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181401.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181625.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII172332.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179876.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII172208.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII177384.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181417.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173063.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII158901.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173341.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII173265.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178700.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII172344.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181691.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII160204.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181934.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII172827.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII176771.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII159060.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII152284.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII176403.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII182339.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII180001.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII174686.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179842.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII172508.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179981.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII179364.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII178638.html, loading!


Already have ../outputs/raw/snapshots/2024-06_MII181125.html, loading!


In [8]:
# Display the scraped dataframe as the cell output for a quick visual check
data_df

Unnamed: 0,prison_id,institute_name,institute_type,posti_regolamentari,posti_non_disponibili,totale_detenuti,dati_aggiornati_al,asl,nome_responsabile_asl,cognome_responsabile_asl,...,ruolo_direttore,polizia_penitenziaria_effettivi,polizia_penitenziaria_previsti,amministrativi_effettivi,amministrativi_previsti,educatori_effettivi,educatori_previsti,personale_polizia_aggiornato_a,personale_amministrativo_aggiornato_al,data_extracted_on
0,MII179988,Reggio Calabria Arghillà,Casa circondariale,294,0,353,17/06/2024,5 Reggio Calabria,Luciano,Lucania,...,Titolare,133,160,12,32,5,8,31/05/2024,31/05/2024,2024-06-21
1,MII172610,Brescia Verziano,Casa di reclusione,71,0,123,17/06/2024,Spedali Civili di Brescia,Luigi,Leone,...,,77,96,2,0,1,0,31/05/2024,31/05/2024,2024-06-21
2,MII172320,Busto Arsizio,Casa circondariale,240,11,440,17/06/2024,Busto Arsizio,Ezia,Iorio,...,Titolare,180,217,14,21,3,5,31/05/2024,31/05/2024,2024-06-21
3,MII173712,Como,Casa circondariale,226,0,433,17/06/2024,Como,Giuseppe,Carrano,...,Titolare,181,236,15,23,3,6,31/05/2024,31/05/2024,2024-06-21
4,MII173747,Cremona,Casa circondariale,394,10,577,17/06/2024,CREMONA,Rossano,Botto,...,Titolare,169,223,16,23,6,6,31/05/2024,31/05/2024,2024-06-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,MII172508,Bologna,Casa circondariale - Rocco D'Amato,498,7,847,17/06/2024,Bologna,Raffaella,Campalastri,...,Titolare,421,541,27,35,10,10,31/05/2024,31/05/2024,2024-06-21
185,MII179981,Ravenna,Casa circondariale,49,0,79,17/06/2024,Romagna,Maurizio,Serra,...,Titolare,66,83,17,14,5,3,31/05/2024,31/05/2024,2024-06-21
186,MII179364,Parma,Casa di reclusione,655,27,698,17/06/2024,PARMA,Choroma,FAISSAL,...,Titolare,355,462,2,0,0,0,31/05/2024,31/05/2024,2024-06-21
187,MII178638,Modena,Casa circondariale,372,0,537,17/06/2024,MODENA,Stefano,Petrella,...,Titolare,210,257,22,27,6,6,31/05/2024,31/05/2024,2024-06-21


In [9]:
# Append this run's rows to the cumulative dataset, creating it on first run.
old_data_path = Path('../outputs/clean/istituti_data.csv')

if old_data_path.exists():
    old_data = pd.read_csv(old_data_path)
    combined_data = pd.concat([old_data, data_df], ignore_index=True)
else:
    combined_data = data_df

# drop_duplicates() must be called (and its result kept) in both branches:
# referencing the method without parentheses does nothing. Using the returned
# copy also leaves data_df itself untouched.
# NOTE(review): read_csv parses 'NA' and empty strings as NaN by default, so
# previously saved rows containing those values may not compare equal to
# freshly scraped ones — verify that re-runs are de-duplicated as expected.
combined_data = combined_data.drop_duplicates()

combined_data.to_csv(old_data_path, index=False)