In [None]:
import pandas as pd
import os

In [None]:
# Display names for the company codes, keyed by the code written as a string.
# The table is assembled from three groups (5xx, 1xx and 9xx codes) that are
# merged in that order, so iteration order matches the listing below.
# NOTE(review): code "111" is absent from the 1xx group -- presumably
# intentional; confirm against the data source.

# Health Districts (ULSS)
_health_districts = {
    "501": "AZIENDA ULSS N. 1 DOLOMITI",
    "502": "AZIENDA ULSS N. 2 MARCA TREVIGIANA",
    "503": "AZIENDA ULSS N. 3 SERENISSIMA",
    "504": "AZIENDA ULSS N. 4 VENETO ORIENTALE",
    "505": "AZIENDA ULSS N. 5 POLESANA",
    "506": "AZIENDA ULSS N. 6 EUGANEA",
    "507": "AZIENDA ULSS N. 7 PEDEMONTANA",
    "508": "AZIENDA ULSS N. 8 BERICA",
    "509": "AZIENDA ULSS N. 9 SCALIGERA",
}

# Cities/Locations
_cities_locations = {
    "101": "BELLUNO",
    "102": "FELTRE",
    "103": "BASSANO DEL GRAPPA",
    "104": "THIENE",
    "105": "ARZIGNANO",
    "106": "VICENZA",
    "107": "PIEVE DI SOLIGO",
    "108": "ASOLO",
    "109": "TREVISO",
    "110": "S. DONA' DI PIAVE",
    "112": "VENEZIANA",
    "113": "MIRANO",
    "114": "CHIOGGIA",
    "115": "CITTADELLA",
    "116": "PADOVA",
    "117": "ESTE",
    "118": "ROVIGO",
    "119": "ADRIA",
    "120": "VERONA",
    "121": "LEGNAGO",
    "122": "BUSSOLENGO",
}

# University Hospitals
_university_hospitals = {
    "901": "AZIENDA OSPEDALE - UNIVERSITÀ PADOVA",
    "912": "A.O. INTEGRATA CON UNIVERSITÀ - VERONA",
}

aziende_mapping = {
    **_health_districts,
    **_cities_locations,
    **_university_hospitals,
}

# Year codes 1..9 translate to the calendar years 2015..2023,
# i.e. code n stands for year 2014 + n.
anni_mapping = {codice: 2014 + codice for codice in range(1, 10)}


In [None]:
# Input/output locations, relative to the notebook's working directory:
# cleaned CSVs are written to clean_dir, raw CSVs are read from raw_dir.
clean_dir = '../output/clean/'
raw_dir = '../output/raw/'

In [None]:
# Clean every CSV export found in raw_dir and write the result to clean_dir.
#
# For each file: decode the year and company codes, coerce 'valori' to int,
# pivot 'categoria' into columns when that column is present, normalise the
# column names, and save "<stem>_clean.csv". A failure in one file is reported
# and does not stop the remaining files from being processed.

# DataFrame.to_csv does not create missing directories, so make sure the
# target exists before the first write.
os.makedirs(clean_dir, exist_ok=True)

failed_files = []

# sorted() keeps the processing order (and the log) reproducible;
# os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(raw_dir)):
    if not filename.endswith('.csv'):
        continue

    print(f"\nProcessing {filename}")
    # Reset per file so the error handler never shows a frame left over from a
    # previous iteration, nor hits an undefined name when the first read fails.
    df = None
    try:
        # Read the CSV
        df = pd.read_csv(os.path.join(raw_dir, filename))
        print(f"Original columns: {df.columns.tolist()}")

        # First clean-up step
        df['anno'] = df['anno'].map(anni_mapping)

        # Company codes may be parsed as int, str or float (e.g. 501.0 when the
        # column contains blanks). Remove padding and a trailing ".0" so they
        # match the string keys of aziende_mapping.
        codes = (
            df['nome_azienda']
            .astype(str)
            .str.strip()
            .str.replace(r'\.0$', '', regex=True)
        )
        df['nome'] = codes.map(aziende_mapping)
        df['valori'] = df['valori'].fillna(0).astype(int)

        # Rows whose codes have no mapping are dropped by the pivot below, so
        # report them instead of losing them silently.
        unmapped_codes = sorted(codes[df['nome'].isna()].unique())
        if unmapped_codes:
            print(f"Warning: unmapped nome_azienda codes: {unmapped_codes}")
        if df['anno'].isna().any():
            print("Warning: some 'anno' codes are not in anni_mapping")

        # Check if 'categoria' column exists
        if 'categoria' in df.columns:
            print(f"Creating pivot table for {filename}")
            print(f"Shape before pivot: {df.shape}")

            # aggfunc='first' keeps only one value per key; say so when it
            # actually discards something.
            duplicate_rows = int(
                df.duplicated(subset=['anno', 'nome', 'categoria']).sum()
            )
            if duplicate_rows:
                print(
                    f"Warning: {duplicate_rows} duplicate (anno, nome, categoria) "
                    "rows; keeping the first value"
                )

            # fill_value + astype(int) keep the pivoted values integral even
            # when some (anno, nome, categoria) combinations are missing;
            # otherwise the NaN holes turn whole columns into floats.
            df = (
                pd.pivot_table(
                    df,
                    index=['anno', 'nome'],
                    columns='categoria',
                    values='valori',
                    aggfunc='first',
                    fill_value=0,
                )
                .astype(int)
                .reset_index()
            )
            print(f"Shape after pivot: {df.shape}")

        # Clean up columns. Labels are cast to str first because pivoted
        # 'categoria' values are not necessarily strings, and whitespace is
        # stripped BEFORE spaces become underscores so that padded headers do
        # not end up as "_name" / "name_".
        df = df.fillna(0)
        df.columns = (
            df.columns.astype(str)
            .str.replace("'", '', regex=False)
            .str.replace('"', '', regex=False)
            .str.strip()
            .str.lower()
            .str.replace(' ', '_', regex=False)
        )
        df['nome'] = df['nome'].str.title().str.strip()

        print(f"Final columns: {df.columns.tolist()}")

        # Save processed file. splitext only touches the extension, whereas
        # str.replace would rewrite every ".csv" occurrence in the name.
        stem, _extension = os.path.splitext(filename)
        clean_filename = f"{stem}_clean.csv"
        output_filename = os.path.join(clean_dir, clean_filename)

        df.to_csv(output_filename, index=False, encoding='utf-8')
        print(f"Saved processed file to {output_filename}")
        print(f"Final shape: {df.shape}")

    except Exception as e:
        # Deliberately broad: one bad file must not abort the whole batch.
        failed_files.append(filename)
        print(f"Error processing {filename}")
        print(f"Error details: {str(e)}")
        if df is not None:
            print(f"Current DataFrame state:\n{df.head()}")

if failed_files:
    print(f"Files that could not be processed: {failed_files}")
print("Processing complete!")