In [None]:
import pandas as pd
from io import StringIO

def read_and_clean_data(data):
    # Clean data and convert to DataFrame
    df = pd.read_csv(StringIO(data), sep=';', quotechar='"')
    
    # Convert vote columns to numeric
    numeric_cols = ['VOTI_LISTA', 'VOTI_CANDIDATO', 'VOTANTI']
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col].str.replace(',', '.'), errors='coerce')
    
    return df

def transform_to_structured_format(df):
    # Get unique candidates and parties
    candidates = df[['COGNOME', 'NOME']].drop_duplicates()
    parties = df['LISTA'].unique()
    
    # Create base DataFrame with comuni
    result_df = pd.DataFrame()
    result_df['comune'] = df['COMUNE'].unique()
    result_df['provincia'] = df.groupby('COMUNE')['PROVINCIA'].first()
    result_df['regione'] = df.groupby('COMUNE')['REGIONE'].first()
    
    # Group votes by comune
    grouped = df.groupby(['COMUNE', 'LISTA', 'COGNOME', 'NOME']).agg({
        'VOTI_LISTA': 'sum',
        'VOTI_CANDIDATO': 'sum',
        'VOTANTI': 'first'
    }).reset_index()
    
    # Add columns for each party and candidate
    for _, candidate in candidates.iterrows():
        candidate_name = f"{candidate['COGNOME']} {candidate['NOME']}"
        result_df[f'{candidate_name}_total_votes'] = 0
        result_df[f'{candidate_name}_total_perc'] = 0.0
    
    for party in parties:
        result_df[party] = 0
        result_df[f'{party}_perc'] = 0.0
    
    # Fill in values
    for _, row in grouped.iterrows():
        comune_idx = result_df[result_df['comune'] == row['COMUNE']].index[0]
        party = row['LISTA']
        candidate_name = f"{row['COGNOME']} {row['NOME']}"
        
        # Update party votes and percentage
        result_df.loc[comune_idx, party] = row['VOTI_LISTA']
        if row['VOTANTI'] > 0:
            result_df.loc[comune_idx, f'{party}_perc'] = round(row['VOTI_LISTA'] / row['VOTANTI'] * 100, 2)
        
        # Update candidate votes and percentage
        result_df.loc[comune_idx, f'{candidate_name}_total_votes'] = row['VOTI_CANDIDATO']
        if row['VOTANTI'] > 0:
            result_df.loc[comune_idx, f'{candidate_name}_total_perc'] = round(row['VOTI_CANDIDATO'] / row['VOTANTI'] * 100, 2)
    
    return result_df

def main():
    # Read data
    with open('../data/regionali-20191027/regionali-20191027.txt', 'r', encoding='utf-8') as file:
        data = file.read()
    
    df = read_and_clean_data(data)
    result_df = transform_to_structured_format(df)
    
    # Save processed datas
    result_df.to_csv('../output/risultati_um_2019.csv', index=False, encoding='UTF-8')
    print("Data saved as: elezioni_regionali_umbria.csv")

if __name__ == "__main__":
    main()

Data saved as: elezioni_regionali_umbria.csv


## Now mergin with the new data

In [3]:
df = pd.read_csv('../output/risultati_um_2019.csv')
df.columns

Index(['comune', 'provincia', 'regione', 'PAPPALARDO ANTONIO_total_votes',
       'PAPPALARDO ANTONIO_total_perc', 'RICCI CLAUDIO_total_votes',
       'RICCI CLAUDIO_total_perc', 'RUBICONDI ROSSANO_total_votes',
       'RUBICONDI ROSSANO_total_perc', 'CARLETTI MARTINA_total_votes',
       'CARLETTI MARTINA_total_perc', 'TESEI DONATELLA_total_votes',
       'TESEI DONATELLA_total_perc', 'BIANCONI VINCENZO_total_votes',
       'BIANCONI VINCENZO_total_perc', 'CAMUZZI EMILIANO_total_votes',
       'CAMUZZI EMILIANO_total_perc', 'CIRILLO GIUSEPPE_total_votes',
       'CIRILLO GIUSEPPE_total_perc', 'GILET ARANCIONI',
       'GILET ARANCIONI_perc', 'RICCI PRESIDENTE', 'RICCI PRESIDENTE_perc',
       'ITALIA CIVICA', 'ITALIA CIVICA_perc', 'PROPOSTA UMBRIA',
       'PROPOSTA UMBRIA_perc', 'PARTITO COMUNISTA', 'PARTITO COMUNISTA_perc',
       'RICONQUISTARE L'ITALIA', 'RICONQUISTARE L'ITALIA_perc', 'FORZA ITALIA',
       'FORZA ITALIA_perc', 'TESEI PRESIDENTE', 'TESEI PRESIDENTE_perc',
       '

In [4]:
# Read the transformed data
df = pd.read_csv('../output/risultati_um_2019.csv')

# Print original columns to verify we have the right candidate columns
print("Original columns:")
print(df.columns.tolist())

# Select and rename the columns we want
clean_df = df[['comune', 
               'BIANCONI VINCENZO_total_votes', 'BIANCONI VINCENZO_total_perc',
               'TESEI DONATELLA_total_votes', 'TESEI DONATELLA_total_perc']]

# Rename columns to make them cleaner
clean_df = clean_df.rename(columns={
    'BIANCONI VINCENZO_total_votes': 'CSX_votes_2019',
    'BIANCONI VINCENZO_total_perc': 'CSX_perc_2019',
    'TESEI DONATELLA_total_votes': 'CDX_votes_2019',
    'TESEI DONATELLA_total_perc': 'CDX_perc_2019'
})

# Display the first few rows of the cleaned dataset
print("\nCleaned data preview:")
print(clean_df.head())

# Display some basic statistics
print("\nBasic statistics:")
print(clean_df.describe())

clean_df['comune'] = clean_df['comune'].str.strip().str.title()

Original columns:
['comune', 'provincia', 'regione', 'PAPPALARDO ANTONIO_total_votes', 'PAPPALARDO ANTONIO_total_perc', 'RICCI CLAUDIO_total_votes', 'RICCI CLAUDIO_total_perc', 'RUBICONDI ROSSANO_total_votes', 'RUBICONDI ROSSANO_total_perc', 'CARLETTI MARTINA_total_votes', 'CARLETTI MARTINA_total_perc', 'TESEI DONATELLA_total_votes', 'TESEI DONATELLA_total_perc', 'BIANCONI VINCENZO_total_votes', 'BIANCONI VINCENZO_total_perc', 'CAMUZZI EMILIANO_total_votes', 'CAMUZZI EMILIANO_total_perc', 'CIRILLO GIUSEPPE_total_votes', 'CIRILLO GIUSEPPE_total_perc', 'GILET ARANCIONI', 'GILET ARANCIONI_perc', 'RICCI PRESIDENTE', 'RICCI PRESIDENTE_perc', 'ITALIA CIVICA', 'ITALIA CIVICA_perc', 'PROPOSTA UMBRIA', 'PROPOSTA UMBRIA_perc', 'PARTITO COMUNISTA', 'PARTITO COMUNISTA_perc', "RICONQUISTARE L'ITALIA", "RICONQUISTARE L'ITALIA_perc", 'FORZA ITALIA', 'FORZA ITALIA_perc', 'TESEI PRESIDENTE', 'TESEI PRESIDENTE_perc', 'LEGA', 'LEGA_perc', 'UMBRIA CIVICA', 'UMBRIA CIVICA_perc', "FRATELLI D'ITALIA", "FRATE

In [7]:
df2 = pd.read_csv('../output/risultati_um.csv')

# Print original columns to verify we have the right candidate columns
print("Original columns:")
print(df.columns.tolist())

# Select and rename the columns we want
clean_df2 = df2[['comune', 'provincia', 'regione',
               'STEFANIA PROIETTI_total_votes', 'STEFANIA PROIETTI_total_perc',
               'DONATELLA TESEI_total_votes', 'DONATELLA TESEI_total_perc']]

# Rename columns to make them cleaner
clean_df2 = clean_df2.rename(columns={
    'STEFANIA PROIETTI_total_votes': 'CSX_votes_2024',
    'STEFANIA PROIETTI_total_perc': 'CSX_perc_2024',
    'DONATELLA TESEI_total_votes': 'CDX_votes_2024',
    'DONATELLA TESEI_total_perc': 'CDX_perc_2024'
})

# Display the first few rows of the cleaned dataset
print("\nCleaned data preview:")
print(clean_df2.head())

# Display some basic statistics
print("\nBasic statistics:")
print(clean_df2.describe())

Original columns:
['comune', 'provincia', 'regione', 'PAPPALARDO ANTONIO_total_votes', 'PAPPALARDO ANTONIO_total_perc', 'RICCI CLAUDIO_total_votes', 'RICCI CLAUDIO_total_perc', 'RUBICONDI ROSSANO_total_votes', 'RUBICONDI ROSSANO_total_perc', 'CARLETTI MARTINA_total_votes', 'CARLETTI MARTINA_total_perc', 'TESEI DONATELLA_total_votes', 'TESEI DONATELLA_total_perc', 'BIANCONI VINCENZO_total_votes', 'BIANCONI VINCENZO_total_perc', 'CAMUZZI EMILIANO_total_votes', 'CAMUZZI EMILIANO_total_perc', 'CIRILLO GIUSEPPE_total_votes', 'CIRILLO GIUSEPPE_total_perc', 'GILET ARANCIONI', 'GILET ARANCIONI_perc', 'RICCI PRESIDENTE', 'RICCI PRESIDENTE_perc', 'ITALIA CIVICA', 'ITALIA CIVICA_perc', 'PROPOSTA UMBRIA', 'PROPOSTA UMBRIA_perc', 'PARTITO COMUNISTA', 'PARTITO COMUNISTA_perc', "RICONQUISTARE L'ITALIA", "RICONQUISTARE L'ITALIA_perc", 'FORZA ITALIA', 'FORZA ITALIA_perc', 'TESEI PRESIDENTE', 'TESEI PRESIDENTE_perc', 'LEGA', 'LEGA_perc', 'UMBRIA CIVICA', 'UMBRIA CIVICA_perc', "FRATELLI D'ITALIA", "FRATE

In [8]:
merged_df = pd.merge(clean_df, clean_df2, on='comune', how='inner')

In [9]:
merged_df['CSX_perc_2024'] = merged_df['CSX_perc_2024'].str.replace(',','.').astype(float)
merged_df['CDX_perc_2024'] = merged_df['CDX_perc_2024'].str.replace(',','.').astype(float)

merged_df.sample(5)

Unnamed: 0,comune,CSX_votes_2019,CSX_perc_2019,CDX_votes_2019,CDX_perc_2019,provincia,regione,CSX_votes_2024,CSX_perc_2024,CDX_votes_2024,CDX_perc_2024
35,Castiglione Del Lago,3367,46.29,3440,47.3,Terni,Umbria,3393,62.19,1935,35.47
49,Gualdo Tadino,2539,31.55,5066,62.95,Terni,Umbria,2403,39.58,3565,58.72
22,Porano,339,33.9,606,60.6,Terni,Umbria,439,52.7,368,44.18
40,Collazzone,510,28.32,1116,61.97,Terni,Umbria,655,46.49,730,51.81
55,Monte Castello Di Vibio,243,27.87,497,57.0,Terni,Umbria,383,54.79,299,42.78


In [11]:
merged_df['margin_2020'] = merged_df['CSX_perc_2019'] - merged_df['CDX_perc_2019']
merged_df['margin_2024'] = merged_df['CSX_perc_2024'] - merged_df['CDX_perc_2024']
merged_df['margin_shift'] = merged_df['margin_2024'] - merged_df['margin_2020']

In [12]:
# Basic cleaning for Datawrapper integration
merged_df['comune'] = merged_df['comune'].str.replace("Citta'", "Città", regex=False)

### geolocating

In [13]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time

In [14]:
# Create the geocoder
geolocator = Nominatim(user_agent='my_geocoder')
# Create a delay between requests to respect usage limits
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

def get_location(row):
    try:
        # Create a complete address string
        address = f"{row['comune']}, {row['provincia']}, {row['regione']}, Italy"
        location = geocode(address)
        if location:
            return pd.Series([location.latitude, location.longitude])
        else:
            return pd.Series([None, None])
    except Exception as e:
        print(f"Error geocoding {row['comune']}: {str(e)}")
        return pd.Series([None, None])

In [15]:
# Add coordinates
print("Starting geocoding process...")
merged_df[['latitude', 'longitude']] = merged_df.apply(get_location, axis=1)

# Check for any missing coordinates
missing_coords = merged_df[merged_df['latitude'].isna() | merged_df['longitude'].isna()]
if not missing_coords.empty:
    print("\nWarning: Could not find coordinates for these comuni:")
    print(missing_coords[['comune', 'provincia']])

# Display sample of results
print("\nSample of geocoded results:")
print(merged_df[['comune', 'provincia', 'latitude', 'longitude']].head())


Starting geocoding process...

                        comune provincia
28                Bastia Umbra     Terni
29                     Bettona     Terni
30                     Bevagna     Terni
31       Campello Sul Clitunno     Terni
32                     Cannara     Terni
33                      Cascia     Terni
34              Castel Ritaldi     Terni
35        Castiglione Del Lago     Terni
36          Cerreto Di Spoleto     Terni
37                     Citerna     Terni
39           Città Di Castello     Terni
40                  Collazzone     Terni
41                    Corciano     Terni
42                 Costacciaro     Terni
44                     Foligno     Terni
45             Fossato Di Vico     Terni
47           Giano Dell'Umbria     Terni
48             Gualdo Cattaneo     Terni
49               Gualdo Tadino     Terni
50                      Gubbio     Terni
51            Lisciano Niccone     Terni
52                     Magione     Terni
55     Monte Castello Di V

In [17]:
merged_df['margin_2019'] = merged_df['CDX_perc_2019'] - merged_df['CSX_perc_2019']
merged_df['margin_2024'] = merged_df['CDX_perc_2024'] - merged_df['CSX_perc_2024']
merged_df['margin_shift'] = merged_df['margin_2024'] - merged_df['margin_2020']

In [18]:
merged_df.to_csv('../output/viz/margini_elettorali_UM.csv', index=False, encoding='UTF-8')