In [None]:
import pandas as pd
import os
from io import StringIO

def read_and_clean_data(file_path):
    """
    Read the data file and clean the content
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = file.read()
        
        # Clean the data by removing extra quotes and fixing line endings
        data = data.replace('\\', '\n').replace('\"\"', '\"')
        
        # Read the cleaned data into a DataFrame
        df = pd.read_csv(StringIO(data), sep=';', quotechar='"')
        return df
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as e:
        print(f"Error reading file: {str(e)}")
        return None

def transform_to_structured_format(df_rtf):
    """
    Transform the RTF data into a structured format with parties and candidates
    """
    # Get unique candidates and parties
    candidates = df_rtf[['COGNOME', 'NOME']].drop_duplicates()
    parties = df_rtf['LISTA'].unique()
    
    # Create the new DataFrame
    new_df = pd.DataFrame()
    
    # Add basic columns
    unique_comuni = df_rtf['COMUNE'].unique()
    new_df['comune'] = unique_comuni
    new_df['provincia'] = df_rtf.groupby('COMUNE')['PROVINCIA'].first()
    new_df['regione'] = df_rtf.groupby('COMUNE')['REGIONE'].first()
    new_df['url'] = ''
    
    # Group by comune to get votes data
    grouped = df_rtf.groupby(['COMUNE', 'LISTA', 'COGNOME', 'NOME']).agg({
        'VOTI_LISTA': 'sum',
        'VOTI_CANDIDATO': 'sum',
        'VOTANTI': 'first'
    }).reset_index()
    
    # Add columns for each party and candidate
    for party in parties:
        new_df[party] = 0
        new_df[f'{party}_perc'] = 0.0
        
    for _, candidate in candidates.iterrows():
        candidate_name = f"{candidate['COGNOME']} {candidate['NOME']}"
        new_df[f'{candidate_name}_total_votes'] = 0
        new_df[f'{candidate_name}_total_perc'] = 0.0
    
    # Fill in the values
    for idx, row in grouped.iterrows():
        comune = row['COMUNE']
        party = row['LISTA']
        candidate_name = f"{row['COGNOME']} {row['NOME']}"
        party_votes = row['VOTI_LISTA']
        candidate_votes = row['VOTI_CANDIDATO']
        total_voters = row['VOTANTI']
        
        # Find the index for this comune
        comune_idx = new_df[new_df['comune'] == comune].index[0]
        
        # Update party votes and percentage
        new_df.loc[comune_idx, party] = party_votes
        party_percentage = round((party_votes / total_voters * 100), 2)
        new_df.loc[comune_idx, f'{party}_perc'] = party_percentage
        
        # Update candidate votes and percentage
        new_df.loc[comune_idx, f'{candidate_name}_total_votes'] = candidate_votes
        candidate_percentage = round((candidate_votes / total_voters * 100), 2)
        new_df.loc[comune_idx, f'{candidate_name}_total_perc'] = candidate_percentage
    
    return new_df

def main():
    # File paths
    txt_file_path = '../data/regionali-20200126/regionali-20200126.txt'
    
    # Read the input file
    df_rtf = read_and_clean_data(txt_file_path)
    if df_rtf is None:
        return
    
    # Print initial data for verification
    print("\nOriginal data preview:")
    print(df_rtf.head())
    print("\nUnique parties:", df_rtf['LISTA'].unique())
    print("\nUnique candidates:", df_rtf[['COGNOME', 'NOME']].drop_duplicates().values.tolist())
    
    # Transform the data
    result_df = transform_to_structured_format(df_rtf)
    
    # Save the result
    output_path = '../data/transformed_data.csv'
    try:
        result_df.to_csv(output_path, index=False)
        print(f"\nData successfully transformed and saved to {output_path}")
        print("\nColumns in transformed data:")
        print(result_df.columns.tolist())
        print("\nSample of transformed data:")
        print(result_df.head())
    except Exception as e:
        print(f"Error saving transformed data: {str(e)}")

if __name__ == "__main__":
    main()

## Now mergin with the new data

In [1]:
import pandas as pd

In [2]:
# Read the transformed data
df = pd.read_csv('../data/transformed_data.csv')

# Print original columns to verify we have the right candidate columns
print("Original columns:")
print(df.columns.tolist())

# Select and rename the columns we want
clean_df = df[['comune', 
               'BONACCINI STEFANO_total_votes', 'BONACCINI STEFANO_total_perc',
               'BORGONZONI LUCIA_total_votes', 'BORGONZONI LUCIA_total_perc']]

# Rename columns to make them cleaner
clean_df = clean_df.rename(columns={
    'BONACCINI STEFANO_total_votes': 'CSX_votes_2020',
    'BONACCINI STEFANO_total_perc': 'CSX_perc_2020',
    'BORGONZONI LUCIA_total_votes': 'CDX_votes_2020',
    'BORGONZONI LUCIA_total_perc': 'CDX_perc_2020'
})

# Display the first few rows of the cleaned dataset
print("\nCleaned data preview:")
print(clean_df.head())

# Display some basic statistics
print("\nBasic statistics:")
print(clean_df.describe())

clean_df['comune'] = clean_df['comune'].str.strip().str.title()

Original columns:
['comune', 'provincia', 'regione', 'url', "L'ALTRA EMILIA ROMAGNA", "L'ALTRA EMILIA ROMAGNA_perc", "MOVIMENTO 3V VACCINI VOGLIAMO VERITA'", "MOVIMENTO 3V VACCINI VOGLIAMO VERITA'_perc", 'POTERE AL POPOLO!', 'POTERE AL POPOLO!_perc', 'MOVIMENTO 5 STELLE', 'MOVIMENTO 5 STELLE_perc', 'PARTITO DEMOCRATICO', 'PARTITO DEMOCRATICO_perc', 'VOLT EMILIA-ROMAGNA', 'VOLT EMILIA-ROMAGNA_perc', '+EUROPA - PSI - PRI', '+EUROPA - PSI - PRI_perc', 'BONACCINI PRESIDENTE', 'BONACCINI PRESIDENTE_perc', 'EMILIA-ROMAGNA CORAGGIOSA ECOLOGISTA PROGRESSISTA', 'EMILIA-ROMAGNA CORAGGIOSA ECOLOGISTA PROGRESSISTA_perc', 'EUROPA VERDE', 'EUROPA VERDE_perc', 'PARTITO COMUNISTA', 'PARTITO COMUNISTA_perc', 'LEGA', 'LEGA_perc', 'FORZA ITALIA', 'FORZA ITALIA_perc', 'PROGETTO EMILIA-ROMAGNA RETE CIVICA BORGONZONI PRESIDENTE', 'PROGETTO EMILIA-ROMAGNA RETE CIVICA BORGONZONI PRESIDENTE_perc', "FRATELLI D'ITALIA", "FRATELLI D'ITALIA_perc", 'IL POPOLO DELLA FAMIGLIA - CAMBIAMO!', 'IL POPOLO DELLA FAMIGLIA -

In [3]:
df2 = pd.read_csv('../output/risultati_er.csv')

# Print original columns to verify we have the right candidate columns
print("Original columns:")
print(df.columns.tolist())

# Select and rename the columns we want
clean_df2 = df2[['comune', 'provincia', 'regione',
               'MICHELE DE PASCALE_total_votes', 'MICHELE DE PASCALE_total_perc',
               'ELENA UGOLINI_total_votes', 'ELENA UGOLINI_total_perc']]

# Rename columns to make them cleaner
clean_df2 = clean_df2.rename(columns={
    'MICHELE DE PASCALE_total_votes': 'CSX_votes_2024',
    'MICHELE DE PASCALE_total_perc': 'CSX_perc_2024',
    'ELENA UGOLINI_total_votes': 'CDX_votes_2024',
    'ELENA UGOLINI_total_perc': 'CDX_perc_2024'
})

# Display the first few rows of the cleaned dataset
print("\nCleaned data preview:")
print(clean_df2.head())

# Display some basic statistics
print("\nBasic statistics:")
print(clean_df2.describe())

Original columns:
['comune', 'provincia', 'regione', 'url', "L'ALTRA EMILIA ROMAGNA", "L'ALTRA EMILIA ROMAGNA_perc", "MOVIMENTO 3V VACCINI VOGLIAMO VERITA'", "MOVIMENTO 3V VACCINI VOGLIAMO VERITA'_perc", 'POTERE AL POPOLO!', 'POTERE AL POPOLO!_perc', 'MOVIMENTO 5 STELLE', 'MOVIMENTO 5 STELLE_perc', 'PARTITO DEMOCRATICO', 'PARTITO DEMOCRATICO_perc', 'VOLT EMILIA-ROMAGNA', 'VOLT EMILIA-ROMAGNA_perc', '+EUROPA - PSI - PRI', '+EUROPA - PSI - PRI_perc', 'BONACCINI PRESIDENTE', 'BONACCINI PRESIDENTE_perc', 'EMILIA-ROMAGNA CORAGGIOSA ECOLOGISTA PROGRESSISTA', 'EMILIA-ROMAGNA CORAGGIOSA ECOLOGISTA PROGRESSISTA_perc', 'EUROPA VERDE', 'EUROPA VERDE_perc', 'PARTITO COMUNISTA', 'PARTITO COMUNISTA_perc', 'LEGA', 'LEGA_perc', 'FORZA ITALIA', 'FORZA ITALIA_perc', 'PROGETTO EMILIA-ROMAGNA RETE CIVICA BORGONZONI PRESIDENTE', 'PROGETTO EMILIA-ROMAGNA RETE CIVICA BORGONZONI PRESIDENTE_perc', "FRATELLI D'ITALIA", "FRATELLI D'ITALIA_perc", 'IL POPOLO DELLA FAMIGLIA - CAMBIAMO!', 'IL POPOLO DELLA FAMIGLIA -

In [6]:
clean_df.head(2)

Unnamed: 0,comune,CSX_votes_2020,CSX_perc_2020,CDX_votes_2020,CDX_perc_2020
0,Saludecio,523,36.47,759,52.93
1,San Clemente,1011,39.32,1304,50.72


In [7]:
merged_df = pd.merge(clean_df, clean_df2, on='comune', how='inner')

In [9]:
merged_df['CSX_perc_2024'] = merged_df['CSX_perc_2024'].str.replace(',','.').astype(float)
merged_df['CDX_perc_2024'] = merged_df['CDX_perc_2024'].str.replace(',','.').astype(float)

merged_df.sample(5)

Unnamed: 0,comune,CSX_votes_2020,CSX_perc_2020,CDX_votes_2020,CDX_perc_2020,provincia,regione,CSX_votes_2024,CSX_perc_2024,CDX_votes_2024,CDX_perc_2024
93,Sarsina,755,37.69,1100,54.92,Forli'-Cesena,Emilia-Romagna,551,41.49,753,56.7
287,Lugagnano Val D'Arda,679,29.73,1491,65.28,Piacenza,Emilia-Romagna,497,35.32,890,63.26
294,Podenzano,1573,31.71,3064,61.76,Piacenza,Emilia-Romagna,1199,37.72,1943,61.12
167,Lesignano De' Bagni,1064,37.41,1595,56.08,Parma,Emilia-Romagna,835,47.02,872,49.1
233,Castell'Arquato,831,30.83,1725,64.01,Piacenza,Emilia-Romagna,619,36.37,1037,60.93


In [None]:
# numeric_columns = ['CSX_votes_2020', 'CDX_votes_2020', 
#                   'CSX_votes_2024', 'CDX_votes_2024']

# for col in numeric_columns:
#     merged_df[col] = pd.to_numeric(merged_df[col], errors='coerce')


In [10]:
merged_df['margin_2020'] = merged_df['CSX_perc_2020'] - merged_df['CDX_perc_2020']
merged_df['margin_2024'] = merged_df['CSX_perc_2024'] - merged_df['CDX_perc_2024']
merged_df['margin_shift'] = merged_df['margin_2024'] - merged_df['margin_2020']

In [11]:
# Remove rows where 'comune' is either 'Sassofeltrio' or 'Montecopiolo' as not present in Datawrapper
merged_df = merged_df[~merged_df['comune'].isin(['Sassofeltrio', 'Montecopiolo'])]

# Basic cleaning for Datawrapper integration
merged_df['comune'] = merged_df['comune'].str.replace("Forli'", "Forlì", regex=False)
merged_df['comune'] = merged_df['comune'].str.replace("Montescudo - Monte Colombo", "Montescudo-Monte Colombo", regex=False)

In [12]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time

In [13]:
# Create the geocoder
geolocator = Nominatim(user_agent='my_geocoder')
# Create a delay between requests to respect usage limits
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

def get_location(row):
    try:
        # Create a complete address string
        address = f"{row['comune']}, {row['provincia']}, {row['regione']}, Italy"
        location = geocode(address)
        if location:
            return pd.Series([location.latitude, location.longitude])
        else:
            return pd.Series([None, None])
    except Exception as e:
        print(f"Error geocoding {row['comune']}: {str(e)}")
        return pd.Series([None, None])

In [14]:
# Add coordinates
print("Starting geocoding process...")
merged_df[['latitude', 'longitude']] = merged_df.apply(get_location, axis=1)

# Check for any missing coordinates
missing_coords = merged_df[merged_df['latitude'].isna() | merged_df['longitude'].isna()]
if not missing_coords.empty:
    print("\nWarning: Could not find coordinates for these comuni:")
    print(missing_coords[['comune', 'provincia']])

# Display sample of results
print("\nSample of geocoded results:")
print(merged_df[['comune', 'provincia', 'latitude', 'longitude']].head())


Starting geocoding process...

Sample of geocoded results:
                      comune provincia   latitude  longitude
0                  Saludecio    Rimini  43.872685  12.668507
1               San Clemente    Rimini  43.932205  12.625693
2  San Giovanni In Marignano    Rimini  43.938668  12.712494
3                    San Leo    Rimini  43.896961  12.343644
4         Sant'Agata Feltria    Rimini  43.864302  12.207999


In [15]:
merged_df.sample(3)

Unnamed: 0,comune,CSX_votes_2020,CSX_perc_2020,CDX_votes_2020,CDX_perc_2020,provincia,regione,CSX_votes_2024,CSX_perc_2024,CDX_votes_2024,CDX_perc_2024,margin_2020,margin_2024,margin_shift,latitude,longitude
148,Zocca,968,39.17,1344,54.39,Modena,Emilia-Romagna,719,45.42,825,52.12,-15.22,-6.7,8.52,44.34624,10.993817
125,Novi Di Modena,2815,53.43,2088,39.63,Modena,Emilia-Romagna,2186,63.33,1162,33.66,13.8,29.67,15.87,44.893439,10.901253
10,Castel Bolognese,2493,48.25,2287,44.26,Ravenna,Emilia-Romagna,1956,53.21,1637,44.53,3.99,8.68,4.69,44.320487,11.799258


In [22]:
merged_df['margin_2020'] = merged_df['CDX_perc_2020'] - merged_df['CSX_perc_2020']
merged_df['margin_2024'] = merged_df['CDX_perc_2024'] - merged_df['CSX_perc_2024']
merged_df['margin_shift'] = merged_df['margin_2024'] - merged_df['margin_2020']

In [23]:
merged_df.to_csv('../output/viz/margini_elettorali_ER.csv', index=False, encoding='UTF-8')