# Motivation

Our research uses the VoteWatch dataset of European Parliament roll-call voting records, covering the 6th to 9th parliamentary terms (EP6–EP9, 2004–2022). The dataset captures how each Member of the European Parliament (MEP) voted on proposed legislation, along with detailed information about the legislation itself, the MEPs, and their party affiliations.

We selected this dataset because of its rich potential for political analysis. Our aim is to examine whether recent trends toward political polarization and right-leaning policies are reflected in voting patterns within the European Parliament.

The primary objective of our study is to test the hypothesis that political polarization has increased in contemporary European politics, using parliamentary voting behavior as evidence.

In [1]:
import pandas as pd

def clean_name(first_name, last_name):
    """Return a normalized "First Last" name built from two name parts.

    Each part is lower-cased, decomposed with NFKD, has separator
    punctuation replaced by spaces, is reduced to ASCII (characters with no
    ASCII form are dropped), and is then re-capitalized word by word.

    Args:
        first_name: Given name. ``None`` and float NaN are treated as empty;
            any other non-string value is converted with ``str``.
        last_name: Family name, handled the same way as ``first_name``.

    Returns:
        The cleaned full name, or an empty string when both parts are empty.
    """
    import unicodedata

    # Punctuation treated as a word separator. The typographic apostrophes
    # are listed explicitly because the ASCII fold below would otherwise
    # delete them, gluing "O\u2019Brien" into "Obrien" while "O'Brien"
    # becomes "O Brien".
    separators = str.maketrans(
        {char: ' ' for char in "-'`.,&\u2018\u2019\u02bc"}
    )

    def normalize_part(value):
        # Missing spreadsheet cells arrive as None or float NaN (NaN != NaN);
        # str(nan) would otherwise leak into the output as "Nan".
        if value is None or (isinstance(value, float) and value != value):
            return ''
        text = value if isinstance(value, str) else str(value)
        decomposed = unicodedata.normalize('NFKD', text.lower())
        separated = decomposed.translate(separators)
        ascii_text = separated.encode('ASCII', 'ignore').decode('ASCII')
        # split() also collapses runs of whitespace and trims the ends.
        return ' '.join(word.capitalize() for word in ascii_text.split())

    return f"{normalize_part(first_name)} {normalize_part(last_name)}".strip()

def clean_text(text):
    """Normalize a free-text label for comparison.

    Non-string input is returned unchanged. Strings are lower-cased, the
    characters ``&``, ``,`` and ``-`` become spaces, the word " and " is
    removed, runs of spaces are collapsed and surrounding whitespace is
    trimmed.
    """
    if isinstance(text, str):
        lowered = text.lower()
        spaced = lowered.translate(str.maketrans('&,-', '   '))
        without_and = spaced.replace(' and ', ' ')
        # Dropping the empty pieces of a split on ' ' collapses space runs.
        pieces = [piece for piece in without_and.split(' ') if piece]
        return ' '.join(pieces).strip()
    return text

def process_ep_voting_data(rcv_files, voted_docs_files):
    """Build one combined vote table from paired RCV / voted-docs workbooks.

    The two lists are walked in lockstep. The parliamentary session is taken
    from the first of the tags ``EP6``..``EP9`` found in the RCV file path;
    the tag decides how the workbook is read and at which column position
    the vote columns start.

    Args:
        rcv_files: Paths of the roll-call vote workbooks (one row per MEP).
        voted_docs_files: Paths of the matching voted-documents workbooks,
            in the same order as ``rcv_files``.

    Returns:
        The concatenated output of ``process_votes_ep`` for every pair, with
        an ``ep_session`` column added, passed through
        ``clean_combined_data``.

    Raises:
        ValueError: If the lists differ in length, or if an RCV path
            contains no known session tag (its column layout would be
            unknown).
    """
    if len(rcv_files) != len(voted_docs_files):
        raise ValueError("The lists of RCV files and Voted docs files must have the same length")

    # Session tag -> (position of the first vote column, read_excel options).
    session_layouts = {
        "EP6": (10, {"header": 1}),
        "EP7": (9, {"sheet_name": 0}),
        "EP8": (9, {"sheet_name": 0}),
        "EP9": (10, {"sheet_name": 0}),
    }

    all_data = []

    for i, (rcv_file, voted_doc_file) in enumerate(zip(rcv_files, voted_docs_files)):
        print(f"Processing files {i+1}/{len(rcv_files)}: {rcv_file} and {voted_doc_file}")

        ep_session = next((tag for tag in session_layouts if tag in rcv_file), None)
        if ep_session is None:
            # Without a known layout the vote-column offset is undefined;
            # guessing it would silently misalign MEP metadata and votes.
            raise ValueError(
                f"Cannot determine the EP session of {rcv_file!r}; "
                f"expected one of {', '.join(session_layouts)} in the file name"
            )
        vote_start_index, read_options = session_layouts[ep_session]

        rcv_data = pd.read_excel(rcv_file, **read_options).dropna(how='all')
        voted_docs = pd.read_excel(voted_doc_file)

        # Every column from the session's offset onwards is one vote.
        vote_columns = rcv_data.columns[vote_start_index:].tolist()

        votes_df = process_votes_ep(rcv_data, voted_docs, vote_columns, ep_session=ep_session)

        print(f"Should be total length: {len(rcv_data) * len(voted_docs)}")
        print(f"Got length: {len(votes_df)}")

        votes_df['ep_session'] = ep_session
        all_data.append(votes_df)

    combined_df = pd.concat(all_data, ignore_index=True)

    return clean_combined_data(combined_df)


def _resolve_vote_column(columns, label):
    """Return the label under which a vote column is stored in ``columns``.

    Lookup order is the string form, the label itself, then the integer
    form. When none is present the label is returned unchanged, so the later
    row access raises the usual KeyError.
    """
    key = str(label)
    if key in columns:
        return key
    if label in columns:
        return label
    try:
        as_int = int(key)
    except ValueError:
        return label
    return as_int if as_int in columns else label


def process_votes_ep(rcv_data, voted_docs, vote_columns, ep_session=None):
    """Flatten one session's roll-call table into one row per MEP per vote.

    Handles EP6 as well as EP7/EP8/EP9; the session only selects which
    column names are read from the two input frames.

    Args:
        rcv_data: One row per MEP with the columns ``Country``, ``Party``,
            ``EPG``, ``Fname``, ``Lname``, the session's MEP id column, and
            one column per vote.
        voted_docs: One row per vote with id, date, title, policy area and
            author columns (names depend on ``ep_session``).
        vote_columns: Labels of the vote columns in ``rcv_data``. A label is
            matched to ``voted_docs`` through its string form.
        ep_session: ``'EP6'``, ``'EP7'``, ``'EP8'`` or ``'EP9'``. Any other
            value uses the default (non-EP6) column names.

    Returns:
        A DataFrame with the columns ``full name``, ``country``,
        ``national_party``, ``epg``, ``mep_id``, ``vote_code``, ``vote``,
        ``date``, ``title`` and ``policy_area``. Cells whose vote value
        equals 0 are omitted. The columns are present even when no row is
        produced.
    """
    result_columns = [
        'full name', 'country', 'national_party', 'epg', 'mep_id',
        'vote_code', 'vote', 'date', 'title', 'policy_area',
    ]

    # NOTE(review): 'WebisteEpID' is spelled as in the original code;
    # presumably it mirrors the workbook header - verify before "fixing" it.
    mep_id_key = 'WebisteEpID'

    if ep_session == 'EP6':
        date = 'date'
        title = 'title'
        policy_area = 'main_policy_name'
        vote_id_key = 'euro_act_id'
        author = 'author_name'
    else:
        date = 'Date'
        title = 'Title'
        policy_area = 'De'
        vote_id_key = 'Vote ID'
        author = 'Author'

        if ep_session == 'EP7':
            mep_id_key = 'MEP ID'

        if ep_session == 'EP8':
            policy_area = "De/Policy area"
        elif ep_session == 'EP9':
            policy_area = 'Policy area'

    # Vote id (as string) -> metadata. The policy area is cleaned once here
    # instead of once per MEP-vote cell.
    vote_info = {}
    for _, doc_row in voted_docs.iterrows():
        vote_info[str(doc_row[vote_id_key])] = {
            'date': doc_row[date],
            'title': doc_row[title],
            'policy_area': clean_text(doc_row[policy_area]),
            # .get keeps a missing author column from raising; the value is
            # informational and not part of the returned frame.
            'author': doc_row.get(author),
        }

    # Resolve each vote column once, not once per MEP row.
    available_columns = rcv_data.columns
    matched = []
    for label in vote_columns:
        vote_key = str(label)
        if vote_key in vote_info:
            matched.append((
                _resolve_vote_column(available_columns, label),
                f'{ep_session}-{vote_key}',
                vote_info[vote_key],
            ))

    # Same figure the per-cell counter produced: every MEP row skips the
    # same set of unmatched columns.
    total_skipped = (len(vote_columns) - len(matched)) * len(rcv_data)

    results = []
    for _, mep_row in rcv_data.iterrows():
        # Per-MEP values are identical for all of that MEP's votes.
        mep_fields = {
            'full name': clean_name(mep_row['Fname'], mep_row['Lname']),
            'country': mep_row['Country'],
            'national_party': mep_row['Party'],
            'epg': mep_row['EPG'],
            'mep_id': mep_row[mep_id_key],
        }

        for column, vote_code, info in matched:
            mep_vote = mep_row[column]

            # Cells equal to 0 are left out of the long table.
            if mep_vote == 0:
                continue

            results.append({
                **mep_fields,
                'vote_code': vote_code,
                'vote': mep_vote,
                'date': info['date'],
                'title': info['title'],
                'policy_area': info['policy_area'],
            })

    print(f"Were not able to match: {total_skipped} votes")
    return pd.DataFrame(results, columns=result_columns)

def clean_combined_data(df):
    """Add derived date and policy-area columns to ``df`` in place.

    ``date`` is parsed to datetimes (unparseable values become NaT), ``year``
    and ``month`` are taken from the parsed date, and ``policy_area_cleaned``
    is the stripped, lower-cased ``policy_area``. The same frame is returned.
    """
    parsed_dates = pd.to_datetime(df['date'], errors='coerce')
    df['date'] = parsed_dates
    df['year'] = parsed_dates.dt.year
    df['month'] = parsed_dates.dt.month

    trimmed_area = df['policy_area'].str.strip()
    df['policy_area_cleaned'] = trimmed_area.str.lower()

    return df


In [2]:

# Voted-documents workbooks, one per parliamentary term, in the same order
# as the RCV workbooks below.
voted_docs_files = [
    "VoteWatch-EP-voting-data_2004-2022/EP6_Voted docs.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP7_Voted docs.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP8_Voted docs.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP9_Voted docs.xlsx",
]
# Roll-call vote workbooks (one row per MEP, one column per vote).
rcv_files = [
    "VoteWatch-EP-voting-data_2004-2022/EP6_RCVs_2022_06_13.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP7_RCVs_2014_06_19.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP8_RCVs_2019_06_25.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP9_RCVs_2022_06_22.xlsx",
]

combined_df = process_ep_voting_data(rcv_files, voted_docs_files)

# Persist the combined table so later cells can reload it from disk.
output_file = "ep_voting_data_combined.csv"
combined_df.to_csv(output_file, index=False)


Processing files 1/4: VoteWatch-EP-voting-data_2004-2022/EP6_RCVs_2022_06_13.xlsx and VoteWatch-EP-voting-data_2004-2022/EP6_Voted docs.xlsx
Were not able to match: 0 votes
Should be total length: 5827060
Got length: 4759840
Processing files 2/4: VoteWatch-EP-voting-data_2004-2022/EP7_RCVs_2014_06_19.xlsx and VoteWatch-EP-voting-data_2004-2022/EP7_Voted docs.xlsx
Were not able to match: 0 votes
Should be total length: 5937733
Got length: 5233859
Processing files 3/4: VoteWatch-EP-voting-data_2004-2022/EP8_RCVs_2019_06_25.xlsx and VoteWatch-EP-voting-data_2004-2022/EP8_Voted docs.xlsx
Were not able to match: 0 votes
Should be total length: 8796216
Got length: 7696506
Processing files 4/4: VoteWatch-EP-voting-data_2004-2022/EP9_RCVs_2022_06_22.xlsx and VoteWatch-EP-voting-data_2004-2022/EP9_Voted docs.xlsx


  warn(msg)


Were not able to match: 0 votes
Should be total length: 10915249
Got length: 9520348


In [3]:
# Column names of the combined frame, as a plain list
headers_list = list(combined_df.columns)

# Show the names, then how many there are
print("Column headers:", headers_list, sep="\n")
print(f"\nTotal number of headers: {len(headers_list)}")

Column headers:
['full name', 'country', 'national_party', 'epg', 'mep_id', 'vote_code', 'vote', 'date', 'title', 'policy_area', 'ep_session', 'year', 'month', 'policy_area_cleaned']

Total number of headers: 14


In [None]:
import pandas as pd

# Load the combined vote table from the CSV file written earlier in this notebook.
df = pd.read_csv('ep_voting_data_combined.csv')


In [19]:
# Quick sanity check of the reloaded table: its columns and the distinct
# vote codes it contains.
headers_list = df.columns.tolist()
print(headers_list)
print(df['vote'].unique())

['full name', 'country', 'national_party', 'epg', 'mep_id', 'vote_code', 'vote', 'date', 'title', 'policy_area', 'ep_session', 'year', 'month', 'policy_area_cleaned']
[1. 5. 2. 3. 4. 6.]


In [None]:
from itertools import combinations
import numpy as np

def rice_index(yes, no, abstain):
    """Return the Rice cohesion index |yes - no| / (yes + no + abstain).

    Args:
        yes: Number of yes votes.
        no: Number of no votes.
        abstain: Number of abstentions.

    Returns:
        The index, or 0 when no votes were cast at all (the ratio is
        undefined there and would otherwise raise ZeroDivisionError).
    """
    total = yes + no + abstain
    if total == 0:
        return 0
    return abs(yes - no) / total

def calculate_similarity(votes_epg1, votes_epg2):
    """Score how alike two groups' vote distributions are, from 0 to 1.

    Each argument maps a vote code to a count and must support
    ``.get(code, default)``. Only the codes 1 (yes), 2 (no) and 3 (abstain)
    are read. The score is 1 minus the Euclidean distance between the two
    share vectors, scaled by sqrt(2). It is 0 when either group has none of
    these three vote types.
    """
    codes = (1, 2, 3)  # yes, no, abstain
    counts1 = [votes_epg1.get(code, 0) for code in codes]
    counts2 = [votes_epg2.get(code, 0) for code in codes]

    total1 = sum(counts1)
    total2 = sum(counts2)
    if total1 == 0 or total2 == 0:
        return 0

    shares1 = [count / total1 for count in counts1]
    shares2 = [count / total2 for count in counts2]

    yes_gap, no_gap, abstain_gap = (
        (left - right) ** 2 for left, right in zip(shares1, shares2)
    )
    distance = np.sqrt(yes_gap + no_gap + abstain_gap)

    # sqrt(2) is the largest possible distance between two share vectors.
    return 1 - (distance / np.sqrt(2))


# One EPG-by-EPG similarity matrix per (year, policy area).
df_year = df.groupby(['year', 'policy_area'])
# dropna: a missing EPG cannot be sorted together with the string labels.
epgs = sorted(df['epg'].dropna().unique())

similarity_matrices = {}

for (year, policy_area), group in df_year:
    year = int(year)

    # Tally each EPG's votes within THIS group only. Tallying from the whole
    # frame would make every matrix identical and rescan all rows per pair.
    votes_by_epg = {
        epg: group.loc[group['epg'] == epg, 'vote'].value_counts()
        for epg in epgs
    }

    sim_matrix = pd.DataFrame(index=epgs, columns=epgs)

    # Calculate similarities between all EPG pairs
    for epg1, epg2 in combinations(epgs, 2):
        similarity = calculate_similarity(votes_by_epg[epg1], votes_by_epg[epg2])
        sim_matrix.loc[epg1, epg2] = similarity
        sim_matrix.loc[epg2, epg1] = similarity  # Matrix is symmetric

    # Set diagonal to 1 (self-similarity)
    for epg in epgs:
        sim_matrix.loc[epg, epg] = 1.0

    # Store the matrix
    matrix_key = (year, policy_area)
    similarity_matrices[matrix_key] = sim_matrix.fillna(0)
    



In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations

# Function to calculate Rice cohesion index
def rice_index(yes, no, abstain):
    """Return |yes - no| divided by all votes cast, or 0 when none were cast."""
    total = yes + no + abstain
    return abs(yes - no) / total if total != 0 else 0

# Function to calculate similarity between two EPGs based on their voting patterns
def calculate_similarity(votes_epg1, votes_epg2):
    """Compare two vote tallies and return a similarity between 0 and 1.

    Both arguments map vote codes to counts through ``.get``. The codes 1, 2
    and 3 are read as yes, no and abstain. Returns 0 if either tally holds
    none of them; otherwise 1 minus the sqrt(2)-scaled Euclidean distance
    between the two share vectors.
    """
    def vote_shares(tally):
        # Shares of yes / no / abstain, or None when there is nothing to share.
        yes = tally.get(1, 0)
        no = tally.get(2, 0)
        abstain = tally.get(3, 0)
        total = yes + no + abstain
        if total == 0:
            return None
        return yes / total, no / total, abstain / total

    first = vote_shares(votes_epg1)
    second = vote_shares(votes_epg2)
    if first is None or second is None:
        return 0

    yes_diff = first[0] - second[0]
    no_diff = first[1] - second[1]
    abstain_diff = first[2] - second[2]

    distance = np.sqrt(yes_diff**2 + no_diff**2 + abstain_diff**2)
    similarity = 1 - (distance / np.sqrt(2))  # scaled into the 0..1 range
    return similarity

# Main function to create similarity matrices
def create_epg_similarity_matrices(df):
    """Build one EPG-by-EPG similarity matrix per (year, policy area).

    Args:
        df: Vote table with the columns ``year``, ``policy_area``, ``epg``
            and ``vote``.

    Returns:
        A dict keyed by ``(year, policy_area)``, ordered by year and then
        policy area. Each value is a float DataFrame indexed and labelled by
        every EPG found in ``df``, holding ``calculate_similarity`` for each
        pair and 1.0 on the diagonal. Rows with a missing year or policy
        area belong to no matrix.
    """
    # dropna: missing values cannot be sorted together with string labels.
    epgs = sorted(df['epg'].dropna().unique())

    # Stand-in tally for an EPG with no votes in a given year/policy area.
    no_votes = pd.Series(dtype='int64')

    similarity_matrices = {}

    # groupby yields only non-empty groups, sorted by year then policy area,
    # and leaves out rows whose key is missing.
    for (year, policy_area), pa_data in df.groupby(['year', 'policy_area']):
        epg_votes = {
            epg: votes.value_counts()
            for epg, votes in pa_data.groupby('epg')['vote']
        }

        # Float frame from the start, so no object-dtype cells remain.
        sim_matrix = pd.DataFrame(0.0, index=epgs, columns=epgs)

        # NOTE(review): calculate_similarity returns 0 when a group has no
        # yes/no/abstain votes, so pairs involving an EPG absent from this
        # year/policy area score 0 - confirm that is intended before
        # averaging these matrices.
        for epg1, epg2 in combinations(epgs, 2):
            similarity = calculate_similarity(
                epg_votes.get(epg1, no_votes),
                epg_votes.get(epg2, no_votes),
            )
            sim_matrix.loc[epg1, epg2] = similarity
            sim_matrix.loc[epg2, epg1] = similarity  # Matrix is symmetric

        # Set diagonal to 1 (self-similarity)
        for epg in epgs:
            sim_matrix.loc[epg, epg] = 1.0

        similarity_matrices[(year, policy_area)] = sim_matrix.fillna(0)

    return similarity_matrices

# Function to plot a similarity matrix
def plot_similarity_matrix(matrix, year, policy_area):
    """Draw ``matrix`` as an annotated heatmap on a new figure and return it.

    The colour scale is fixed to 0..1 and the title names the year and the
    policy area.
    """
    fig = plt.figure(figsize=(10, 8))
    sns.heatmap(matrix, annot=True, fmt=".2f", cmap="YlGnBu", vmin=0, vmax=1)
    heading = f'EPG Voting Similarity - Year: {year}, Policy Area: {policy_area}'
    plt.title(heading)
    plt.tight_layout()
    return fig

# Main execution code
if __name__ == "__main__":
    def _upper_triangle(matrix):
        """Return the values strictly above the diagonal, row by row."""
        values = matrix.to_numpy(dtype=float)
        return values[np.triu_indices(len(matrix.index), k=1)]

    # Load the data
    df = pd.read_csv('ep_voting_data_combined.csv')

    # Rows without a year cannot be assigned to a yearly matrix, and
    # astype(int) raises on NaN, so drop them before casting.
    df = df.dropna(subset=['year'])
    df['year'] = df['year'].astype(int)

    # Create similarity matrices
    matrices = create_epg_similarity_matrices(df)

    # Print summary of the matrices created
    print(f"Created {len(matrices)} similarity matrices")

    # Save each matrix to CSV and create visualization
    for (year, policy_area), matrix in matrices.items():
        stem = f'similarity_matrix_{year}_{policy_area.replace(" ", "_")}'

        # Save matrix to CSV
        filename = f'{stem}.csv'
        matrix.to_csv(filename)
        print(f"Saved matrix to {filename}")

        # Plot and save visualization
        fig = plot_similarity_matrix(matrix, year, policy_area)
        fig_filename = f'{stem}.png'
        fig.savefig(fig_filename)
        plt.close(fig)
        print(f"Saved visualization to {fig_filename}")

    # Off-diagonal (upper-triangle) similarities of every matrix, extracted
    # once and reused for both the overall and the yearly averages.
    # NOTE(review): calculate_similarity returns 0 when a group has no
    # yes/no/abstain votes, so EPGs absent from a year/policy area pull
    # these averages down - confirm that is intended.
    pair_similarities = {
        key: _upper_triangle(matrix) for key, matrix in matrices.items()
    }

    # Calculate and print overall average similarity across all years and policy areas
    if pair_similarities:
        all_similarities = np.concatenate(list(pair_similarities.values()))
    else:
        all_similarities = np.array([])

    avg_similarity = np.mean(all_similarities)
    print(f"\nOverall average similarity between EPGs: {avg_similarity:.4f}")

    # Analyze trends over time: mean pair similarity per matrix, grouped by year
    yearly_avg_similarities = {}
    for (year, policy_area), similarities in pair_similarities.items():
        yearly_avg_similarities.setdefault(year, []).append(np.mean(similarities))

    # Calculate average similarity by year
    yearly_avgs = {year: np.mean(sims) for year, sims in yearly_avg_similarities.items()}

    # Plot trend over time
    years = sorted(yearly_avgs)
    avgs = [yearly_avgs[year] for year in years]

    plt.figure(figsize=(12, 6))
    plt.plot(years, avgs, marker='o', linestyle='-')
    plt.title('Average EPG Voting Similarity Over Time')
    plt.xlabel('Year')
    plt.ylabel('Average Similarity')
    plt.grid(True, alpha=0.3)
    plt.savefig('epg_similarity_trend.png')
    # Release the figure once it is on disk, as done for the heatmaps above.
    plt.close()
    print("Saved trend analysis to epg_similarity_trend.png")

: 