In [1]:

# "Voted docs" workbooks, one per parliamentary term (EP6-EP9).
# The EP6 entry used to point at "~$EP6_Voted docs.xlsx"; a "~$" prefix is the
# temporary lock file Excel leaves next to an open workbook, not the workbook
# itself, so it is replaced by the real file name here.
voted = [
    "VoteWatch-EP-voting-data_2004-2022/EP6_Voted docs.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP7_Voted docs.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP8_Voted docs.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP9_Voted docs.xlsx",
]
# Roll-call vote (RCV) workbooks, in the same term order as `voted`.
rcv = [
    "VoteWatch-EP-voting-data_2004-2022/EP6_RCVs_2022_06_13.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP7_RCVs_2014_06_19.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP8_RCVs_2019_06_25.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP9_RCVs_2022_06_22.xlsx",
]

In [2]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

def process_ep_voting_data(rcv_files, voted_docs_files):
    """
    Process European Parliament voting data from RCV files and Voted docs files.

    The two lists are walked in parallel: entry ``k`` of ``rcv_files`` is
    combined with entry ``k`` of ``voted_docs_files``. The parliamentary
    session ("EP6" .. "EP9") is detected by looking for that tag in the RCV
    file path; paths containing none of the tags are labelled "Unknown" and
    handled with the EP7 column layout. Progress is printed for each pair.

    Parameters:
    rcv_files (list): List of filenames for RCV files
    voted_docs_files (list): List of filenames for Voted docs files

    Returns:
    pd.DataFrame: Combined dataframe with all relevant data for analysis

    Raises:
    ValueError: If the two lists differ in length, or if they are empty.
    """
    # Check if the lists have the same length
    if len(rcv_files) != len(voted_docs_files):
        raise ValueError("The lists of RCV files and Voted docs files must have the same length")

    # pd.concat would otherwise fail further down with an opaque
    # "No objects to concatenate"; reject empty input with a clear message.
    if len(rcv_files) == 0:
        raise ValueError("No RCV / Voted docs files were provided")

    known_sessions = ("EP6", "EP7", "EP8", "EP9")
    all_data = []

    for i, (rcv_file, voted_doc_file) in enumerate(zip(rcv_files, voted_docs_files)):
        print(f"Processing files {i+1}/{len(rcv_files)}: {rcv_file} and {voted_doc_file}")

        # Determine EP session based on filename (first matching tag wins)
        ep_session = next(
            (tag for tag in known_sessions if tag in str(rcv_file)),
            "Unknown",
        )

        # The EP6 workbook is read with its second row as the header row;
        # every other session uses the first row.
        header_row = 1 if ep_session == "EP6" else 0
        rcv_data = pd.read_excel(rcv_file, sheet_name=0, header=header_row)

        # Load Voted docs data
        voted_docs = pd.read_excel(voted_doc_file)

        # Get vote columns (columns starting from the 11th column in RCV data as per codebook)
        vote_columns = rcv_data.columns[10:]

        # Process the data session by session to handle different column naming conventions
        if ep_session == "EP6":
            votes_df = process_votes_ep6(rcv_data, voted_docs, vote_columns)
        else:
            # Only EP8 and EP9 have their own column layout; EP7 and unknown
            # sessions rely on the helper's default (ep_session=None).
            layout = ep_session if ep_session in ("EP8", "EP9") else None
            votes_df = process_votes_ep789(rcv_data, voted_docs, vote_columns, ep_session=layout)

        # Add EP session information
        votes_df['ep_session'] = ep_session

        # Append to the list of results
        all_data.append(votes_df)

    # Concatenate all dataframes
    combined_df = pd.concat(all_data, ignore_index=True)

    # Perform final cleaning
    combined_df = clean_combined_data(combined_df)

    return combined_df

def process_votes_ep6(rcv_data, voted_docs, vote_columns):
    """Process voting data for EP6 session.

    Builds one output row per (MEP, vote) pair by joining each MEP row of
    ``rcv_data`` with the per-vote metadata in ``voted_docs``.

    Parameters:
    rcv_data (pd.DataFrame): One row per MEP with 'Country', 'Party' and
        'EPG' columns plus one column per roll-call vote.
    voted_docs (pd.DataFrame): One row per vote, keyed by 'euro_act_id'.
        If an id occurs more than once, the last row wins.
    vote_columns (iterable): The vote columns of ``rcv_data``, in order.

    Returns:
    pd.DataFrame: Long-format records. The columns are always present, even
        when no record is produced, so downstream cleaning does not fail on
        an empty session.

    Notes:
    - The n-th entry of ``vote_columns`` is treated as vote ID n; this assumes
      the RCV columns are ordered by vote ID starting at 1 -- TODO confirm
      against the codebook.
    - Votes whose ID is missing from ``voted_docs`` and cells equal to 0 are
      skipped. Missing (NaN) cells are kept, as they do not compare equal to 0.
    """
    # Output field -> source column in the EP6 "Voted docs" sheet.
    # 'raporteur' (single "p") is the spelling this code has always read from
    # the sheet; presumably that is how the column is named there.
    doc_fields = {
        'date': 'date',
        'title': 'title',
        'procedure': 'procedure',
        'reading': 'reading',
        'voting_rule': 'rule',
        'rapporteur': 'raporteur',
        'policy_area': 'main_policy_name',
        'subject': 'subject',
        'final_vote': 'final_vote',
        'result': 'result_code',
        'yes_count': 'yes',
        'no_count': 'no',
        'abstain_count': 'abstain',
    }
    output_columns = ['country', 'national_party', 'epg', 'vote_id', 'vote', *doc_fields]

    # Create a dictionary to map vote IDs to vote information
    vote_info = {
        doc_row['euro_act_id']: {field: doc_row[source] for field, source in doc_fields.items()}
        for _, doc_row in voted_docs.iterrows()
    }

    # Resolve every vote column to its ID and metadata once, instead of
    # repeating the same lookup for each MEP.
    known_votes = [
        (vote_col, vote_id, vote_info[vote_id])
        for vote_id, vote_col in enumerate(vote_columns, start=1)  # Vote IDs start from 1
        if vote_id in vote_info
    ]

    results = []

    # Process each MEP's votes
    for _, mep_row in rcv_data.iterrows():
        mep_fields = {
            'country': mep_row['Country'],
            'national_party': mep_row['Party'],
            'epg': mep_row['EPG'],
        }

        for vote_col, vote_id, info in known_votes:
            # Get MEP's vote (1=for, 2=against, 3=abstention, etc.)
            mep_vote_code = mep_row[vote_col]

            # Skip if MEP was not active for this vote
            if mep_vote_code == 0:
                continue

            results.append({**mep_fields, 'vote_id': vote_id, 'vote': mep_vote_code, **info})

    return pd.DataFrame(results, columns=output_columns)

def process_votes_ep789(rcv_data, voted_docs, vote_columns, ep_session = None):
    """Process voting data for EP7, EP8, EP9 sessions.

    Builds one output row per (MEP, vote) pair by joining each MEP row of
    ``rcv_data`` with the per-vote metadata in ``voted_docs``.

    Parameters:
    rcv_data (pd.DataFrame): One row per MEP with 'Country', 'Party' and
        'EPG' columns plus one column per roll-call vote.
    voted_docs (pd.DataFrame): One row per vote, keyed by 'Vote ID'.
        If an id occurs more than once, the last row wins.
    vote_columns (iterable): The vote columns of ``rcv_data``, in order.
    ep_session (str | None): 'EP8' or 'EP9' select that session's column
        names for policy area, final-vote flag and yes count; any other
        value (including None) selects the default layout used for EP7.

    Returns:
    pd.DataFrame: Long-format records. The columns are always present, even
        when no record is produced, so downstream cleaning does not fail on
        an empty session.

    Notes:
    - The n-th entry of ``vote_columns`` is treated as vote ID n; this assumes
      the RCV columns are ordered by vote ID starting at 1 -- TODO confirm
      against the codebook.
    - Votes whose ID is missing from ``voted_docs`` and cells equal to 0 are
      skipped. Missing (NaN) cells are kept, as they do not compare equal to 0.
    """
    # Session-specific header names: (policy area, final-vote flag, yes count)
    session_columns = {
        'EP8': ("De/Policy area", 'Final \nvote?', 'Yeas'),
        'EP9': ('Policy area', 'Final vote?', 'Yes'),
    }
    policy_area, final_vote, yes = session_columns.get(
        ep_session, ('De', 'Final vote?', 'Yeas')
    )

    # Output field -> source column in the "Voted docs" sheet
    doc_fields = {
        'date': 'Date',
        'title': 'Title',
        'procedure': 'Procedure',
        'reading': 'Leg/Non-Leg/Bud',
        'voting_rule': 'Voting Rule',
        'rapporteur': 'Rapporteur',
        'policy_area': policy_area,  # Main policy area
        'subject': 'Subject',
        'final_vote': final_vote,
        'result': 'Vote',
        'yes_count': yes,
        'no_count': 'No',
        'abstain_count': 'Abs',
    }
    output_columns = ['country', 'national_party', 'epg', 'vote_id', 'vote', *doc_fields]

    # Create a dictionary to map vote IDs to vote information
    vote_info = {
        doc_row['Vote ID']: {field: doc_row[source] for field, source in doc_fields.items()}
        for _, doc_row in voted_docs.iterrows()
    }

    # Resolve every vote column to its ID and metadata once, instead of
    # repeating the same lookup for each MEP.
    known_votes = [
        (vote_col, vote_id, vote_info[vote_id])
        for vote_id, vote_col in enumerate(vote_columns, start=1)  # Vote IDs start from 1
        if vote_id in vote_info
    ]

    results = []

    # Process each MEP's votes
    for _, mep_row in rcv_data.iterrows():
        mep_fields = {
            'country': mep_row['Country'],
            'national_party': mep_row['Party'],
            'epg': mep_row['EPG'],
        }

        for vote_col, vote_id, info in known_votes:
            # Get MEP's vote (1=for, 2=against, 3=abstention, etc.)
            mep_vote_code = mep_row[vote_col]

            # Skip if MEP was not active for this vote
            if mep_vote_code == 0:
                continue

            results.append({**mep_fields, 'vote_id': vote_id, 'vote': mep_vote_code, **info})

    return pd.DataFrame(results, columns=output_columns)


def clean_combined_data(df):
    """Perform final cleaning and transformations on the combined dataframe.

    The frame is modified in place and also returned. Columns added:
    'year', 'month', 'vote_binary', 'vote_passed' and 'procedure_type'.
    Columns rewritten: 'date' (to datetime), 'yes_count' / 'no_count' /
    'abstain_count' (to numeric) and 'policy_area' (stripped, lower-cased).

    Parameters:
    df (pd.DataFrame): Combined per-MEP vote records with at least the
        columns 'date', 'yes_count', 'no_count', 'abstain_count', 'vote',
        'result', 'policy_area' and 'procedure'.

    Returns:
    pd.DataFrame: The same frame object, cleaned.
    """
    # Convert date strings to datetime objects (unparseable values become NaT)
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Year and month columns for easier time-based filtering
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month

    # Convert numeric columns to appropriate types (unparseable values become NaN)
    for col in ['yes_count', 'no_count', 'abstain_count']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Create a binary vote column for easier analysis.
    # The RCV sheets store votes as numeric codes (1=for, 2=against,
    # 3=abstention), so mapping only the text labels left this column
    # entirely NaN. Codes are mapped first; text labels remain accepted.
    # Every other code or label (absent, did not vote, ...) stays NaN.
    code_to_binary = {1: 1.0, 2: -1.0, 3: 0.0}
    label_to_binary = {'for': 1.0, 'against': -1.0, 'abstention': 0.0}
    from_codes = pd.to_numeric(df['vote'], errors='coerce').map(code_to_binary)
    from_labels = df['vote'].map(label_to_binary)
    df['vote_binary'] = np.where(from_codes.notna(), from_codes, from_labels)

    # Create a column for vote result (pass/fail): '+' -> 1, '-' -> 0, else NaN
    df['vote_passed'] = df['result'].map({'+': 1, '-': 0})

    # Clean up policy areas (standardize names)
    df['policy_area'] = df['policy_area'].str.strip().str.lower()

    # Create a simplified procedure column
    df['procedure_type'] = df['procedure'].apply(simplify_procedure)

    return df

def simplify_procedure(procedure_code):
    """Collapse a procedure code into one of three broad categories.

    Returns 'legislative' or 'budgetary' when the code exactly matches one of
    the codes listed for that category, and 'non-legislative' for everything
    else (including missing values and differently-cased codes).
    """
    category_codes = (
        ('legislative', ('COD', 'CNS', 'APP', 'AVC', 'SYN', 'INL')),
        ('budgetary', ('BUD', 'BUI', 'DEC')),
    )

    for category, codes in category_codes:
        if procedure_code in codes:
            return category

    # No category claimed the code
    return 'non-legislative'


In [3]:

# Input workbooks, one "Voted docs" file and one RCV file per parliamentary
# term; the two lists must be in the same term order because they are paired
# position by position.
voted_docs_files = [
    "VoteWatch-EP-voting-data_2004-2022/EP6_Voted docs.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP7_Voted docs.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP8_Voted docs.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP9_Voted docs.xlsx",
]
rcv_files = [
    "VoteWatch-EP-voting-data_2004-2022/EP6_RCVs_2022_06_13.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP7_RCVs_2014_06_19.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP8_RCVs_2019_06_25.xlsx",
    "VoteWatch-EP-voting-data_2004-2022/EP9_RCVs_2022_06_22.xlsx",
]

# Process the data
combined_df = process_ep_voting_data(rcv_files, voted_docs_files)

# Save the combined dataframe
output_file = "ep_voting_data_combined.csv"
combined_df.to_csv(output_file, index=False)

print(f"Combined data saved to {output_file}")
print(f"DataFrame shape: {combined_df.shape}")
print("\nSample of the data:")
print(combined_df.head())

# Print some basic statistics
print("\nBasic statistics:")
# vote_id restarts at 1 in every session, so a plain nunique() on vote_id
# would merge votes from different sessions; count (session, vote) pairs.
unique_vote_count = len(combined_df[['ep_session', 'vote_id']].drop_duplicates())
print(f"Number of unique votes: {unique_vote_count}")
print(f"Date range: {combined_df['date'].min()} to {combined_df['date'].max()}")
print(f"Number of parliamentary groups: {combined_df['epg'].nunique()}")
print(f"Number of countries: {combined_df['country'].nunique()}")


Processing files 1/1: VoteWatch-EP-voting-data_2004-2022/EP9_RCVs_2022_06_22.xlsx and VoteWatch-EP-voting-data_2004-2022/EP9_Voted docs.xlsx


  warn(msg)
  df['date'] = pd.to_datetime(df['date'], errors='coerce')


Combined data saved to ep_voting_data_combined.csv
DataFrame shape: (9520348, 24)

Sample of the data:
  country national_party  epg  vote_id  vote       date  \
0  Poland    Independent  EPP        1     5 2019-07-15   
1  Poland    Independent  EPP        2     2 2019-07-18   
2  Poland    Independent  EPP        3     1 2019-07-18   
3  Poland    Independent  EPP        4     2 2019-07-18   
4  Poland    Independent  EPP        5     2 2019-07-18   

                                    title procedure reading voting_rule  ...  \
0  Tuesday - request by the GUE/NGL group       NaN     Non           s  ...   
1     Situation at the USA-Mexican border       NaN     Non           s  ...   
2     Situation at the USA-Mexican border       NaN     Non           s  ...   
3     Situation at the USA-Mexican border       NaN     Non           s  ...   
4     Situation at the USA-Mexican border       NaN     Non           s  ...   

  result yes_count no_count  abstain_count ep_session  year  

KeyError: 'mep_name'