In [1]:

voted = ["VoteWatch-EP-voting-data_2004-2022/~$EP6_Voted docs.xlsx", "VoteWatch-EP-voting-data_2004-2022/EP7_Voted docs.xlsx", "VoteWatch-EP-voting-data_2004-2022/EP8_Voted docs.xlsx", "VoteWatch-EP-voting-data_2004-2022/EP9_Voted docs.xlsx"]
rcv = ["VoteWatch-EP-voting-data_2004-2022/EP6_RCVs_2022_06_13.xlsx", "VoteWatch-EP-voting-data_2004-2022/EP7_RCVs_2014_06_19.xlsx", "VoteWatch-EP-voting-data_2004-2022/EP8_RCVs_2019_06_25.xlsx", "VoteWatch-EP-voting-data_2004-2022/EP9_RCVs_2022_06_22.xlsx"]

In [2]:
import pandas as pd

def clean_name(first_name, last_name):
    import unicodedata
    
    if not isinstance(first_name, str):
        first_name = str(first_name) if first_name is not None else ""
    if not isinstance(last_name, str):
        last_name = str(last_name) if last_name is not None else ""
    
    first_name = first_name.lower().strip()
    last_name = last_name.lower().strip()
    
    def normalize_chars(text):
        text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
        return text
    
    first_name = normalize_chars(first_name)
    last_name = normalize_chars(last_name)
    
    for char in ['-', "'", "`", ".", ",", "&", "'"]:  # Added apostrophe variants
        first_name = first_name.replace(char, ' ')
        last_name = last_name.replace(char, ' ')
    
    while '  ' in first_name:
        first_name = first_name.replace('  ', ' ')
    while '  ' in last_name:
        last_name = last_name.replace('  ', ' ')
        
    first_name = ' '.join(word.capitalize() for word in first_name.split())
    last_name = ' '.join(word.capitalize() for word in last_name.split())
    
    full_name = f"{first_name} {last_name}".strip()
    
    return full_name

def clean_text(text):
    if not isinstance(text, str):
        return text
  
    text = text.lower()
    
    for char in ['&', ',', '-']:
        text = text.replace(char, ' ')
    
    text = text.replace(' and ', ' ')
    
    while '  ' in text:
        text = text.replace('  ', ' ')
    
    return text.strip()    

def process_ep_voting_data(rcv_files, voted_docs_files):

    if len(rcv_files) != len(voted_docs_files):
        raise ValueError("The lists of RCV files and Voted docs files must have the same length")
     
    all_data = []
    
    for i, (rcv_file, voted_doc_file) in enumerate(zip(rcv_files, voted_docs_files)):
        print(f"Processing files {i+1}/{len(rcv_files)}: {rcv_file} and {voted_doc_file}")
        
        if "EP6" in rcv_file:
            ep_session = "EP6"
            rcv_data = pd.read_excel(rcv_file, header=1)
        elif "EP7" in rcv_file:
            ep_session = "EP7"
            rcv_data = pd.read_excel(rcv_file, sheet_name=0)
        elif "EP8" in rcv_file:
            ep_session = "EP8"
            rcv_data = pd.read_excel(rcv_file, sheet_name=0)
        elif "EP9" in rcv_file:
            ep_session = "EP9"
            rcv_data = pd.read_excel(rcv_file, sheet_name=0)
        else:
            ep_session = "Unknown"
            rcv_data = pd.read_excel(rcv_file, sheet_name=0)
            print("UNKNOWN SESSION")

        rcv_data = rcv_data.dropna(how='all')
        
        voted_docs = pd.read_excel(voted_doc_file)


        # Get vote columns (columns starting from the 11th column in RCV data as per codebook)
        vote_columns = rcv_data.columns[10:]



       
        votes_df = process_votes_ep(rcv_data[:10], voted_docs, vote_columns, ep_session=ep_session)
      
        # Add EP session information
        votes_df['ep_session'] = ep_session
        
        # Append to the list of results
        all_data.append(votes_df)
    
    # Concatenate all dataframes
    combined_df = pd.concat(all_data, ignore_index=True)
    
    # Perform final cleaning
    combined_df = clean_combined_data(combined_df)
    
    return combined_df


def process_votes_ep(rcv_data, voted_docs, vote_columns, ep_session = None):
    """Process voting data for EP7, EP8, EP9 sessions"""


    if ep_session == 'EP6':
        date = 'date'
        title = 'title'
        policy_area = 'main_policy_name'
        vote_id = 'euro_act_id'
        author = 'author_name'

        mep_id_key = 'WebisteEpID'

    else:
        date = 'Date'
        title = 'Title'
        policy_area = 'De'
        vote_id = 'Vote ID'
        author = 'Author'

        mep_id_key = 'WebisteEpID'

        if ep_session == 'EP7':
            mep_id_key = 'MEP ID'

        if ep_session == 'EP8':
            policy_area = "De/Policy area"

        elif ep_session == 'EP9':
            policy_area = 'Policy area'
  
    
    # Create a dictionary to map vote IDs to vote information
    vote_info = {}
    for _, row in voted_docs.iterrows():

        vote_info[row[vote_id]] = {
            'date': row[date],
            'title': row[title],
            'policy_area': row[policy_area],
            'author': author,
        }
    
    # Create a list to store results
    results = []
    
    # Process each MEP's votes
    for _, mep_row in rcv_data.iterrows():
        country = mep_row['Country']
        party = mep_row['Party']
        epg = mep_row['EPG']

        first_name = mep_row['Fname']
        last_name = mep_row['Lname']

        mep_id = mep_row[mep_id_key]
    
        # Process each vote for this MEP
        for i, vote_col in enumerate(vote_columns):
            vote_id = i + 1

            vote_code = f'{ep_session}-{vote_id}' 
            
            if vote_id not in vote_info:
                print('Skipped missing vote')
                continue
                
            mep_vote_code = mep_row[vote_col]
            
            if mep_vote_code == 0:
                continue
                
            info = vote_info[vote_id]
            
            results.append({
                'full name': clean_name(first_name, last_name),
                'country': country,
                'national_party': party,
                'epg': epg,
                'mep_id': mep_id,
                'vote_code': vote_code,
                'vote': mep_vote_code,
                'date': info['date'],
                'title': info['title'],
                'policy_area': clean_text(info['policy_area']),
            })
    
    return pd.DataFrame(results)


def clean_combined_data(df):
    """Perform final cleaning and transformations on the combined dataframe"""
    
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['policy_area_cleaned'] = df['policy_area'].str.strip().str.lower()
    
    return df


In [3]:

voted_docs_files = ["VoteWatch-EP-voting-data_2004-2022/EP6_Voted docs.xlsx", "VoteWatch-EP-voting-data_2004-2022/EP7_Voted docs.xlsx", "VoteWatch-EP-voting-data_2004-2022/EP8_Voted docs.xlsx", "VoteWatch-EP-voting-data_2004-2022/EP9_Voted docs.xlsx"]
rcv_files = ["VoteWatch-EP-voting-data_2004-2022/EP6_RCVs_2022_06_13.xlsx", "VoteWatch-EP-voting-data_2004-2022/EP7_RCVs_2014_06_19.xlsx", "VoteWatch-EP-voting-data_2004-2022/EP8_RCVs_2019_06_25.xlsx", "VoteWatch-EP-voting-data_2004-2022/EP9_RCVs_2022_06_22.xlsx"]
voted_docs_files = voted_docs_files[1:]
rcv_files = rcv_files[1:]
# Process the data
combined_df = process_ep_voting_data(rcv_files, voted_docs_files)

# Save the combined dataframe
output_file = "ep_voting_data_combined.csv"
combined_df.to_csv(output_file, index=False)




Processing files 1/3: VoteWatch-EP-voting-data_2004-2022/EP7_RCVs_2014_06_19.xlsx and VoteWatch-EP-voting-data_2004-2022/EP7_Voted docs.xlsx
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Processing files 2/3: VoteWatch-EP-voting-data_2004-2022/EP8_RCVs_2019_06_25.xlsx and VoteWatch-EP-voting-data_2004-2022/EP8_Voted docs.xlsx
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skipped missing vote
Skip

  warn(msg)
  df['date'] = pd.to_datetime(df['date'], errors='coerce')


Combined data saved to ep_voting_data_combined.csv
DataFrame shape: (271142, 14)

Sample of the data:
     full name country  national_party  \
0  Damien Abad  France  Nouveau Centre   
1  Damien Abad  France  Nouveau Centre   
2  Damien Abad  France  Nouveau Centre   
3  Damien Abad  France  Nouveau Centre   
4  Damien Abad  France  Nouveau Centre   

                                                 epg  mep_id vote_code  vote  \
0  Group of the European People's Party (Christia...       1     EP7-1     1   
1  Group of the European People's Party (Christia...       1     EP7-2     1   
2  Group of the European People's Party (Christia...       1     EP7-3     1   
3  Group of the European People's Party (Christia...       1     EP7-4     1   
4  Group of the European People's Party (Christia...       1     EP7-5     1   

        date                                              title  \
0 2009-09-14  Election of the President of the European Comm...   
1 2009-09-15  Agreement EC/Mon

In [4]:
print(f"Combined data saved to {output_file}")
print(f"DataFrame shape: {combined_df.shape}")
print("\nSample of the data:")
print(combined_df.head())

## Print some basic statistics
#print("\nBasic statistics:")
#print(f"Number of unique votes: {combined_df['vote_code'].nunique()}")
#print(f"Date range: {combined_df['date'].min()} to {combined_df['date'].max()}")
#print(f"Number of parliamentary groups: {combined_df['epg'].nunique()}")
#print(f"Number of countries: {combined_df['country'].nunique()}")

Combined data saved to ep_voting_data_combined.csv
DataFrame shape: (271142, 14)

Sample of the data:
     full name country  national_party  \
0  Damien Abad  France  Nouveau Centre   
1  Damien Abad  France  Nouveau Centre   
2  Damien Abad  France  Nouveau Centre   
3  Damien Abad  France  Nouveau Centre   
4  Damien Abad  France  Nouveau Centre   

                                                 epg  mep_id vote_code  vote  \
0  Group of the European People's Party (Christia...       1     EP7-1     1   
1  Group of the European People's Party (Christia...       1     EP7-2     1   
2  Group of the European People's Party (Christia...       1     EP7-3     1   
3  Group of the European People's Party (Christia...       1     EP7-4     1   
4  Group of the European People's Party (Christia...       1     EP7-5     1   

        date                                              title  \
0 2009-09-14  Election of the President of the European Comm...   
1 2009-09-15  Agreement EC/Mon