In [5]:
import pandas as pd
from datasets import load_dataset, concatenate_datasets
import re

def load_political_datasets():
    """Load the amendment datasets and merge them per political alignment.

    For every alignment in the hard-coded ``political_groups`` mapping, each
    listed party name is passed as the configuration argument to
    ``load_dataset("Tadorne/amendments", party)`` and its ``'train'`` split is
    collected. The collected splits of one alignment are concatenated into a
    single dataset.

    Loading is best-effort: a party that fails to load is reported on stdout
    (together with the underlying error) and skipped. An alignment for which
    no party could be loaded is absent from the result.

    Returns:
        dict: maps alignment name (``'left'`` / ``'right'``) to the
        concatenated dataset of that alignment.
    """
    political_groups = {
        'left': ['SD', 'GEFA'],
        'right': ['ECR', 'EPP', 'ID'],
    }

    datasets = {}
    for alignment, parties in political_groups.items():
        party_datasets = []
        for party in parties:
            try:
                ds = load_dataset("Tadorne/amendments", party)
                party_datasets.append(ds['train'])
            except Exception as exc:
                # Catch Exception (not a bare except) so Ctrl-C / SystemExit
                # still propagate, and show why the load failed.
                print(f"Couldn't load dataset for {party}: {exc!r}")

        # concatenate_datasets is only called with at least one split.
        if party_datasets:
            datasets[alignment] = concatenate_datasets(party_datasets)

    return datasets

datasets = load_political_datasets()
datasets

Downloading data: 100%|██████████| 12.3M/12.3M [00:01<00:00, 10.4MB/s]
Generating train split: 201456 examples [00:00, 462765.00 examples/s]
Downloading data: 100%|██████████| 6.86M/6.86M [00:00<00:00, 11.1MB/s]
Generating train split: 64274 examples [00:00, 481821.11 examples/s]
Downloading data: 100%|██████████| 3.82M/3.82M [00:00<00:00, 5.25MB/s]
Generating train split: 42890 examples [00:00, 604475.42 examples/s]
Downloading data: 100%|██████████| 13.4M/13.4M [00:00<00:00, 16.2MB/s]
Generating train split: 192882 examples [00:00, 591982.57 examples/s]
Downloading data: 100%|██████████| 1.54M/1.54M [00:00<00:00, 3.74MB/s]
Generating train split: 25615 examples [00:00, 707563.15 examples/s]


{'left': Dataset({
     features: ['text'],
     num_rows: 265730
 }),
 'right': Dataset({
     features: ['text'],
     num_rows: 261387
 })}

In [6]:
def split_amendment(text):
    """Split a raw amendment string into its original and amended parts.

    The input is split on the ``'###'`` separator; every piece is stripped of
    surrounding whitespace and empty pieces are discarded. The first remaining
    piece is the original text and the second is the amended text; any further
    pieces are ignored.

    Args:
        text: raw amendment string of the form ``"<before> ### <after>"``.

    Returns:
        dict | None: a dict with keys ``'before'``, ``'after'`` and
        ``'complete'`` (both parts combined into one labelled string), or
        ``None`` when ``text`` is not a string or has fewer than two
        non-empty parts.
    """
    # Missing values (None / NaN) coming from a DataFrame column have no
    # .split(); treat them like any other unsplittable row.
    if not isinstance(text, str):
        return None

    stripped = (piece.strip() for piece in text.split('###'))
    parts = [piece for piece in stripped if piece]

    if len(parts) < 2:
        return None

    before, after = parts[0], parts[1]
    return {
        'before': before,
        'after': after,
        'complete': f"Original text: {before}\nAmended text: {after}"
    }

split_amendment("ciaooo ### ti odio!!")

{'before': 'ciaooo',
 'after': 'ti odio!!',
 'complete': 'Original text: ciaooo\nAmended text: ti odio!!'}

In [10]:
def preprocess_dataset(dataset):
    """Build a cleaned before/after/complete DataFrame from raw amendments.

    ``dataset`` is converted with ``pd.DataFrame(dataset)`` and must yield a
    ``'text'`` column. Each text is passed through ``split_amendment``; rows
    for which it returns ``None`` are dropped. The surviving rows keep their
    original index labels.

    Every resulting column has newlines replaced by spaces, surrounding
    whitespace stripped, and runs of whitespace collapsed to a single space.

    Returns:
        pd.DataFrame: columns ``'before'``, ``'after'`` and ``'complete'``.
    """
    field_names = ['before', 'after', 'complete']

    frame = pd.DataFrame(dataset)

    # Parse every row, then keep only those that could actually be split.
    parsed = frame['text'].apply(split_amendment)
    usable = parsed[parsed.notna()]

    def _pick(field):
        # Pull one key out of each parsed dict.
        return usable.apply(lambda parts: parts[field])

    def _tidy(value):
        # Flatten newlines, trim the ends, then squeeze whitespace runs.
        flattened = value.replace('\n', ' ').strip()
        return re.sub(r'\s+', ' ', flattened)

    processed_df = pd.DataFrame({field: _pick(field) for field in field_names})

    for field in field_names:
        processed_df[field] = processed_df[field].apply(_tidy)

    return processed_df

preprocess_dataset(datasets['left'])

Unnamed: 0,before,after,complete
0,the requirements for the label are drawn up on...,the requirements for the label and / or the ce...,Original text: the requirements for the label ...
1,the most economically advantageous tender ;,"the most economically , socially and environme...",Original text: the most economically advantage...
2,the most economically advantageous tender refe...,"the most economically , socially and environme...",Original text: the most economically advantage...
3,public contracts should not be awarded to econ...,public contracts should not be awarded to econ...,Original text: public contracts should not be ...
4,public contracts should not be awarded to econ...,public contracts should not be awarded to econ...,Original text: public contracts should not be ...
...,...,...,...
265725,p. whereas a distinction should be drawn betwe...,p. whereas there are documented cases of non-c...,Original text: p. whereas a distinction should...
265726,t. whereas particular attention should be paid...,t. whereas particular attention should be paid...,Original text: t. whereas particular attention...
265727,u. whereas labelling can only be effective if ...,u. whereas labelling can only be effective if ...,Original text: u. whereas labelling can only b...
265728,w. whereas the aim in legislative action on la...,w. whereas the aim in legislative action shoul...,Original text: w. whereas the aim in legislati...


In [11]:

def prepare_political_datasets():
    """Load the per-alignment datasets and preprocess each of them.

    Calls ``load_political_datasets()`` and runs ``preprocess_dataset`` on
    every value of the returned mapping.

    Returns:
        dict: maps each alignment key to its preprocessed DataFrame.
    """
    raw_by_alignment = load_political_datasets()

    return {
        alignment: preprocess_dataset(raw)
        for alignment, raw in raw_by_alignment.items()
    }

# Run the pipeline
processed_data = prepare_political_datasets()
processed_data

{'left':                                                    before  \
 0       the requirements for the label are drawn up on...   
 1             the most economically advantageous tender ;   
 2       the most economically advantageous tender refe...   
 3       public contracts should not be awarded to econ...   
 4       public contracts should not be awarded to econ...   
 ...                                                   ...   
 265725  p. whereas a distinction should be drawn betwe...   
 265726  t. whereas particular attention should be paid...   
 265727  u. whereas labelling can only be effective if ...   
 265728  w. whereas the aim in legislative action on la...   
 265729  1. acknowledges the great strides made by live...   
 
                                                     after  \
 0       the requirements for the label and / or the ce...   
 1       the most economically , socially and environme...   
 2       the most economically , socially and environme...  

In [16]:
processed_data['right']

Unnamed: 0,before,after,complete
0,whenever a common methodology for the calculat...,whenever a common methodology for the calculat...,Original text: whenever a common methodology f...
1,"furthermore , in technical specifications and ...",in order to better integrate social considerat...,"Original text: furthermore , in technical spec..."
2,it has been established for repeated or contin...,it has been tested and verified with suppliers...,Original text: it has been established for rep...
3,"furthermore , in technical specifications and ...",in order to better integrate social considerat...,"Original text: furthermore , in technical spec..."
4,4 as regards engines for use in propulsion of ...,4 as regards engines for use in propulsion of ...,Original text: 4 as regards engines for use in...
...,...,...,...
261382,b. whereas acknowledging the importance of eci...,b. whereas acknowledging the importance of eci...,Original text: b. whereas acknowledging the im...
261383,"i. whereas the current eu legislation , which ...","i. whereas the current eu legislation , which ...",Original text: i. whereas the current eu legis...
261384,"i. whereas the current eu legislation , which ...","i. whereas the current eu legislation , which ...",Original text: i. whereas the current eu legis...
261385,n. whereas the laying hens directive has been ...,n. whereas the laying hens directive has been ...,Original text: n. whereas the laying hens dire...


In [21]:
import pandas as pd
import random
from datasets import Dataset

def create_dpo_pairs(left_df, right_df, num_samples=None, seed=None):
    """Build DPO-style records from the left and right amendment frames.

    Each row becomes one record:
        prompt:   "Amend the following legislative text: <before>"
        chosen:   the row's ``after`` text (the left/right-wing amendment)
        rejected: empty string placeholder (the original note suggests a
                  natural LLM response is intended here; TODO confirm)

    Args:
        left_df: DataFrame with at least ``'before'`` and ``'after'`` columns.
        right_df: DataFrame with at least ``'before'`` and ``'after'`` columns.
        num_samples: if not ``None``, randomly sample at most this many rows
            from each frame (capped at the frame's length). ``0`` yields
            empty lists.
        seed: optional ``random_state`` forwarded to ``DataFrame.sample`` so
            the sampling can be reproduced. ``None`` keeps it non-deterministic.

    Returns:
        tuple[list[dict], list[dict]]: ``(left_pairs, right_pairs)``.
    """
    # Selecting the columns up front fails fast if either one is missing.
    left_df = left_df[['before', 'after']]
    right_df = right_df[['before', 'after']]

    # Compare against None explicitly: a plain truthiness check would treat
    # num_samples=0 as "no sampling" and return every row.
    if num_samples is not None:
        left_df = left_df.sample(n=min(num_samples, len(left_df)), random_state=seed)
        right_df = right_df.sample(n=min(num_samples, len(right_df)), random_state=seed)

    return _rows_to_dpo_pairs(left_df), _rows_to_dpo_pairs(right_df)


def _rows_to_dpo_pairs(df):
    """Turn every (before, after) row of ``df`` into one DPO record."""
    return [
        {
            'prompt': f"Amend the following legislative text: {before}",
            'chosen': after,
            'rejected': ""
        }
        for before, after in zip(df['before'], df['after'])
    ]

create_dpo_pairs(processed_data['left'], processed_data['right'], 5)

([{'prompt': "Amend the following legislative text:\n\n' donor ' means every human source of organs , whether living or deceased ;\n\nProposed amendment:",
   'chosen': "' donor ' means every person who donates one or several organs , whether the donation occurred during that person ' s lifetime or after death ;",
   'rejected': ''},
  {'prompt': 'Amend the following legislative text:\n\nbuilding the capacity of local actors to develop and implement operations ;\n\nProposed amendment:',
   'chosen': 'building the capacity of local actors to initiate , develop and implement operations ;',
   'rejected': ''},
  {'prompt': 'Amend the following legislative text:\n\ni. whereas , despite the increase in the number of incidents , the number of fatalities arising from accidents and incidents on fishing vessels has shown a downward trend , with the vast majority of incidents being the result of human factors (62.4%) and system/equipment failures being the second most common cause (23.2% of inci

In [22]:

def prepare_dpo_datasets(processed_data, num_samples=2000):
    """Create one DPO dataset per alignment from the preprocessed frames.

    Reads ``processed_data['left']`` and ``processed_data['right']``, builds
    the DPO records with ``create_dpo_pairs`` (sampling ``num_samples`` rows
    per side), wraps each record list in a ``Dataset`` via a pandas
    DataFrame, and prints the size of both datasets.

    Returns:
        tuple: ``(left_dataset, right_dataset)``.
    """
    left_pairs, right_pairs = create_dpo_pairs(
        processed_data['left'],
        processed_data['right'],
        num_samples=num_samples
    )

    # Same conversion for both sides: list of dicts -> DataFrame -> Dataset.
    left_dataset, right_dataset = [
        Dataset.from_pandas(pd.DataFrame(pairs))
        for pairs in (left_pairs, right_pairs)
    ]

    print(f"Created left-wing DPO dataset with {len(left_dataset)} examples")
    print(f"Created right-wing DPO dataset with {len(right_dataset)} examples")

    return left_dataset, right_dataset

left_dpo, right_dpo = prepare_dpo_datasets(processed_data)

Created left-wing DPO dataset with 2000 examples
Created right-wing DPO dataset with 2000 examples


In [23]:
left_dpo, right_dpo 

(Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 2000
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 2000
 }))

In [25]:
left_dpo[0]

{'prompt': 'Amend the following legislative text:\n\n(13) conditional capacity should only be offered when network operators are not able to offer firm capacity. network operators should define the conditions for conditional capacity on the basis of operational constraints in a transparent and clear manner. the regulatory authority should ensure that the number of conditional capacity products is limited to avoid a fragmentation of the market and to ensure compliance with the principle of providing efficient third-party access.\n\nProposed amendment:',
 'chosen': '(13) conditional capacity should only be offered when network operators are not able to offer firm capacity. network operators should define the conditions for conditional capacity on the basis of operational constraints in a transparent and clear manner. the regulatory authority should ensure that the number and type of conditional capacity products is limited to avoid a fragmentation of the market and to ensure compliance w