# Prepare dataset (JSON files)

- Prepare CDCP datasets for llama factory.

- Argument Class Classification (ACC)

- We create the data files: `CDCP_acc_train.json`, `CDCP_acc_test.json`

## Libraries

In [1]:
import os
import json
import datasets
from datasets import load_dataset

## Load Data

In [2]:
# Load the CDCP dataset (presumably fetched from the Hugging Face Hub by its repo id).
# NOTE(review): trust_remote_code=True lets the dataset repository's own loading
# script run locally -- confirm the source is trusted before re-running.
cdcp_dataset = load_dataset("DFKI-SLT/cdcp", trust_remote_code=True)

In [3]:
# Display the loaded splits with their features and row counts.
cdcp_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'propositions', 'relations'],
        num_rows: 580
    })
    test: Dataset({
        features: ['id', 'text', 'propositions', 'relations'],
        num_rows: 150
    })
})

In [4]:
def write_instruction(nr_acs):
    """Build the instruction prompt for classifying ``nr_acs`` argument components.

    The prompt names the five allowed component types and shows the expected
    JSON answer format, with one "component_type (str)" placeholder per
    argument component.

    Args:
        nr_acs: Number of argument components in the text.

    Returns:
        The instruction string, terminated by a newline.
    """
    # JSON-encoded list of placeholders, e.g. '["component_type (str)", ...]'.
    answer_template = json.dumps(["component_type (str)"] * nr_acs)

    # The prompt is assembled from adjacent literals; the pieces concatenate
    # to exactly one line followed by a trailing space and a newline.
    return (
        "### You are an expert in Argument Mining. "
        "You are given a text which contains numbered argument components enclosed by <AC></AC> tags. "
        'Your task is to classify each argument component in the text as either "fact", "policy", "reference", "testimony" or "value". '
        f"You must return a list of argument component types, strictly of length {nr_acs}, in following JSON format: "
        f'{{"component_types": {answer_template}}} '
        'where each element "component_type (str)" is replaced by either "fact", "policy", "reference", "testimony" or "value". \n'
    )

In [5]:
def formatting_fct(instruction="", input="", output=""):
    """Bundle the prompt parts into one instruction/input/output record.

    Args:
        instruction: Task description shown to the model.
        input: The text to process.
        output: The expected answer.

    Returns:
        A dict with the keys "instruction", "input" and "output" (in that
        order); every value is rendered to text through an f-string.
    """
    # f-strings are kept so non-string arguments are formatted as before.
    return dict(
        instruction=f"{instruction}",
        input=f"{input}",
        output=f"{output}",
    )

In [6]:
def insert_tags(text, start_indices, end_indices):
    """Wrap each argument-component span of ``text`` in numbered AC tags.

    The i-th span (1-based) is enclosed as ``<ACi>...</ACi>``.

    Args:
        text: The untagged document text.
        start_indices: Start character offsets of the spans, relative to the
            untagged ``text``.
        end_indices: End character offsets (exclusive) of the spans, paired
            with ``start_indices`` by position.

    Returns:
        The tagged text prefixed with "### Here is the text: ". When no spans
        are given, the text is returned untagged with the same prefix.

    Note:
        Spans are assumed to be sorted in ascending order and non-overlapping;
        the running offset is only valid under that assumption.
    """
    # Number of tag characters already inserted before the current span.
    offset = 0

    for i, (start_i, end_i) in enumerate(zip(start_indices, end_indices), start=1):

        start_tag = f"<AC{i}>"
        end_tag = f"</AC{i}>"

        # Shift the original offsets by the tags inserted so far.
        start_idx = start_i + offset
        end_idx = end_i + offset

        # Splice by position instead of str.replace: replace() would tag every
        # occurrence of the span text (e.g. a repeated sentence) and break the
        # offsets of all following spans.
        text = (
            text[:start_idx]
            + start_tag
            + text[start_idx:end_idx]
            + end_tag
            + text[end_idx:]
        )

        offset += len(start_tag) + len(end_tag)

    # Built after the loop so that an empty span list still yields a result.
    return f"""### Here is the text: {text}"""

In [7]:
def get_ac_types(raw_labels):
    """Turn integer class ids into the JSON answer string.

    Args:
        raw_labels: Iterable of integer label ids, one per argument component.

    Returns:
        A JSON string of the form ``{"component_types": [...]}`` holding the
        label names in the same order as ``raw_labels``.
    """
    # Position in this tuple is the label id; presumably this matches the
    # dataset's label encoding -- verify against the dataset features.
    label_names = ("fact", "policy", "reference", "testimony", "value")

    component_types = []
    for label_id in raw_labels:
        component_types.append(label_names[label_id])

    payload = {"component_types": component_types}
    return json.dumps(payload)

## Create Data Files

In [8]:
# Build one instruction/input/output record per document of the train split.
data_file_train = []

for sample in cdcp_dataset["train"]:

    propositions = sample["propositions"]
    raw_labels = propositions["label"]

    # Prompt sized to the number of components, tagged text, gold answer.
    instruction = write_instruction(len(raw_labels))
    question = insert_tags(sample["text"], propositions["start"], propositions["end"])
    answer = get_ac_types(raw_labels)

    record = formatting_fct(instruction, question, answer)
    data_file_train.append(record)

In [9]:
# Number of training records built (one per document in the train split).
len(data_file_train)

580

In [10]:
# Preview the first three training records, each followed by a blank line.
for i in range(3):
    print(data_file_train[i], end="\n\n")

{'instruction': '### You are an expert in Argument Mining. You are given a text which contains numbered argument components enclosed by <AC></AC> tags. Your task is to classify each argument component in the text as either "fact", "policy", "reference", "testimony" or "value". You must return a list of argument component types, strictly of length 3, in following JSON format: {"component_types": ["component_type (str)", "component_type (str)", "component_type (str)"]} where each element "component_type (str)" is replaced by either "fact", "policy", "reference", "testimony" or "value". \n', 'input': '### Here is the text: <AC1>State and local court rules sometimes make default judgments much more likely.</AC1><AC2> For example, when a person who allegedly owes a debt is told to come to court on a work day, they may be forced to choose between a default judgment and their job.</AC2><AC3> I urge the CFPB to find practices that involve scheduling hearings at inconvenient times unfair, decep

In [11]:
# Build one instruction/input/output record per document of the test split.
data_file_test = []

for sample in cdcp_dataset["test"]:

    propositions = sample["propositions"]
    raw_labels = propositions["label"]

    # Prompt sized to the number of components, tagged text, gold answer.
    instruction = write_instruction(len(raw_labels))
    question = insert_tags(sample["text"], propositions["start"], propositions["end"])
    answer = get_ac_types(raw_labels)

    record = formatting_fct(instruction, question, answer)
    data_file_test.append(record)

In [12]:
# Number of test records built (one per document in the test split).
len(data_file_test)

150

In [13]:
# Preview the first three test records, each followed by a blank line.
for i in range(3):
    print(data_file_test[i], end="\n\n")

{'instruction': '### You are an expert in Argument Mining. You are given a text which contains numbered argument components enclosed by <AC></AC> tags. Your task is to classify each argument component in the text as either "fact", "policy", "reference", "testimony" or "value". You must return a list of argument component types, strictly of length 3, in following JSON format: {"component_types": ["component_type (str)", "component_type (str)", "component_type (str)"]} where each element "component_type (str)" is replaced by either "fact", "policy", "reference", "testimony" or "value". \n', 'input': "### Here is the text: <AC1>Recently, courts have held that debt collectors can escape 1692i's venue provisions entirely by pursuing debt collection through arbitration instead.</AC1><AC2> As the NAF studies reflect, arbitration has not proven a satisfactory alternative.</AC2><AC3> I urge the CFPB to include in a rule language interpreting 1692i as requiring debt collectors to proceed in cour

## Save `json` files

In [14]:
file_path = os.path.join(os.getcwd(), "../datasets/CDCP_acc_train.json")

# Create the output directory if it is missing so the cell also works on a
# fresh checkout instead of failing with FileNotFoundError.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Explicit UTF-8 keeps the file encoding independent of the platform default.
with open(file_path, "w", encoding="utf-8") as file:

    # All training records are written as one JSON array.
    json.dump(data_file_train, file)

In [15]:
file_path = os.path.join(os.getcwd(), "../datasets/CDCP_acc_test.json")

# Create the output directory if it is missing so the cell also works on a
# fresh checkout instead of failing with FileNotFoundError.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Explicit UTF-8 keeps the file encoding independent of the platform default.
with open(file_path, "w", encoding="utf-8") as file:

    # All test records are written as one JSON array.
    json.dump(data_file_test, file)