# Finetune CDCP dataset for ACC

## Libraries

In [1]:
# Run this cell only once to install LLaMA-Factory

# %cd ..
# %rm -rf LLaMA-Factory
# !git clone https://github.com/hiyouga/LLaMA-Factory.git
# %cd LLaMA-Factory
# %ls
# !pip install -e .[torch,bitsandbytes]

In [2]:
# !pip uninstall -y pydantic
# !pip install pydantic==1.10.9

# !pip uninstall -y gradio
# !pip install gradio==3.48.0

# !pip uninstall -y bitsandbytes
# !pip install --upgrade bitsandbytes

# !pip install tqdm
# !pip install ipywidgets
# !pip install scikit-learn

# Restart kernel afterwards.

In [1]:
import os
import ast
import sys
import json
import torch
import pickle
import subprocess

sys.path.append('../')

import pandas as pd

from tqdm.notebook import tqdm
from llamafactory.chat import ChatModel
from llamafactory.extras.misc import torch_gc
from sklearn.metrics import classification_report
from utils.post_processing import post_process_acc

In [2]:
# Warn early when no CUDA device is visible. A plain `if` is used instead of
# `assert` + `except AssertionError`, because assert statements are removed
# when Python runs with -O, which would silently disable this check.
if not torch.cuda.is_available():
    print("Please set up a GPU before using LLaMA Factory...")

## Parameters

In [5]:
# Absolute path of the parent of the current working directory; used below as
# the root under which the fine-tuned model outputs are located.
cdcp_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

In [6]:
cdcp_dir

'/Utilisateurs/umushtaq/am_work/coling_2025/cdcp'

In [16]:
# Model identifier in "<owner>/<name>" form; the part after the first "/" is
# used when building OUTPUT_DIR.
BASE_MODEL = "unsloth/llama-3-8b-Instruct-bnb-4bit"

In [13]:
# Task tag used in the output directory name and in the results file name.
# NOTE(review): the notebook title mentions ACC while TASK is "aric" -- confirm
# which task this run is meant to evaluate.
TASK = "aric"

In [17]:
# Directory holding the artefacts of this (task, base model) combination:
# <cdcp_dir>/finetuned_models/CDCP_<TASK>_<model name>.
# BASE_MODEL.split("/")[1] raises IndexError when BASE_MODEL contains no "/".
OUTPUT_DIR = os.path.join(cdcp_dir, "finetuned_models", f"""CDCP_{TASK}_{BASE_MODEL.split("/")[1]}""")

In [18]:
OUTPUT_DIR

'/Utilisateurs/umushtaq/am_work/coling_2025/cdcp/finetuned_models/CDCP_aric_llama-3-8b-Instruct-bnb-4bit'

In [20]:
# Epoch count; it is part of the results file name loaded further down
# (CDCP_<TASK>_results_<NB_EPOCHS>.pickle).
NB_EPOCHS = 2

## Post-processing

In [124]:
import copy
import datasets
from datasets import load_dataset

In [77]:
# Load the CDCP dataset from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True allows the dataset repository's loading
# script to run locally -- confirm the source is trusted (consider pinning a
# revision).
cdcp_dataset = load_dataset("DFKI-SLT/cdcp", trust_remote_code=True)

In [78]:
cdcp_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'propositions', 'relations'],
        num_rows: 580
    })
    test: Dataset({
        features: ['id', 'text', 'propositions', 'relations'],
        num_rows: 150
    })
})

In [79]:
cdcp_dataset['test'][149]

{'id': '01418',
 'text': 'For further reference on debt tolling: __URL__ So, again, it seems as though the SOL matters very little in these cases. Perhaps the bigger question is whether or not eliminating debt tolling should be considered as a new rule.',
 'propositions': {'start': [0, 46, 120],
  'end': [46, 120, 227],
  'label': [2, 4, 4],
  'url': ['http://collectionagencydebt.blogspot.com/2012/08/tolling-debt-and-statute-of-limitations.html',
   '',
   '']},
 'relations': {'head': [], 'tail': [], 'label': []}}

In [80]:
# Number of propositions (one label per proposition) for every test sample.
nr_acs_l = [
    len(test_sample['propositions']['label'])
    for test_sample in cdcp_dataset['test']
]

In [81]:
len(nr_acs_l)

150

In [232]:
# Read the pickled inference results of this run from OUTPUT_DIR.
# pickle.load can execute arbitrary code embedded in the file, so only load
# result files produced by your own runs.
results_path = os.path.join(OUTPUT_DIR, f"""CDCP_{TASK}_results_{NB_EPOCHS}.pickle""")

with open(results_path, "rb") as results_file:
    results = pickle.load(results_file)

In [233]:
# Raw ground-truth entries as stored in the results pickle (still unparsed).
grounds = results["ground_truths"]

In [234]:
# Parse every ground-truth JSON string and keep its "relation_types" list.
# Reading from `results` (rather than from `grounds` itself) keeps this cell
# idempotent: re-running it no longer feeds already-parsed lists to json.loads.
grounds = [json.loads(raw_truth)["relation_types"] for raw_truth in results["ground_truths"]]

In [235]:
# Raw prediction entries as stored in the results pickle; each entry is
# indexed with "content" in the following cell.
preds = results["predictions"]

In [236]:
# Keep only the "content" field of every prediction entry.
# Reading from `results` (rather than from `preds` itself) keeps this cell
# idempotent: re-running it no longer indexes plain strings with "content".
preds = [prediction["content"] for prediction in results["predictions"]]

In [237]:
# Parse the JSON text in every prediction's "content" field and keep its
# "relation_types" list. Deriving the value from `results` keeps this cell
# idempotent: re-running it no longer passes already-parsed lists to json.loads.
preds = [
    json.loads(prediction["content"])["relation_types"]
    for prediction in results["predictions"]
]

In [238]:
len(grounds), len(preds)

(150, 150)

In [239]:
# Print the index of every sample whose gold and predicted lists differ in length.
for sample_idx, (gold_items, pred_items) in enumerate(zip(grounds, preds)):
    if len(gold_items) != len(pred_items):
        print(sample_idx)

0
4
5
6
7
8
10
11
12
13
15
17
18
20
21
25
26
28
29
32
34
35
37
38
39
42
43
44
45
46
47
48
50
51
53
54
55
56
58
59
60
61
62
64
66
67
73
74
76
77
78
80
84
85
87
89
90
91
92
93
94
96
98
102
103
104
105
106
107
110
114
116
117
120
121
122
123
124
128
130
132
133
136
139
142
144
148
149


In [240]:
# Copy the outer list AND every per-sample list. A plain slice (`[:]`) only
# copies the outer list, so in-place edits of a per-sample list made downstream
# would leak back into `grounds` / `preds`.
grounds_tmp = [list(sample_relations) for sample_relations in grounds]
preds_tmp = [list(sample_relations) for sample_relations in preds]

In [241]:
def _fill_missing_pairs(relations, all_pairs):
    """Return a sorted copy of ``relations`` with every absent pair added as "None"."""
    # Pairs already present; only the first two items of each relation are read.
    present_pairs = [[relation[0], relation[1]] for relation in relations]

    # Work on a copy so the caller's per-sample list is never appended to or sorted.
    dense_relations = list(relations)
    for pair in all_pairs:
        if pair not in present_pairs:
            dense_relations.append([pair[0], pair[1], "None"])

    dense_relations.sort()
    return dense_relations


def get_all_relations(nr_acs_l, grounds_arg, preds_arg):
    """Expand sparse relation lists into dense, sorted per-pair label lists.

    For each sample, every ordered pair ``[i, j]`` with ``i != j`` and both
    indices in ``range(nr_acs)`` that does not already appear in the sample's
    relation list is added with the label ``"None"``. This is done separately
    for the gold and the predicted relations, and each resulting list is sorted.

    The input lists are not modified; new per-sample lists are returned.

    Args:
        nr_acs_l: iterable with one component count per sample.
        grounds_arg: per-sample lists of gold relations, indexed like
            ``nr_acs_l``; each relation is indexable as ``[i, j, label]``.
        preds_arg: per-sample lists of predicted relations, same layout.

    Returns:
        A ``(final_grounds, final_preds)`` tuple of lists holding one dense,
        sorted relation list per sample.
    """
    final_grounds = []
    final_preds = []

    for idx, nr_acs in enumerate(nr_acs_l):
        # All ordered pairs of distinct component indices, in (i, j) order.
        all_pairs = [
            [i, j]
            for i in range(nr_acs)
            for j in range(nr_acs)
            if i != j
        ]

        final_grounds.append(_fill_missing_pairs(grounds_arg[idx], all_pairs))
        final_preds.append(_fill_missing_pairs(preds_arg[idx], all_pairs))

    return final_grounds, final_preds

In [242]:
# Densify gold and predicted relations: one [i, j, label] entry per ordered
# pair of distinct components, per sample.
final_grounds, final_preds = get_all_relations(nr_acs_l, grounds_tmp, preds_tmp)

In [243]:
len(final_grounds), len(final_preds)

(150, 150)

In [244]:
final_grounds[0]

[[0, 1, 'None'],
 [0, 2, 'None'],
 [1, 0, 'None'],
 [1, 2, 'None'],
 [2, 0, 'None'],
 [2, 1, 'reason']]

In [245]:
final_preds[0]

[[0, 1, 'None'],
 [0, 2, 'None'],
 [1, 0, 'None'],
 [1, 2, 'None'],
 [2, 0, 'reason'],
 [2, 1, 'reason']]

In [246]:
# Flatten the per-sample lists into one flat list of gold relations.
# This rebinds final_grounds, so re-running this cell on its own flattens a
# second time; re-run the get_all_relations cell first.
final_grounds = [x for xs in final_grounds for x in xs]

In [247]:
# Flatten the per-sample lists into one flat list of predicted relations.
# This rebinds final_preds, so re-running this cell on its own flattens a
# second time; re-run the get_all_relations cell first.
final_preds = [x for xs in final_preds for x in xs]

In [248]:
len(final_grounds), len(final_preds)

(10328, 10328)

In [249]:
# Keep only the third item (the label) of every relation entry.
# Both names are rebound, so this cell must not be re-run on its own output.
final_grounds = [x[2] for x in final_grounds]
final_preds = [x[2] for x in final_preds]

In [250]:
print(classification_report(final_grounds, final_preds, digits=3))

              precision    recall  f1-score   support

        None      0.979     0.983     0.981     10004
    evidence      0.000     0.000     0.000        26
      reason      0.363     0.342     0.352       298

    accuracy                          0.962     10328
   macro avg      0.447     0.442     0.444     10328
weighted avg      0.959     0.962     0.960     10328



In [48]:
def opposite_acc(component_type):
    """Map a component type to a different component type.

    Args:
        component_type: one of "fact", "value", "policy", "testimony" or
            "reference".

    Returns:
        The replacement label for ``component_type``, or ``None`` when the
        value is not one of the five known types.
    """
    substitutions = (
        ("fact", "value"),
        ("value", "policy"),
        ("policy", "value"),
        ("testimony", "fact"),
        ("reference", "policy"),
    )

    for known_type, replacement in substitutions:
        if component_type == known_type:
            return replacement

    return None


In [49]:
def harmonize_preds_acc(grounds, preds):
    """Return ``preds`` adjusted to the length of ``grounds``.

    When there are at least as many predictions as gold labels, the surplus
    predictions are cut off. Otherwise every missing position is filled with
    ``opposite_acc`` applied to the gold label at that position.

    Args:
        grounds: gold labels of one sample.
        preds: predicted labels of the same sample.

    Returns:
        A list with ``len(grounds)`` items.
    """
    n_preds, n_grounds = len(preds), len(grounds)

    if n_preds >= n_grounds:
        return preds[:n_grounds]

    padding = [opposite_acc(gold_label) for gold_label in grounds[n_preds:]]
    return preds + padding

In [50]:
# Wherever a sample's prediction list differs in length from its gold list,
# replace it (in `preds`, in place) with the harmonized version.
# NOTE(review): harmonize_preds_acc / opposite_acc work on component-type
# strings ("fact", "value", ...), whereas the cells above fill `grounds` and
# `preds` from "relation_types" -- confirm these names hold ACC labels when
# this cell runs.
for i,(x,y) in enumerate(zip(grounds, preds)):

    if len(x) != len(y):

        preds[i] = harmonize_preds_acc(x, y)

In [51]:
# Flatten the per-sample label lists into one flat list each.
task_preds = [label for sample_preds in preds for label in sample_preds]
task_grounds = [label for sample_grounds in grounds for label in sample_grounds]

In [52]:
# sanity check: 
len(task_preds) == len(task_grounds)

True

## Results

In [53]:
print(classification_report(task_grounds, task_preds, digits=3))

              precision    recall  f1-score   support

        fact      0.596     0.750     0.664       132
      policy      0.883     0.889     0.886       153
   reference      1.000     1.000     1.000         1
   testimony      0.922     0.869     0.895       244
       value      0.872     0.835     0.853       496

    accuracy                          0.840      1026
   macro avg      0.855     0.868     0.860      1026
weighted avg      0.850     0.840     0.844      1026



In [54]:
# Build the report BEFORE opening the target file: opening with 'wb' truncates
# it, so computing inside the `with` block would leave an empty pickle behind
# if classification_report raised.
report_dict = classification_report(task_grounds, task_preds, output_dict=True)

# os.path.join matches how the results path is built elsewhere in this notebook.
report_path = os.path.join(OUTPUT_DIR, "classification_report.pickle")

with open(report_path, 'wb') as fh:
    pickle.dump(report_dict, fh)