## TR question 2

Note: This requires running tr_q1 first

The order of the notebook is as follow, 
- Split train val test class
- Define a baseline
- Train model

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from typing import Dict
import pandas as pd
import os
from transformers import Trainer, TrainingArguments

from datasets import Dataset

seed = 42
MAX_LENGTH = 512
NUM_DATAPOINT = 100

  from .autonotebook import tqdm as notebook_tqdm


I take the headtext and firsts characters of each paragraph and concatenatem together, such that each row in the dataset are representend by `MAX_LENGTH` characters

### Create a dataset

In [2]:
file_path = 'TRDataChallenge2023.zip'
extract_file_path = 'TRDataChallenge2023'
df = pd.read_json(os.path.join(extract_file_path, f"{extract_file_path}.txt"), lines=True)
df = df.sample(231, random_state=42).reset_index(drop=True)

In [3]:
df

Unnamed: 0,documentId,postures,sections
0,I03d70fc0ce3311e8ae6bb4b0ae8dca5a,[Motion to Dismiss for Lack of Subject Matter ...,"[{'headtext': '', 'paragraphs': ['RANDOLPH, Se..."
1,I07f7893067c611ea9354eec9e02fecda,[Appellate Review],"[{'headtext': 'MEMORANDUM AND ORDER', 'paragra..."
2,Icf2416a2abce11e6b92bf4314c15140f,"[Motion to Renew, On Appeal]","[{'headtext': '', 'paragraphs': ['Appeal from ..."
3,Ic4f83620286111e8b25db53553f40f1b,[Review of Administrative Decision],"[{'headtext': 'MEMORANDUM AND JUDGMENT', 'para..."
4,Ia1075bf0779411e998e8870e22e55653,"[Appellate Review, Post-Trial Hearing Motion, ...","[{'headtext': 'OPINION', 'paragraphs': ['A pro..."
...,...,...,...
226,I71667ece6ba311e2a531ef6793d44951,[Review of Administrative Decision],"[{'headtext': '', 'paragraphs': ['Appeal from ..."
227,I6c50c6d0528211e9bed9c2929f452c46,"[On Appeal, Petition to Terminate Parental Rig...","[{'headtext': 'MEMORANDUM OPINION', 'paragraph..."
228,I428bd080630b11e98c7a8e995225dbf9,[Review of Administrative Decision],"[{'headtext': 'OPINION AND ORDER ', 'paragraph..."
229,I1b207830a00a11e888e382e865ea2ff8,"[Motion for Contempt, On Appeal]","[{'headtext': '', 'paragraphs': ['[¶1] Frank A..."


Normally we `fit_transform` in the train set and `transform` on the test set. However here I `fit_transform` in the whole dataset to cover all of the labels, because some of them only have one instance (See first notebook)

In [4]:
mlb = MultiLabelBinarizer()
labels = pd.Series(np.array(mlb.fit_transform(df["postures"].values), dtype="float").tolist(), name="label_ids")

In [5]:
df['label_ids'] = labels

In [6]:
def clean_up_strings(max_len, sections):
    """
    Remove the \\u and clean up texts
    """
    cleaned_sections = []
    chars_per_section = max_len // len(sections)
    for section in sections:            
        cleaned_text = ""
        headtext = [section['headtext'].encode("ascii", "ignore").decode().strip()]
        cleaned_paragraph = [paragraph.encode("ascii", "ignore").decode().strip() for paragraph in section['paragraphs']]
        cleaned_text += ". ".join(headtext + cleaned_paragraph)        
        
        if (len(cleaned_text) < chars_per_section):
            cleaned_sections.append(cleaned_text[:len(cleaned_text)])
        else:
            last_space_index = cleaned_text[:chars_per_section].rfind(' ')
            cleaned_sections.append(cleaned_text[:last_space_index])  # last element that is a space

    cleaned_sections = '. '.join(cleaned_sections)

    return cleaned_sections

In [7]:
def test_clean_up_strings():
    # Test the function with a basic scenario
    max_len = 50
    sections = [
        {
            'headtext': "Sample Headline",
            'paragraphs': ["This is the first paragraph."]
        },
        {
            'headtext': "Sample Headline",
            'paragraphs': ["Second paragraph."]
        }
    ]
    cleaned_sections = clean_up_strings(max_len, sections)    
    expected_result = 'Sample Headline. This is. Sample Headline. Second'    
    assert cleaned_sections == expected_result

test_clean_up_strings()

In [8]:
df.isna().sum()

documentId    0
postures      0
sections      0
label_ids     0
dtype: int64

In [9]:
df["cleaned_text"] = df['sections'].map(lambda x: clean_up_strings(MAX_LENGTH, x))

In [10]:
df

Unnamed: 0,documentId,postures,sections,label_ids,cleaned_text
0,I03d70fc0ce3311e8ae6bb4b0ae8dca5a,[Motion to Dismiss for Lack of Subject Matter ...,"[{'headtext': '', 'paragraphs': ['RANDOLPH, Se...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",". RANDOLPH, Senior Circuit Judge: Frank Palaci..."
1,I07f7893067c611ea9354eec9e02fecda,[Appellate Review],"[{'headtext': 'MEMORANDUM AND ORDER', 'paragra...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","MEMORANDUM AND ORDER. In 2013, defendant plead..."
2,Icf2416a2abce11e6b92bf4314c15140f,"[Motion to Renew, On Appeal]","[{'headtext': '', 'paragraphs': ['Appeal from ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",". Appeal from an order of the Supreme Court, K..."
3,Ic4f83620286111e8b25db53553f40f1b,[Review of Administrative Decision],"[{'headtext': 'MEMORANDUM AND JUDGMENT', 'para...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",MEMORANDUM AND JUDGMENT. Proceeding pursuant t...
4,Ia1075bf0779411e998e8870e22e55653,"[Appellate Review, Post-Trial Hearing Motion, ...","[{'headtext': 'OPINION', 'paragraphs': ['A pro...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",OPINION. A prosecutor. I.BACKGROUND. After a. ...
...,...,...,...,...,...
226,I71667ece6ba311e2a531ef6793d44951,[Review of Administrative Decision],"[{'headtext': '', 'paragraphs': ['Appeal from ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",. Appeal from a decision of the Workers' Compe...
227,I6c50c6d0528211e9bed9c2929f452c46,"[On Appeal, Petition to Terminate Parental Rig...","[{'headtext': 'MEMORANDUM OPINION', 'paragraph...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",MEMORANDUM OPINION. Father appeals the termina...
228,I428bd080630b11e98c7a8e995225dbf9,[Review of Administrative Decision],"[{'headtext': 'OPINION AND ORDER ', 'paragraph...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",OPINION AND ORDER. Plaintiff Sherry Ann Keiper...
229,I1b207830a00a11e888e382e865ea2ff8,"[Motion for Contempt, On Appeal]","[{'headtext': '', 'paragraphs': ['[¶1] Frank A...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",. [1] Frank A. Deede challenges an order denyi...


### Create a dataset

In [11]:
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')
model = AutoModelForSequenceClassification.from_pretrained('poltextlab/xlm-roberta-large-english-legal-cap',
                                                           num_labels= len(mlb.classes_),
                                                           problem_type="multi_label_classification",
                                                           ignore_mismatched_sizes=True)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at poltextlab/xlm-roberta-large-english-legal-cap and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([22, 1024]) in the checkpoint and torch.Size([52, 1024]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([22]) in the checkpoint and torch.Size([52]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def tokenize(examples):
    encoding = tokenizer(examples["cleaned_text"], padding="max_length", truncation=True, max_length=512)    
    
    return encoding

In [13]:

train_dataset = Dataset.from_pandas(df)
tokenized_datasets = train_dataset.map(tokenize)
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2, seed=seed)


Map:   0%|          | 0/231 [00:00<?, ? examples/s]

Map: 100%|██████████| 231/231 [00:00<00:00, 1374.85 examples/s]


### Train

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


In [15]:

training_args = TrainingArguments(
    per_device_train_batch_size=3,
    output_dir='./output', 
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir='./logs',
    seed=seed
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

trainer.train()

  0%|          | 0/186 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  9%|▉         | 17/186 [05:21<1:02:26, 22.17s/it]

### Evaluation

#### From model

In [None]:
results = trainer.predict(tokenized_datasets["test"])

100%|██████████| 1/1 [00:00<?, ?it/s]


In [None]:
values = np.argmax(results.predictions, axis=1)
n_values = len(mlb.classes_)
prediction = np.eye(n_values)[values]

In [None]:
f1_score(y_true=tokenized_datasets["test"]['label_ids'], y_pred=prediction, average='weighted')

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.0

#### From a Baseline
We noticed from 1st question
- Most common class: Appellate Review
- Most common number of labels: 1

Therefore, the baseline would be to predict everything with "Appellate Review"

In [None]:
baseline_pred = mlb.transform([['Appellate Review']] * len(tokenized_datasets["test"]))

In [None]:
multilabel_confusion_matrix(y_true=tokenized_datasets["test"]['label_ids'], y_pred=baseline_pred)

array([[[0, 1],
        [0, 1]],

       [[2, 0],
        [0, 0]],

       [[2, 0],
        [0, 0]],

       [[2, 0],
        [0, 0]],

       [[2, 0],
        [0, 0]],

       [[1, 0],
        [1, 0]]], dtype=int64)

In [None]:
f1_score(y_true=tokenized_datasets["test"]['label_ids'], y_pred=baseline_pred, average='weighted')

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.3333333333333333