In [1]:
import pickle

cases = pickle.load(open("processedCases.pickle", "rb"))

justiceId = "antonin_scalia"

justiceCases = [x for x in cases if justiceId in x["votes"].keys()]

In [2]:
len(justiceCases)

1830

In [3]:
len(cases)

2938

In [4]:
justiceCases[0]

{'plantiff': 'United States',
 'defendant': 'Cotton',
 'question': "Does the omission from a federal indictment of a fact that enhances the statutory maximum sentence justify a Court of Appeals' vacating the enhanced sentence, even though the defendant did not object in the trial court?",
 'facts': 'A federal grand jury returned an indictment charging Leonard defendant and others with conspiracy to distribute and to possess with intent to distribute a detectable amount of cocaine and cocaine base. After a jury convicted them, defendant and the others received a sentence based on the District Court\'s finding of drug quantity of at least 50 grams of cocaine base, which implicated certain enhanced penalties. They did not object in the District Court to the fact that the sentences were based on a quantity not alleged in the indictment. While their appeal was pending, the U.S. Supreme Court decided, in Apprendi v. New Jersey, 530 U.S. 466, that "other than the fact of a prior conviction, a

In [5]:
data = [{"question": x["question"], "facts": x["facts"], "vote": x["votes"][justiceId]} for x in justiceCases]
#data = [{"question": x["question"], "facts": x["facts"], "vote": x["inFavorPlantiff"]} for x in cases]

In [6]:
import pandas as pd

df = pd.DataFrame(data)

In [7]:
df.head()

Unnamed: 0,question,facts,vote
0,Does the omission from a federal indictment of...,A federal grand jury returned an indictment ch...,plantiff
1,"Does the ministerial exception, which prohibit...",Cheryl Perich filed a lawsuit against the plan...,plantiff
2,Is there a conceivable rational basis justifyi...,Section 602(7)(B) of the Cable Communications ...,plantiff
3,Did the Constable's action infringe upon defen...,Ardith defendant was a clerical employee in th...,plantiff
4,"Does the Court of Federal Claims, under the In...","Under Public Law 86-392, the former Fort Apach...",plantiff


In [8]:
df["text"] = df["question"] + " [SEP] " + df["facts"]

In [9]:
df["label"] = [1 if x == "plantiff" else 0 for x in df["vote"]]

In [10]:
df.head()

Unnamed: 0,question,facts,vote,text,label
0,Does the omission from a federal indictment of...,A federal grand jury returned an indictment ch...,plantiff,Does the omission from a federal indictment of...,1
1,"Does the ministerial exception, which prohibit...",Cheryl Perich filed a lawsuit against the plan...,plantiff,"Does the ministerial exception, which prohibit...",1
2,Is there a conceivable rational basis justifyi...,Section 602(7)(B) of the Cable Communications ...,plantiff,Is there a conceivable rational basis justifyi...,1
3,Did the Constable's action infringe upon defen...,Ardith defendant was a clerical employee in th...,plantiff,Did the Constable's action infringe upon defen...,1
4,"Does the Court of Federal Claims, under the In...","Under Public Law 86-392, the former Fort Apach...",plantiff,"Does the Court of Federal Claims, under the In...",1


In [11]:
df.describe()

Unnamed: 0,label
count,1830.0
mean,0.575956
std,0.494332
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

In [13]:
from datasets import Dataset

dataset = Dataset.from_pandas(df[["text", "label"]])

In [14]:
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1830
})

In [15]:
dataset = dataset.train_test_split(test_size=0.2)

In [16]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512, add_special_tokens=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

100%|██████████| 2/2 [00:00<00:00,  5.69ba/s]
100%|██████████| 1/1 [00:00<00:00, 17.19ba/s]


In [17]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("nlpaueb/legal-bert-base-uncased", num_labels=2)

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [18]:
from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer", num_train_epochs=10, per_device_train_batch_size=8)

In [19]:
from transformers import Trainer

trainer = Trainer(
    model=model, args=training_args, train_dataset=tokenized_datasets["train"], eval_dataset=tokenized_datasets["test"]
)

trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 1464
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1830
 27%|██▋       | 500/1830 [02:34<06:53,  3.21it/s]Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json


{'loss': 0.6977, 'learning_rate': 3.633879781420765e-05, 'epoch': 2.73}


Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
 55%|█████▍    | 1000/1830 [05:20<04:42,  2.94it/s]Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json


{'loss': 0.6939, 'learning_rate': 2.2677595628415303e-05, 'epoch': 5.46}


Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
 82%|████████▏ | 1500/1830 [08:11<01:52,  2.93it/s]Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json


{'loss': 0.6919, 'learning_rate': 9.016393442622952e-06, 'epoch': 8.2}


Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
100%|██████████| 1830/1830 [10:02<00:00,  2.89it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 1830/1830 [10:02<00:00,  3.04it/s]

{'train_runtime': 602.3004, 'train_samples_per_second': 24.307, 'train_steps_per_second': 3.038, 'train_loss': 0.6933009580184853, 'epoch': 10.0}





TrainOutput(global_step=1830, training_loss=0.6933009580184853, metrics={'train_runtime': 602.3004, 'train_samples_per_second': 24.307, 'train_steps_per_second': 3.038, 'train_loss': 0.6933009580184853, 'epoch': 10.0})

In [20]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits = eval_pred[0]
    labels = eval_pred[1]
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [21]:
preds = trainer.predict(tokenized_datasets["test"])

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 366
  Batch size = 8
 98%|█████████▊| 45/46 [00:04<00:00,  8.71it/s]

In [22]:
compute_metrics(preds)

{'accuracy': 0.6010928961748634}

In [23]:
preds[0]

array([[-0.1670308 ,  0.14468601],
       [-0.16703062,  0.1446866 ],
       [-0.1670308 ,  0.14468594],
       [-0.16703025,  0.14468557],
       [-0.16703108,  0.14468566],
       [-0.16703057,  0.14468575],
       [-0.16703111,  0.14468518],
       [-0.16703083,  0.14468575],
       [-0.16703081,  0.14468549],
       [-0.16703105,  0.1446856 ],
       [-0.16703068,  0.14468612],
       [-0.16703066,  0.14468548],
       [-0.16703074,  0.1446858 ],
       [-0.1670307 ,  0.1446859 ],
       [-0.16703062,  0.144686  ],
       [-0.16703127,  0.14468554],
       [-0.1670307 ,  0.14468616],
       [-0.16703083,  0.1446856 ],
       [-0.16703063,  0.14468625],
       [-0.16703087,  0.14468625],
       [-0.16703041,  0.14468606],
       [-0.16703083,  0.1446855 ],
       [-0.16703068,  0.1446858 ],
       [-0.16703005,  0.14468598],
       [-0.1670307 ,  0.14468604],
       [-0.16703066,  0.1446853 ],
       [-0.16703106,  0.14468555],
       [-0.16703047,  0.14468612],
       [-0.1670303 ,

In [24]:
idx = 2

# Print the original sentence.
print('Original: ', dataset["train"][idx]["text"])
print()
# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(dataset["train"][idx]["text"], padding="max_length", truncation=True, max_length=512, add_special_tokens=True))
print()
# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(dataset["train"][idx]["text"], padding="max_length", truncation=True, max_length=512, add_special_tokens=True)))


Original:  Can a federal district court conduct an independent review of the evidence of a state court’s finding of a constitutional aggravating circumstance?
Did Arizona’s construction of the “especially heinous…or depraved” aggravating circumstance contravene Supreme Court precedent? [SEP] In May of 1976, Jimmy Wayne defendant and his girlfriend, Penelope Cheney, were arrested for possession of narcotics and receipt of stolen property. defendant posted bond for Cheney but was unable to post bond for himself. While in jail, defendant learned that Cheney was cooperating with the police. He offered another inmate money to kill Cheney, but a detention officer seized the note. defendant was released on bond in October of 1976. He quickly contacted Cheney and invited her to his motel room to give her heroin. When Doris Van der Veer, the woman with whom defendant had been living since his release from prison, entered the room a few hours later, she saw Cheney comatose on the bed and defenda