In [1]:
import argparse
import os
import sys
import numpy as np

from textacy.datasets.supreme_court import SupremeCourt
from pytorch_pretrained_bert import BertModel, BertTokenizer
import torch
import torch.nn as nn

In [2]:
sc = SupremeCourt()
# sc.download()
print('sc.info: ', sc.info)

sc.info:  {'name': 'supreme_court', 'site_url': 'http://caselaw.findlaw.com/court/us-supreme-court', 'description': 'Collection of ~8.4k decisions issued by the U.S. Supreme Court between November 1946 and June 2016.'}


In [3]:
sc.issue_area_codes.keys()

dict_keys([-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

In [4]:
issue_codes = list(sc.issue_area_codes.keys()) # 15 labels
issue_codes.sort()
issue_codes = [str(ic) for ic in issue_codes]
# issue_codes

# dictionary mapping label name to numeric id
labels_index = dict(zip(issue_codes, np.arange(len(issue_codes))))
labels_index

{'-1': 0,
 '1': 1,
 '2': 2,
 '3': 3,
 '4': 4,
 '5': 5,
 '6': 6,
 '7': 7,
 '8': 8,
 '9': 9,
 '10': 10,
 '11': 11,
 '12': 12,
 '13': 13,
 '14': 14}

In [5]:
# Take a look at the format of the data.

tempRecord = next(sc.records())
type(tempRecord)

print('--------- The format of one record ---------')
print('length: ', len(tempRecord))
print('tempRecord[0] is the text: ', type(tempRecord[0]))
print('tempRecord[1] is the dict: ', tempRecord[1])

--------- The format of one record ---------
length:  2
tempRecord[0] is the text:  <class 'str'>
tempRecord[1] is the dict:  {'issue': '80180', 'issue_area': 8, 'n_min_votes': 1, 'case_name': 'HALLIBURTON OIL WELL CEMENTING CO. v. WALKER et al., DOING BUSINESS AS DEPTHOGRAPH CO.', 'maj_opinion_author': 78, 'decision_date': '1946-11-18', 'decision_direction': 'liberal', 'n_maj_votes': 8, 'us_cite_id': '329 U.S. 1', 'argument_date': '1946-01-09'}


In [6]:
texts = []  # list of text samples
labels = []  # list of label ids

for record in sc.records():
#     print(type(record))
#     print(len(record))
#     print(type(record[0]))
#     print(type(record[1]))
    if record[1]['issue'] == None: # some cases have None as an issue
        labels.append(labels_index['-1'])
    else:
#         print(labels_index[record[1]['issue'][:-4]])
        labels.append(labels_index[record[1]['issue'][:-4]])
    texts.append(record[0])

print('Found %s texts.' % len(texts))
print('Found %s labels.' % len(set(labels)))

Found 8419 texts.
Found 15 labels.


In [7]:
type(texts)
type(labels)

list

Load Bert in the following part.

In [28]:
# meta
training_epochs = 10
batch_size = 10
training_ratio = 0.8
doc_length = 500
dim = 768
n_classes = 15
# model
model = BertModel.from_pretrained('bert-base-cased')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
classifier = nn.Linear(doc_length * dim, n_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(list(model.parameters()) +
                             list(classifier.parameters()))

The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [29]:
# texts[0]

In [30]:
# len(tokenizer.tokenize(texts[0]))

In [31]:
temp_texts = texts[:10]

In [32]:
# Here we only take the first doc_length tokens.
segmented_texts = [tokenizer.tokenize(doc)[: doc_length - 1] + ['[SEP]'] * (
    doc_length - len(tokenizer.tokenize(doc))) + ['[SEP]'] for doc in temp_texts]
texts_ids = [tokenizer.convert_tokens_to_ids(doc) for doc in segmented_texts]
texts_ids_batches = [
    texts_ids[i * batch_size: (i + 1) * batch_size] for i in range(len(texts_ids) // batch_size)]
labels_batches = [torch.tensor(labels[i * batch_size: (i + 1) * batch_size],
                               dtype=torch.long) for i in range(len(labels) // batch_size)]

In [34]:
print(len(texts_ids))

10


In [35]:

texts_ids

[[164,
  1944,
  13292,
  12549,
  1320,
  9105,
  2119,
  24664,
  1880,
  1158,
  3291,
  119,
  191,
  119,
  4575,
  1828,
  119,
  4008,
  18757,
  1830,
  11157,
  117,
  1104,
  6809,
  117,
  23330,
  1233,
  119,
  113,
  3466,
  140,
  119,
  6284,
  1830,
  117,
  1104,
  1994,
  117,
  141,
  119,
  140,
  119,
  117,
  1113,
  1103,
  4094,
  114,
  117,
  1111,
  10077,
  1200,
  119,
  1828,
  119,
  6587,
  160,
  119,
  3895,
  15809,
  117,
  1104,
  2238,
  2460,
  117,
  11917,
  119,
  117,
  1111,
  6297,
  9857,
  119,
  1828,
  119,
  3302,
  139,
  10783,
  1658,
  2428,
  4653,
  1103,
  4893,
  1104,
  1103,
  2031,
  119,
  140,
  4047,
  2821,
  153,
  119,
  4575,
  117,
  3172,
  1104,
  16653,
  1302,
  119,
  123,
  117,
  17801,
  117,
  4062,
  1580,
  117,
  1105,
  1103,
  1168,
  6297,
  9857,
  117,
  5941,
  1279,
  1223,
  1103,
  8581,
  117,
  1814,
  1142,
  4228,
  1107,
  170,
  2877,
  1629,
  2175,
  26099,
  1115,
  10077,
  1200,
  117,

In [48]:
texts_ids_batches = texts_ids_batches[0]
# print(len(texts_ids_batches[0]))

In [49]:
# segmented_texts[0][400:]

In [50]:
training_text_batches = texts_ids_batches[: int(
    training_ratio * len(texts_ids_batches))]
training_label_batches = labels_batches[: int(
    training_ratio * len(labels_batches))]
test_text_batches = texts_ids_batches[int(
    training_ratio * len(texts_ids_batches)):]
test_label_batches = labels_batches[int(training_ratio * len(labels_batches)):]

In [52]:
# training_text_batches
print(len(training_text_batches))
# texts_ids_batches

8


In [53]:
# train
for _ in range(training_epochs):
#     print(len(training_text_batches))
    for batchid in range(len(training_text_batches)):
        texts_embeddings = torch.tensor(
            training_text_batches[batchid], dtype=torch.long)
        # segment_embeddings = torch.zeros_like(texts_embeddings)
        # print(segment_embeddings.size(), texts_embeddings.size())
        enc_out, _ = model(texts_embeddings, None)
        print(enc_out)
#         print(1)
        enc_out = enc_out[-1]
        predictions = torch.softmax(classifier(
            enc_out.view(enc_out.size(0), -1)), -1)
        loss = criterion(predictions, training_label_batches[batchid])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [41]:
# test
agreed = 0
total = 0
for batchid in range(len(test_text_batches)):
    texts_embeddings = model.embeddings(torch.tensor(
        test_text_batches[batchid], dtype=torch.long)).squeeze(1)
    enc_out, _ = model(texts_embeddings, None)
    enc_out = enc_out[-1]
    predictions = torch.softmax(classifier(
        enc_out.view(enc_out.size(0), -1)), -1)
    max_p, max_i = torch.max(predictions, -1)
    agreed += (max_i == test_label_batches).sum().data.numpy().tolist()
    total += batch_size
print('Test Accuracy: {}'.format(agreed / total))


RuntimeError: The expanded size of the tensor (768) must match the existing size (500) at non-singleton dimension 2.  Target sizes: [10, 500, 768].  Tensor sizes: [1, 500]