In [11]:
import json
from pprint import pprint
import random
import re
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import graphviz

## Loading Data

In [12]:
data_path = '../data/20171208_luima.json'

In [13]:
data = json.load(open(data_path))

In [14]:
type(data)

dict

In [15]:
data.keys()

dict_keys(['documents', 'types', 'annotations', 'objects'])

In [16]:
data['documents'][0].keys()

dict_keys(['_id', 'name', 'plainText'])

In [17]:
data['annotations'][0]

{'_id': '5a0dc3747a75cf2bafc0db8d',
 'attributes': [],
 'document': '59d3ed9544a09d7f8e4763d9',
 'end': 211,
 'owner': '58a0bf4f8424bd4f65e2be57',
 'start': 202,
 'type': '5a0ca67d7a75cf2bafc0d50d'}

In [18]:
data['types'][0]

{'_id': '59d3ec8e44a09d7f8e4763c5',
 'attributes': [],
 'isA': '58781cf945f90f3bfc5cba7d',
 'name': 'Citation'}

In [19]:
# move data into id-keyed dictionary
documents = {d['_id']: d for d in data['documents']}
types = {t['_id']: t for t in data['types']}
# move annotations into list
annotations = data['annotations']

In [20]:
# get all sentences assuming every annotation is a sentence
def make_sentence_data(documents, types, annotations):
    sentence_data = []
    for a in annotations:
        start = a['start']
        end = a['end']
        document_txt = documents[a['document']]['plainText']
        atype = a['type']
        sentence_txt = document_txt[start:end]
        sd = {'txt': sentence_txt,
              'type': types[atype]['name']}
        sentence_data.append(sd)
    return sentence_data

In [21]:
sentence_data = make_sentence_data(documents, types, annotations)

In [22]:
# look at examples
for _ in range(3):
    print(random.choice(sentence_data))

{'txt': "However, when the record contains a recent diagnosis of disability prior to the Veteran's filing of a claim for benefits based on that disability, the report of the diagnosis is relevant evidence that the Board must address in determining whether a current disability existed at the time the claim was filed or during its pendency.", 'type': 'LegalRule'}
{'txt': "This case was previously before the Board in November 2010, when it remanded the Veteran's claim in order to solicit additional information from the Veteran regarding a psychological evaluation completed in conjunction with a domestic dispute in May 1993.", 'type': 'Procedure'}
{'txt': 'Dingess/Hartman v. \nNicholson, 19 Vet. App. 473 (2006); 38 U.S.C.A. §§ 5100, \n5102, 5103, 5103A, 5106, 5107; 38 C.F.R. §§ 3.159, 3.326; see \nalso Pelegrini v. Principi, 18 Vet. App. 112, 120-21 (2004) \n(Pelegrini II).', 'type': 'Citation'}


In [23]:
len(sentence_data)

805

# Tokenizing text into individual words

In [24]:
def tokenize(txt):
    
    # split sentences on one or more whitespace characters
    dirty_tokens = re.split(' +', txt)
    
    # remove all non-alphanumeric characters and lowercase everything
    clean_tokens = []
    for t in dirty_tokens:
        clean_tokens.append(re.sub(r'\W', '', t).lower())
    
    # remove empty tokens
    if '' in clean_tokens:
        clean_tokens.remove('')
    
    return clean_tokens

In [25]:
# demo/test the tokenizer
#tokenize("Rather, the report of the Veteran's February 2006 VA examination stated that Veteran did not meet the criteria for PTSD.")

In [26]:
def tokenize_sentence_data(sentence_data):
    for s in sentence_data:
        s['tokens'] = tokenize(s['txt'])

In [27]:
tokenize_sentence_data(sentence_data)

In [28]:
# check post-tokenization
#random.choice(sentence_data)

# Building a vocabulary list

In [29]:
def build_vocabulary(sentences_data):
    vocabulary = set()
    for sd in sentences_data:
        vocabulary = vocabulary | set(tokenize(sd['txt']))
    return sorted(list(vocabulary))

In [30]:
vocabulary = build_vocabulary(sentence_data)

In [31]:
# look at vocabulary
# vocabulary

In [32]:
# test word
vocabulary.index('veteran')

2020

In [33]:
# vocabulary size
len(vocabulary)

2092

# Featurizing text

In [34]:
def featurize_sentence(s, all_s, vocabulary):
    features = []
    
    # binary features
    for v in vocabulary:
        if v in s['tokens']:
            features.append(1)
        else:
            features.append(0)
    featurized = {'features': features,
                  'target': s['type']}
    return featurized
    
def featurize_sentence_data(sentence_data, vocabulary):
    return list(map(lambda s: featurize_sentence(s, sentence_data, vocabulary), sentence_data))

In [35]:
sentence_data_featurized = featurize_sentence_data(sentence_data, vocabulary)

In [37]:
#random.choice(sentence_data_featurized)

# Splitting our dataset

In [40]:
X = [fs['features'] for fs in sentence_data_featurized]
y = [fs['target'] for fs in sentence_data_featurized]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=12345)

# Training a decision tree classifier

In [42]:
dtc = tree.DecisionTreeClassifier(max_depth=12)
clf = dtc.fit(X_train, y_train)

In [43]:
clf.classes_

array(['Citation', 'ConclusionOfLaw', 'Evidence', 'EvidenceBasedFinding',
       'EvidenceBasedReasoning', 'Header', 'LegalPolicy', 'LegalRule',
       'Procedure'],
      dtype='<U22')

## Output Decision Tree

In [44]:
dot_data = tree.export_graphviz(clf,
                                out_file=None,
                                feature_names=vocabulary,
                                class_names=clf.classes_,
                                rounded=True, 
                                filled=True)
graph = graphviz.Source(dot_data)
graph.render("bva-single")
#graph

'bva-single.pdf'

In [45]:
print(classification_report(y_train, clf.predict(X_train)))

                        precision    recall  f1-score   support

              Citation       1.00      0.99      0.99        97
       ConclusionOfLaw       0.92      0.61      0.73        18
              Evidence       0.84      0.94      0.89       179
  EvidenceBasedFinding       0.82      0.80      0.81        56
EvidenceBasedReasoning       0.78      0.72      0.75        82
                Header       1.00      0.43      0.60        54
           LegalPolicy       1.00      0.50      0.67        12
             LegalRule       0.92      0.88      0.90        78
             Procedure       0.61      0.88      0.72        68

           avg / total       0.86      0.84      0.83       644



In [228]:
print(classification_report(y_test, clf.predict(X_test)))

                        precision    recall  f1-score   support

              Citation       1.00      0.97      0.98        31
       ConclusionOfLaw       0.00      0.00      0.00         3
              Evidence       0.67      0.82      0.74        38
  EvidenceBasedFinding       0.38      0.27      0.32        11
EvidenceBasedReasoning       0.64      0.29      0.40        24
                Header       1.00      0.31      0.47        13
           LegalPolicy       0.67      0.40      0.50         5
             LegalRule       0.48      0.68      0.57        19
             Procedure       0.50      0.76      0.60        17

           avg / total       0.68      0.64      0.63       161

