In [1]:
import json
from pprint import pprint
import random
import re
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import graphviz

## Loading Data

In [3]:
data_path = '../data/20171208_luima.json'

In [4]:
data = json.load(open(data_path))

In [5]:
type(data)

dict

In [6]:
data.keys()

dict_keys(['documents', 'types', 'annotations', 'objects'])

In [7]:
data['documents'][0].keys()

dict_keys(['_id', 'name', 'plainText'])

In [8]:
data['annotations'][0]

{'_id': '5a0dc3747a75cf2bafc0db8d',
 'attributes': [],
 'document': '59d3ed9544a09d7f8e4763d9',
 'end': 211,
 'owner': '58a0bf4f8424bd4f65e2be57',
 'start': 202,
 'type': '5a0ca67d7a75cf2bafc0d50d'}

In [9]:
data['types'][0]

{'_id': '59d3ec8e44a09d7f8e4763c5',
 'attributes': [],
 'isA': '58781cf945f90f3bfc5cba7d',
 'name': 'Citation'}

In [10]:
# move data into id-keyed dictionary
documents = {d['_id']: d for d in data['documents']}
types = {t['_id']: t for t in data['types']}
# move annotations into list
annotations = data['annotations']

In [11]:
# get all sentences assuming every annotation is a sentence
def make_sentence_data(documents, types, annotations):
    sentence_data = []
    for a in annotations:
        start = a['start']
        end = a['end']
        document_txt = documents[a['document']]['plainText']
        atype = a['type']
        sentence_txt = document_txt[start:end]
        sd = {'txt': sentence_txt,
              'type': types[atype]['name']}
        sentence_data.append(sd)
    return sentence_data

In [12]:
sentence_data = make_sentence_data(documents, types, annotations)

In [13]:
# look at examples
for _ in range(3):
    print(random.choice(sentence_data))

{'txt': 'However, the clinical evidence of record fails to show that the Veteran manifested arthritis to a degree of 10 percent within the one year following his active duty service discharge in August 1962.', 'type': 'EvidenceBasedReasoning'}
{'txt': "The examiner did not have any of the Veteran's medical records and offered no opinion as to a medical nexus between the veteran's current left knee disability and his in-service injury.", 'type': 'Evidence'}
{'txt': 'Hence, the Veteran is not shown to be prejudiced by the timing of this notice.', 'type': 'EvidenceBasedReasoning'}


In [14]:
len(sentence_data)

805

# Tokenizing text into individual words

In [15]:
def tokenize(txt):
    
    # split sentences on one or more whitespace characters
    dirty_tokens = re.split(' +', txt)
    
    # remove all non-alphanumeric characters and lowercase everything
    clean_tokens = []
    for t in dirty_tokens:
        clean_tokens.append(re.sub(r'\W', '', t).lower())
    
    # remove empty tokens
    if '' in clean_tokens:
        clean_tokens.remove('')
    
    return clean_tokens

In [16]:
# demo/test the tokenizer
#tokenize("Rather, the report of the Veteran's February 2006 VA examination stated that Veteran did not meet the criteria for PTSD.")

In [17]:
def tokenize_sentence_data(sentence_data):
    for s in sentence_data:
        s['tokens'] = tokenize(s['txt'])

In [18]:
tokenize_sentence_data(sentence_data)

In [19]:
# check post-tokenization
random.choice(sentence_data)

{'tokens': ['the',
  'board',
  'has',
  'first',
  'considered',
  'whether',
  'service',
  'connection',
  'is',
  'warranted',
  'on',
  'a',
  'presumptive',
  'basis'],
 'txt': 'The Board has first considered whether service connection is warranted on a presumptive basis.',
 'type': 'EvidenceBasedReasoning'}

# Building a vocabulary list

In [20]:
def build_vocabulary(sentences_data):
    vocabulary = set()
    for sd in sentences_data:
        vocabulary = vocabulary | set(tokenize(sd['txt']))
    return sorted(list(vocabulary))

In [21]:
vocabulary = build_vocabulary(sentence_data)

In [22]:
# look at vocabulary
# vocabulary

In [23]:
# test word
vocabulary.index('veteran')

2020

In [24]:
# vocabulary size
len(vocabulary)

2092

# Featurizing text

In [25]:
def featurize_sentence(s, vocabulary):
    features = []
    
    # binary features
    for v in vocabulary:
        if v in s['tokens']:
            features.append(1)
        else:
            features.append(0)
    featurized = {'txt': s['txt'],
                  'features': features,
                  'target': s['type']}
    return featurized
    
def featurize_sentence_data(sentence_data, vocabulary):
    return list(map(lambda s: featurize_sentence(s, vocabulary), sentence_data))

In [26]:
sentence_data_featurized = featurize_sentence_data(sentence_data, vocabulary)

In [27]:
#random.choice(sentence_data_featurized)

# Splitting our dataset

In [28]:
X = [fs['features'] for fs in sentence_data_featurized]
y = [fs['target'] for fs in sentence_data_featurized]

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=12345)

# Training a decision tree classifier

In [30]:
dtc = tree.DecisionTreeClassifier(max_depth=12)
clf = dtc.fit(X_train, y_train)

In [31]:
clf.classes_

array(['Citation', 'ConclusionOfLaw', 'Evidence', 'EvidenceBasedFinding',
       'EvidenceBasedReasoning', 'Header', 'LegalPolicy', 'LegalRule',
       'Procedure'],
      dtype='<U22')

## Output Decision Tree

In [32]:
dot_data = tree.export_graphviz(clf,
                                out_file=None,
                                feature_names=vocabulary,
                                class_names=clf.classes_,
                                rounded=True, 
                                filled=True)
graph = graphviz.Source(dot_data)
graph.render("bva-single")
#graph

'bva-single.pdf'

## Summarize Results

In [33]:
print(classification_report(y_train, clf.predict(X_train)))

                        precision    recall  f1-score   support

              Citation       1.00      0.99      0.99        97
       ConclusionOfLaw       0.92      0.61      0.73        18
              Evidence       0.84      0.94      0.89       179
  EvidenceBasedFinding       0.85      0.80      0.83        56
EvidenceBasedReasoning       0.76      0.72      0.74        82
                Header       1.00      0.43      0.60        54
           LegalPolicy       1.00      0.50      0.67        12
             LegalRule       0.92      0.88      0.90        78
             Procedure       0.61      0.88      0.72        68

           avg / total       0.86      0.84      0.83       644



In [34]:
print(classification_report(y_test, clf.predict(X_test)))

                        precision    recall  f1-score   support

              Citation       1.00      0.97      0.98        31
       ConclusionOfLaw       0.00      0.00      0.00         3
              Evidence       0.67      0.79      0.72        38
  EvidenceBasedFinding       0.14      0.09      0.11        11
EvidenceBasedReasoning       0.44      0.29      0.35        24
                Header       1.00      0.31      0.47        13
           LegalPolicy       0.67      0.40      0.50         5
             LegalRule       0.48      0.68      0.57        19
             Procedure       0.50      0.76      0.60        17

           avg / total       0.64      0.62      0.60       161



## Test Functions

In [38]:
def predict_sentence(txt, vocabulary, clf):
    s = {'txt': txt,
         'tokens': tokenize(txt),
         'type': 'unknown'}
    x = featurize_sentence(s, vocabulary)
    label = clf.predict([x['features']])
    return label

In [51]:
def false_predictions(sentence_data_featurized):
    for s in sentence_data_featurized:
        prediction = predict_sentence(s['txt'], vocabulary, clf)[0]
        true_label = s['target']
        if not prediction == true_label:
            print(s['txt'] + ' - true: ' + str(true_label) + '; predicted: ' + str(prediction))
            print()

## Error Analysis

In [None]:
false_predictions(sentence_data_featurized)

In [None]:
predict_sentence("The requirements for service connection have been met.", vocabulary, clf)