# Creating "word fields" for important parts-of-speech

In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint

from umap import UMAP
from sklearn.cluster import OPTICS
from sklearn.decomposition import PCA

from extract import *

# Load the pickled feature records. pickle can execute arbitrary code on load,
# so this file must come from a trusted source.
with open('features/version1.pkl', 'rb') as p:
    features = pickle.load(p)

# Word -> vector lookup used throughout the notebook.
word2vec = LegalVectors()

# One entry per part-of-speech key, each holding a word list and a parallel
# list of vectors.
vocabulary = {pos: {'word': [], 'vec': []} for pos in ('N', 'V', 'S', 'A')}
# phrase2vec = {}

In [2]:
# Vocabulary key -> name of the attribute (on a symbolic-features object)
# that supplies the words for that key.
pos_attrs = (('N', 'nouns'), ('V', 'verbs'), ('S', 'subjs'), ('A', 'amods'))

# Sets mirroring the word lists, so the "already collected?" test is a hash
# lookup instead of a scan over an ever-growing list. Seeding them from the
# current lists keeps the cell safe to re-run.
seen = {pos: set(vocabulary[pos]['word']) for pos, _ in pos_attrs}

for chf in features:
    # The context comes first, then each answer choice in order, so words are
    # recorded in first-seen order.
    for sym in (chf.context_symbolic, *chf.choices_symbolic):
        for pos, attr in pos_attrs:
            for w in getattr(sym, attr):
                if w in seen[pos]:
                    continue
                # Look the vector up before touching the lists: if the lookup
                # fails, 'word' and 'vec' stay aligned.
                vec = word2vec[w]
                seen[pos].add(w)
                vocabulary[pos]['word'].append(w)
                vocabulary[pos]['vec'].append(vec)

In [3]:
def L1_SOW(vectors):
    """
    Sum a stack of word vectors and scale the result to unit length.

    Arguments:
        :vectors: array-like with one vector per row, shape
                  (n_words, n_dimensions) -- assumed layout, TODO confirm
    Returns:
        Array of shape (n_dimensions,): the row-wise sum divided by its
        Euclidean norm (the `np.linalg.norm` default). If the sum is the
        zero vector it is returned as is, to avoid a division by zero.
    """
    # Sum over the word axis so the result is still a vector, not a scalar.
    sum_ = np.sum(vectors, axis=0)
    norm = np.linalg.norm(sum_)
    if norm == 0:
        return sum_
    return sum_ / norm

def all_but_the_top(v, D):
    """
    Post-process word vectors by removing the mean and the top principal
    directions.

    All-but-the-Top: Simple and Effective Postprocessing for Word Representations
    https://arxiv.org/abs/1702.01417

    Arguments:
        :v: word vectors of shape (n_words, n_dimensions)
        :D: number of principal components to subtract
    Returns:
        Array of shape (n_words, n_dimensions).

    NOTE(review): the projection that gets subtracted is computed from the
    original `v`, not from the mean-centered copy -- confirm this is intended.
    """
    # Center the embeddings on their mean vector.
    centered = v - np.mean(v, axis=0)

    # Top-D principal directions of the centered embeddings: [D, emb_size].
    top_components = PCA(n_components=D).fit(centered).components_

    # [vocab_size, emb_size] @ [emb_size, D] @ [D, emb_size] -> [vocab_size, emb_size]
    projection = v @ top_components.T @ top_components

    return centered - projection

## Nouns

In [4]:
# Noun vocabulary and its post-processed embeddings (one principal component removed).
words = vocabulary['N']['word']
vecs = all_but_the_top(np.vstack(vocabulary['N']['vec']), 1)

# Embed with UMAP (cosine metric, fixed seed), then cluster the embedding with OPTICS.
# Despite its name, `x2d` holds an 8-component embedding.
reducer = UMAP(n_components=8, n_neighbors=3, min_dist=0.01, metric='cosine', random_state=444)
x2d = reducer.fit_transform(vecs)
optics = OPTICS(xi=0.1, min_cluster_size=0.01).fit(x2d)

# Group the words, and their raw word2vec vectors, by cluster label.
# Label -1 is what OPTICS assigns to noise points, so those words are skipped.
label2words, label2super = {}, {}
for label, word in zip(optics.labels_, words):
    if label != -1:
        label2words.setdefault(label, []).append(word)
        label2super.setdefault(label, []).append(word2vec[word])

# Replace each cluster's vector list with its mean: the cluster's "super" vector.
label2super = {
    label: np.mean(np.array(subvecs), axis=0)
    for label, subvecs in label2super.items()
}

pprint(label2words)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


{0: ['robbery',
     'assault',
     'negligence',
     'accomplice',
     'misconduct',
     'rape',
     'murder',
     'abuse',
     'girl',
     'molestation',
     'omission',
     'misrepresentation',
     'fraud',
     'conspiracy',
     'informant',
     'bias',
     'pornography',
     'neglect',
     'theft',
     'sodomy',
     'perjury',
     'loitering',
     'burglary',
     'contravention',
     'larceny',
     'manslaughter',
     'enticement',
     'suicide',
     'battery',
     'mismanagement',
     'privity',
     'indifference',
     'homicide',
     'malice',
     'insanity',
     'wrongdoing',
     'obscenity',
     'draftsman',
     'seduction',
     'corruption',
     'spoliation',
     'usury',
     'falsity',
     'embezzlement',
     'killing',
     'impropriety',
     'mischief',
     'misappropriation',
     'cartel',
     'turpitude',
     'hate',
     'unfitness',
     'instrumentality',
     'intercourse',
     'palm',
     'arson',
     'deceit',
     

In [5]:
# Persist the noun clusters: first the member words, then the per-cluster mean vectors.
noun_outputs = (
    ('knowledge/nouns/words.pkl', label2words),
    ('knowledge/nouns/super.pkl', label2super),
)
for path, payload in noun_outputs:
    with open(path, 'wb') as d:
        pickle.dump(payload, d)

## Verbs

In [6]:
# Verb vocabulary and its post-processed embeddings (one principal component removed).
words = vocabulary['V']['word']
vecs = all_but_the_top(np.vstack(vocabulary['V']['vec']), 1)

# Embed with UMAP (cosine metric, fixed seed), then cluster the embedding with OPTICS.
# Despite its name, `x2d` holds an 8-component embedding.
reducer = UMAP(n_components=8, n_neighbors=3, min_dist=0.01, metric='cosine', random_state=444)
x2d = reducer.fit_transform(vecs)
optics = OPTICS(xi=0.1, min_cluster_size=0.01).fit(x2d)

# Group the words, and their raw word2vec vectors, by cluster label.
# Label -1 is what OPTICS assigns to noise points, so those words are skipped.
label2words, label2super = {}, {}
for label, word in zip(optics.labels_, words):
    if label != -1:
        label2words.setdefault(label, []).append(word)
        label2super.setdefault(label, []).append(word2vec[word])

# Replace each cluster's vector list with its mean: the cluster's "super" vector.
label2super = {
    label: np.mean(np.array(subvecs), axis=0)
    for label, subvecs in label2super.items()
}

pprint(label2words)

{0: ['see',
     'compare',
     'mar',
     'join',
     'associate',
     'eas',
     'post',
     '-',
     'd',
     'burn',
     'ante',
     'e',
     'sic',
     'app',
     'rejoin',
     's',
     'ussg',
     'm',
     '[',
     'ex',
     'restatement',
     'stat',
     'h',
     'joined',
     'k',
     '§',
     't',
     'rcra',
     'r',
     'p'],
 1: ['underlie',
     'quote',
     'recite',
     'relegate',
     'collide',
     'repose',
     'predate',
     'overlook',
     'militate',
     'engraft',
     'reconcile',
     'inhere',
     'comport',
     'carve',
     'disavow',
     'rewrite',
     'analogize',
     'bother',
     'countenance',
     'prejudge',
     'misread',
     'infuse',
     'conflate',
     'envision',
     'subvert',
     'evince',
     'delve',
     'animate',
     'postdate',
     'vitiate',
     'culminate',
     'reassert',
     'criticize',
     'thwart',
     'recharacterize',
     'grapple',
     'wag',
     'misinterpret',
     'exp

In [7]:
# Persist the verb clusters: first the member words, then the per-cluster mean vectors.
verb_outputs = (
    ('knowledge/verbs/words.pkl', label2words),
    ('knowledge/verbs/super.pkl', label2super),
)
for path, payload in verb_outputs:
    with open(path, 'wb') as d:
        pickle.dump(payload, d)

## Subjects

In [8]:
# Subject vocabulary and its post-processed embeddings (one principal component removed).
words = vocabulary['S']['word']
vecs = all_but_the_top(np.vstack(vocabulary['S']['vec']), 1)

# Embed with UMAP (cosine metric, fixed seed), then cluster the embedding with OPTICS.
# Despite its name, `x2d` holds an 8-component embedding.
reducer = UMAP(n_components=8, n_neighbors=3, min_dist=0.01, metric='cosine', random_state=444)
x2d = reducer.fit_transform(vecs)
optics = OPTICS(xi=0.1, min_cluster_size=0.01).fit(x2d)

# Group the words, and their raw word2vec vectors, by cluster label.
# Label -1 is what OPTICS assigns to noise points, so those words are skipped.
label2words, label2super = {}, {}
for label, word in zip(optics.labels_, words):
    if label != -1:
        label2words.setdefault(label, []).append(word)
        label2super.setdefault(label, []).append(word2vec[word])

# Replace each cluster's vector list with its mean: the cluster's "super" vector.
label2super = {
    label: np.mean(np.array(subvecs), axis=0)
    for label, subvecs in label2super.items()
}

pprint(label2words)

{0: ['hastings',
     'kentucky',
     'delaware',
     'pennsylvania',
     'commonwealth',
     'brady',
     'cincinnati',
     'york',
     'utah',
     'richmond',
     'tower',
     'don',
     'montana',
     'texas',
     'hampshire',
     'miranda',
     'borough',
     'strand',
     'massachusetts',
     'missouri',
     'michigan',
     'fargo',
     'wachovia',
     'puc',
     'chase',
     'pac',
     'temple',
     'cbs',
     'virginia',
     'mississippi',
     'hart',
     'wisconsin',
     'edison',
     'oklahoma',
     'colony',
     'wyoming',
     'england',
     'kansas',
     'boston',
     'carolina'],
 1: ['sentence',
     'term',
     'reservation',
     'nonemployee',
     'sanction',
     'tribe',
     'area',
     'imposition',
     'death',
     'municipality',
     'lady',
     'imprisonment',
     'nonmember',
     'penalty',
     'field',
     'nation',
     'locality',
     'outside',
     'tea',
     'fine',
     'province',
     'indian',
     'co

In [9]:
# Persist the subject clusters: first the member words, then the per-cluster mean vectors.
subj_outputs = (
    ('knowledge/subjs/words.pkl', label2words),
    ('knowledge/subjs/super.pkl', label2super),
)
for path, payload in subj_outputs:
    with open(path, 'wb') as d:
        pickle.dump(payload, d)

## Adjectives

In [10]:
# Adjective vocabulary and its post-processed embeddings (one principal component removed).
words = vocabulary['A']['word']
vecs = all_but_the_top(np.vstack(vocabulary['A']['vec']), 1)

# Embed with UMAP (cosine metric, fixed seed), then cluster the embedding with OPTICS.
# Despite its name, `x2d` holds an 8-component embedding.
reducer = UMAP(n_components=8, n_neighbors=3, min_dist=0.01, metric='cosine', random_state=444)
x2d = reducer.fit_transform(vecs)
optics = OPTICS(xi=0.1, min_cluster_size=0.01).fit(x2d)

# Group the words, and their raw word2vec vectors, by cluster label.
# Label -1 is what OPTICS assigns to noise points, so those words are skipped.
label2words, label2super = {}, {}
for label, word in zip(optics.labels_, words):
    if label != -1:
        label2words.setdefault(label, []).append(word)
        label2super.setdefault(label, []).append(word2vec[word])

# Replace each cluster's vector list with its mean: the cluster's "super" vector.
label2super = {
    label: np.mean(np.array(subvecs), axis=0)
    for label, subvecs in label2super.items()
}

pprint(label2words)

{0: ['nonviolent',
     'punishable',
     'pretrial',
     'appealable',
     'challenge',
     'resentence',
     'right',
     'injunctive',
     'dismiss',
     'guilty',
     'judgment',
     'arbitral',
     'procedural',
     'interlocutory',
     'seconddegree',
     'prepetition',
     'deadly',
     'responsible',
     'standing',
     'nonmonetary',
     'hate',
     'suspicious',
     'aggravated',
     'facial',
     'ineffectiveness',
     'habeas',
     'alleged',
     'declaratory',
     'conviction',
     'reviewable',
     'remand',
     'punitive',
     'compensatory',
     'petty',
     'claim',
     'noncapital',
     'firstdegree',
     'nonhomicide',
     'inverse',
     'peremptory',
     'liable',
     'monetary',
     'plea',
     'pro',
     'racist',
     'determinate',
     'mandate',
     'retarded',
     'grind',
     'appeal',
     'assault',
     'blameless',
     'award',
     'filing',
     'inflammatory',
     'case',
     'triable',
     'accuse',
 

In [11]:
# Persist the adjective clusters: first the member words, then the per-cluster mean vectors.
adj_outputs = (
    ('knowledge/adjs/words.pkl', label2words),
    ('knowledge/adjs/super.pkl', label2super),
)
for path, payload in adj_outputs:
    with open(path, 'wb') as d:
        pickle.dump(payload, d)

In [17]:
# Read back the pickled {label: super vector} dicts and keep only the vectors,
# as one list per part of speech. pickle can execute arbitrary code on load,
# so these files must come from a trusted source.
super_paths = (
    'knowledge/nouns/super.pkl',
    'knowledge/verbs/super.pkl',
    'knowledge/subjs/super.pkl',
    'knowledge/adjs/super.pkl',
)

loaded_supers = []
for path in super_paths:
    with open(path, 'rb') as p:
        loaded_supers.append(list(pickle.load(p).values()))

supernn, supervb, supersj, superad = loaded_supers

In [19]:
# NOTE: this binds a second name to the same list object rather than copying it,
# so any change made to the items of `features_test` is also visible via `features`.
features_test = features

In [24]:
# Show the symbolic features of the first record before they are encoded.
print(features_test[0].context_symbolic)

# NOTE(review): `encode_symbolic` is defined in a later cell of this notebook (In [23]),
# so on a fresh top-to-bottom run that cell has to be executed before this call --
# unless `from extract import *` also happens to provide it; confirm.
# That definition stores the encoding on the `.vector` attribute of the object passed in.
encode_symbolic(features_test[0].context_symbolic, supernn, supervb, supersj, superad)

SymbolicFeatures(nouns={'violence', 'person', 'possession', 'lack', 'victim', 'felony', 'use', 'risk', 'nature', 'sentence', 'property', 'purpose', 'imprisonment', 'bomb', 'offender', 'fact', 'explosive', 'career', 'term', 'court', 'cohort', 'offense', 'crime', 'year', 'holding'}, verbs={'see', 'base', 'involve', 'define', 'be', 'find', 'exceed', 'qualify', 'enhance', 'use', 'make'}, subjs={'court', 'cohort', 'possession', 'offense', 'that'}, amods={'nonviolent', 'substantial', 'very', 'peaceful', 'violent', 'punishable'}, phrases=[{'cohort', 'would', 'be', 'cohorts'}, {'crimes', 'qualify', 'felony', 'purposes', 'offenses'}, {'enhancing', 'sentences', 'offenders', 'career'}, {'term', 'defining', 'crime', 'felony', 'imprisonment'}, {'involves', 'use', 'explosives'}, {'have', 'courts', 'found', 'fact'}, {'violence', 'possession', 'be', 'to'}, {'purpose', 'based', 'bomb', 'lack'}, {'be', 'property', 'used', 'would', 'bomb', 'person'}], vector=[])


# Postprocessing

In [23]:
def cosine_sims(query, supers):
    """
    Cosine similarity between one query vector and every row of `supers`.

    Arguments:
        :query: vector of shape (n_dimensions,)
        :supers: matrix of shape (n_supers, n_dimensions)
    Returns:
        Array of shape (n_supers,) with one similarity per row of `supers`.
    """
    # Product of the query length with each row length: the cosine denominator.
    scale = np.linalg.norm(query) * np.linalg.norm(supers, axis=1)
    return np.dot(supers, query) / scale

def encode_symbolic(sf, supernouns, superverbs, supersubjs, superadjs):
    """
    Encode a symbolic-features object as similarities to cluster "super" vectors.

    For each part of speech, every word is looked up in the notebook-level
    `word2vec` and compared with every super vector through `cosine_sims`.
    The element-wise maximum and minimum over the words are kept for each
    super vector.

    Arguments:
        :sf: object exposing `nouns`, `verbs`, `subjs` and `amods` iterables
             of words; its `vector` attribute is overwritten
        :supernouns: sequence of super vectors compared against `sf.nouns`
        :superverbs: sequence of super vectors compared against `sf.verbs`
        :supersubjs: sequence of super vectors compared against `sf.subjs`
        :superadjs: sequence of super vectors compared against `sf.amods`
    Returns:
        None. The result is stored in `sf.vector` as the concatenation
        [nmax, nmin, vmax, vmin, smax, smin, amax, amin].

    The running maxima start at 0 and the running minima at 1, so a max
    feature is never below 0, a min feature is never above 1, and a part of
    speech with no words yields exactly zeros and ones.
    NOTE(review): cosine similarity can be negative, so the 0 floor on the
    max features acts as clipping -- confirm that is intended.
    """
    groups = (
        (sf.nouns, supernouns),
        (sf.verbs, superverbs),
        (sf.subjs, supersubjs),
        (sf.amods, superadjs),
    )

    parts = []
    for words, supers in groups:
        highest, lowest = np.zeros(len(supers)), np.ones(len(supers))

        # Stack the super vectors once per part of speech instead of once per
        # word. It is done lazily so an empty word collection never touches
        # `supers`.
        matrix = None
        for word in words:
            if matrix is None:
                matrix = np.vstack(supers)
            sims = cosine_sims(word2vec[word], matrix)
            highest, lowest = np.maximum(highest, sims), np.minimum(lowest, sims)

        parts.extend((highest, lowest))

    sf.vector = np.concatenate(parts)