# Using the HRS index to find similar statutes

In [363]:
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction import dict_vectorizer
from nltk.corpus import stopwords
import gensim

__Load in the index JSON file__

In [None]:
index = json.load(open('../data/hrs.index.statutes.json'))

__We'll use the stopwords from nltk as a list of common english words to filter out__

In [191]:
stopword_set = set(stopwords.words('english'))

__For ever row in the index JSON file, collect all the keywords, remove stopwords, and create a mapping between statute -> keywords__

In [220]:
statute_to_keywords = {}

for row in index:
    statutes = row[-1]
    keywords = row[:-1]
    
    for statute in statutes:
        statute_to_keywords[statute] = set(gensim.utils.simple_preprocess("\n".join(keywords))) - stopword_set

In [221]:
len(statute_to_keywords)

17875

__Here, we're going to convert the statue->keywords mapping into a statute-keyword matrix using scikit's DictVectorizer class__

In [257]:
keys = []
dicts = []

for key, keyword_set in statute_to_keywords.items():
    keys.append(key)
    dicts.append( { k:True for k in keyword_set } )
    
m = dict_vectorizer.DictVectorizer().fit_transform(dicts)

__Now with the statute-keyword matrix, we'll compute all the pair-wise Jaccard distances between all of the statutes (takes a few minutes)__

In [270]:
%%time
dist = pairwise_distances(m.todense(), metric='jaccard')



CPU times: user 31min 12s, sys: 18.8 s, total: 31min 31s
Wall time: 31min 53s


In [271]:
dist.shape

(17875, 17875)

__We're left with a full-rank distance matrix, so for each row we can find the columns with the smallest distances, which correspond to the most 'similar' statutes__

In [312]:
# the diagonal of the distance matrix are all zeros (because the distance from a statute to itself is zero)
# so we'll fill the diagonal with nans so those cells will sort to the last spots
np.fill_diagonal(dist, np.nan)

In [365]:
# convert to numpy array for easier indexing
keys = np.array(keys)

__Sort each row of the distance matrix, and only keep the statutes with the 10 smallest distances__

In [377]:
similar_top10 = {}

for i, row in enumerate(dist):
    mask = ~(row == 1.) # mask out entries that had exactly nothing in common
    idx = np.argsort(row[mask]) # sort the row and keep the ordered index
    similar_top10[keys[i]] = list(keys[mask][idx][:10]) # fill the statute->similar mapping with the top 10

__Save to file__

In [378]:
json.dump(similar_top10,open('top_ten.json','w'))

In [381]:
!head -c 400 top_ten.json

{"397-11": ["396-13", "397-5", "431:2-209", "397-6", "397-1", "397-12", "397-2", "397-10", "397-8", "397-9"], "4605-23.5": ["46OJ-21", "4605-19", "4605-12", "4605-25", "460J-19", "46OJ-20", "4605-22", "46OJ-26", "46OJ-1", "4605-1"], "353-10.5": ["353-63.5", "353-17", "706-605.1", "657-14", "353-1", "353-26", "353-12", "706-672", "353-6", "354D-3"], "707-712.5": ["52D-8", "52D-6", "804-7.1", "52D-1