[Case Law Access Project API](https://case.law/api/)

In [2]:
import pandas as pd
cases = "https://raw.githubusercontent.com/mkirby1995/Datasets/master/court_cases.json"
df = pd.read_json(cases)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 6 columns):
case_id_numbers    100 non-null int64
case_names         100 non-null object
decision_dates     100 non-null object
court_names        100 non-null object
case_urls          100 non-null object
opinions           100 non-null object
dtypes: int64(1), object(5)
memory usage: 5.5+ KB


In [3]:
df['opinions'].head()

0     MAHONEY, Circuit Judge:\nPlaintiffs-appellants...
1     Opinion for the Court filed by Circuit Judge D...
10    PER CURIAM:\nArthur Calderon, Warden of the Ca...
11    RIPPLE, Circuit Judge.\nThis dispute between G...
12    LOGAN, Circuit Judge.\nPlaintiffs Richard E. N...
Name: opinions, dtype: object

In [49]:
df.head()

Unnamed: 0,case_id_numbers,case_names,decision_dates,court_names,case_urls,opinions,tokens
0,466,"De Jesus v. Sears, Roebuck & Co.",1996-06-25,United States Court of Appeals for the Second ...,https://cite.case.law/f3d/87/65/,"MAHONEY, Circuit Judge:\nPlaintiffs-appellants...",mahoney circuit judge plaintiff appellant ...
1,495,Competitive Telecommunications Ass'n v. Federa...,1996-07-05,United States Court of Appeals for the Distric...,https://cite.case.law/f3d/87/522/495/,Opinion for the Court filed by Circuit Judge D...,opinion for the court file by circuit judge ha...
10,893,Calderon v. U.S. District Court,1996-12-20,United States Court of Appeals for the Ninth C...,https://cite.case.law/f3d/103/72/,"PER CURIAM:\nArthur Calderon, Warden of the Ca...",per curiam arthur calderon warden of the c...
11,982,"Geske & Sons, Inc. v. National Labor Relations...",1997-01-09,United States Court of Appeals for the Seventh...,https://cite.case.law/f3d/103/1366/,"RIPPLE, Circuit Judge.\nThis dispute between G...",ripple circuit judge this dispute between ...
12,992,Norton v. Village of Corrales,1996-12-23,United States Court of Appeals for the Tenth C...,https://cite.case.law/f3d/103/928/,"LOGAN, Circuit Judge.\nPlaintiffs Richard E. N...",logan circuit judge plaintiff richard e ...


# Tokenize

In [5]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [20]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [21]:
spacy_stopwords.add('\n')
spacy_stopwords.add(':')
spacy_stopwords.add('')

In [26]:
import re

def tokenize(text):
    
    text = text.lower()
    text = re.sub(r'[^a-zA-Z ^0-9]', ' ', str(text))
    doc = nlp.tokenizer(text)
    
    lemmas = [token.lemma_ + ' ' for token in doc]
    
    cleaned = []
    for token in lemmas:
        if token not in spacy_stopwords:
            cleaned.append(token)
            
    return ''.join(cleaned)

df['tokens'] = df['opinions'].apply(lambda x:tokenize(x))

df['tokens'].head()

0     mahoney   circuit judge   plaintiff appellant ...
1     opinion for the court file by circuit judge ha...
10    per curiam   arthur calderon   warden of the c...
11    ripple   circuit judge   this dispute between ...
12    logan   circuit judge   plaintiff richard e   ...
Name: tokens, dtype: object

# Vectorize

## Term Frequency-Inverse Document Frequency (TF-IDF)

“The term frequency of a term given a document, $tf(t,d)$, can be the boolean frequency (as in one-hot encoding, 1 if 
$t$ occurs in $d$ 0 otherwise), or the count. However, generally both the term frequency and inverse document frequency are scaled logarithmically to prevent bias of longer documents or terms that appear much more frequently relative to other terms: $tf(t,d) = 1 + \log(f)_{t,d}$.
Similarly, the inverse document frequency of a term given the set of documents can be logarithmically scaled as follows: $idf (t,D) = \log 1+ \frac{N}{n_t}$, where $N$ is the number of documents and $n_t$ is the number of occurrences of the term $t$ in all documents. TF–IDF is then computed completely as 
$$tfidf(t,d,D) = tf(t,d) * idf(t,D)$$

Because the ratio of the $id f \log$ function is greater or equal to 1, the TF–IDF score is always greater than or equal to zero. We interpret the score to mean that the closer the TF–IDF score of a term is to 1, the more informative that term is.


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
dtm = tfidf.fit_transform(df['tokens'].tolist())

docs = pd.DataFrame(dtm.todense(), columns = tfidf.get_feature_names())
docs.head()

Unnamed: 0,00,000,01,02,03,07,08,09,10,100,...,york,you,young,your,zahn,zenith,zetterstrom,zigzag,zinc,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002515,0.004256,...,0.013894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006346,0.004603,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002463
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004511,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005294,0.001493,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137655


In [67]:
for col in docs.columns:
    if docs[col].sum() > .7:
        print(col, docs[col].sum())

000 0.7116482141421662
15 0.8243289778526378
1972 0.8228086939879906
2d 3.0640224600652903
act 2.888576348288579
action 1.7483973884141593
against 0.8980134636742109
agreement 1.4461741378605928
all 1.2471511362361976
allege 0.9911186824796159
also 1.223086037028893
amendment 0.7364863784719232
and 13.430980724566734
antitrust 1.0434484259547856
any 2.0926406829703863
appeal 1.1028027823986266
apply 0.8176286654923184
at 4.711713878180744
attorney 1.1189817444258807
award 0.7084344512297354
bank 1.4417004007299008
be 19.509626950847107
because 1.195453840401732
between 0.7164743189088653
business 0.8650091902720293
but 1.2266171708650042
by 5.031684826886623
can 1.5382475862718465
carrier 0.7038386469185083
case 2.1741126620089535
cir 1.6528390124101797
claim 2.383647182606094
class 0.7243601204046918
co 1.3460174446707847
commerce 1.1746917125674623
commission 1.913707351026875
company 0.9674124389980734
competition 0.9849027163461703
complaint 0.9125793432018084
conduct 0.73090633623

In [63]:
from sklearn.neighbors import NearestNeighbors

neighbors = NearestNeighbors(n_neighbors = 10, algorithm = 'ball_tree')

neighbors.fit(dtm.todense())
results = neighbors.kneighbors(docs.iloc[[0]])
results

(array([[0.        , 0.86205343, 0.86369088, 0.8665168 , 0.88033361,
         0.89490898, 0.90323447, 0.90411347, 0.90541258, 0.90769336]]),
 array([[ 0, 47, 40, 95, 82, 14, 39, 75, 43, 57]]))

In [64]:
for i in results[1][0]:
    print(df['case_names'].iloc[[i]], '\n')

0    De Jesus v. Sears, Roebuck & Co.
Name: case_names, dtype: object 

51    Great Northern Ry. Co. v. Brosseau
Name: case_names, dtype: object 

45    City of Lafayette v. Louisiana Power & Light Co.
Name: case_names, dtype: object 

95    Management Investors v. United Mine Workers
Name: case_names, dtype: object 

83    Rodriguez v. Banco Central
Name: case_names, dtype: object 

21    Mullen v. Sweetwater Development Corp.
Name: case_names, dtype: object 

44    National Society of Professional Engineers v. ...
Name: case_names, dtype: object 

77    Federal Maritime Commission v. Seatrain Lines,...
Name: case_names, dtype: object 

48    Vermont Yankee Nuclear Power Corp. v. Natural ...
Name: case_names, dtype: object 

60    United States Steel Corp. v. Fortner Enterpris...
Name: case_names, dtype: object 

