In [1]:
##### Import general use tools
from collections import defaultdict
from tqdm import tqdm_notebook
import pandas as pd
import numpy as np
import dill

# Import visualization libraries
import matplotlib.pyplot as plt
import matplotlib

import plotly
import plotly.express as px
import plotly.graph_objects as go

In [5]:
with open('kmb.pik', 'rb') as k:
    kmb = dill.load(k)
with open('lsa.pik', 'rb') as ll:
    lsa = dill.load(ll)

In [6]:
with open('binned_data.pik', 'rb') as bd:
    binned_data = dill.load(bd)

### First, retrieve necessary information.

+ Topics
+ Word counts
+ Years
+ Document names

In [None]:
# extract the frequency count for the top terms of the topic words 

# from the vocab within the vectorizer, for LSA:
print(lsa[(1870, 1880)]["vocab"].get("water"))

#...and for kmeans:
print(kmb[(1870, 1880)]["vocab"].get("water"))

In [None]:
# Get the word frequencies per year range for LSA and K means

print(f"lsa: {lsa[(1870, 1880)].keys()}")
print("\n")
print(f"kmeans: {kmb[(1870, 1880)].keys()}")

In [7]:
# Retrieve the top terms for each cluster from the dictionary, 
# with word counts from the vocab. 

# brute force... do this with classes and recursion

km_exclusions = [0,1,2,3,4,5,"labels","vocab","matrix",(1865, 1875),(1955, 1965)]

def get_top_terms_k(kmb, exclusions): # vocab_idx, dd_type
    keys = list(set(kmb.keys()) - set(exclusions))
    top_terms = defaultdict(list)
    for k in keys:
        for i in range(6):
            try:
                top_terms[k].append({i: {term: kmb[k]["vocab"].get(term) for term in kmb[k][i]}})
            except Exception as e:
                print(f"Exception: {e}")
    return top_terms
kmdict = get_top_terms_k(kmb, km_exclusions) # kmdict[(1980, 1990)][2][2].keys()

In [8]:
# NB: Matrix indices should equal case_dict indices for case names.

lsa_exclusions = [0, 1, 'vocab', (1955, 1965), (1865, 1875)] # where did these come from?
def get_top_terms(lsa, exclusions):
    keys = list(set(lsa.keys()) - set(exclusions))
    top_terms = defaultdict(list)
    for k in keys:
        try:
            top_terms[k].append({term: lsa[k]["vocab"].get(term) for term in lsa[k]["terms"]})
        except Exception as e:
            print(f"Exception: {e}")
    return top_terms
lsat = get_top_terms(lsa, lsa_exclusions)

In [9]:
# Match the cases back up to the labels in the kmeans clusters

for k in kmb.keys():
    print(f"KEY: {k}\nLABELS: {len(kmb[k]['labels'])}\nBINNED DATA: {len(binned_data[k])}\n\n")

KEY: (1870, 1880)
LABELS: 18
BINNED DATA: 18


KEY: (1880, 1890)
LABELS: 62
BINNED DATA: 62


KEY: (1890, 1900)
LABELS: 164
BINNED DATA: 164


KEY: (1900, 1910)
LABELS: 296
BINNED DATA: 296


KEY: (1910, 1920)
LABELS: 414
BINNED DATA: 414


KEY: (1920, 1930)
LABELS: 357
BINNED DATA: 357


KEY: (1930, 1940)
LABELS: 291
BINNED DATA: 291


KEY: (1940, 1950)
LABELS: 246
BINNED DATA: 246


KEY: (1950, 1960)
LABELS: 197
BINNED DATA: 197


KEY: (1960, 1970)
LABELS: 405
BINNED DATA: 405


KEY: (1970, 1980)
LABELS: 765
BINNED DATA: 765


KEY: (1980, 1990)
LABELS: 734
BINNED DATA: 734


KEY: (1990, 2000)
LABELS: 299
BINNED DATA: 299


KEY: (2000, 2010)
LABELS: 181
BINNED DATA: 181


KEY: (2010, 2020)
LABELS: 150
BINNED DATA: 150




KeyError: 'labels'

In [12]:
# Add the topics back in to the binned cases.
# kmb.pop((1865, 1875))
def topic_add(kmb, data):
    topic_add = defaultdict(list)
    for k in kmb.keys():
        topic_add[k].append(list(map(list, zip(data[k], kmb[k]["labels"]))))
    return topic_add
topic_and_data = topic_add(kmb, binned_data)

In [13]:
topic_and_data[(1870, 1880)][0][0]      # each entry is a list of case and topic; 
                                        #[0][0][0] = case info, [0][0][1] = topic 

[array(['Worthy v. Commissioners',
        'wall worthyvthe commissionerssupreme court united state mr boyce move dismiss writ want jurisdiction ground plaintiff errormr scheffer contra chief justice deliver opinion courtit manifest court jurisdiction present cause decision supreme court north carolina validity treaty act congress authority exercise united state favor validity statute authority exercise state allege repugnant constitution treaty law united statesit true brief counsel plaintiff urge right plaintiff protect st section th amendment right appear set specially claimed state court essential jurisdiction herewe authority therefore examine question present record must allow motion defendant error dismiss cause forwant jurisdiction'],
       dtype='<U349337'), 3]

In [None]:
# Check which case has which topic

for k in topic_and_data.keys():
    for case in topic_and_data[k][0]:
        doc_topic = case[1]
        print(f"{case[0][0]} has topic {doc_topic} in years {k}")

In [15]:
big_df = pd.DataFrame(columns=["year_1", "year_2", "title", "topic","terms", "term_frequency", "lsa_bow"])

for k in topic_and_data.keys():
    range_df = pd.DataFrame(columns=["year_1", "year_2", "title", "topic","terms", "term_frequency", "lsa_bow"])
    for i in range(len(topic_and_data[k][0])):
        range_df.loc[i] = {"year_1":k[0],"year_2":k[1],"title":topic_and_data[k][0][i][0][0],
                     "topic":topic_and_data[k][0][i][1],"terms":[kmb[k][topic_and_data[k][0][i][1]]],
                     "term_frequency":[kmdict[k][topic_and_data[k][0][i][1]]],"lsa_bow":[lsat[k]]}
    big_df = pd.concat([big_df, range_df], axis=0)

In [17]:
with open("big_df.pik", "wb")as bdf:
    dill.dump(big_df, bdf)

In [None]:
# Sanity check: Make sure the DataFrame matches up

for key in lsa.keys():
    print(lsa[key]["terms"])
print("-"*117)
for key in range(6):
    print(f"{key}: {kmb[(1870, 1880)][key]}")

In [24]:
# Create a pandas DataFrame to use with Plotly

big_df["years"] = list(zip(big_df["year_1"], big_df["year_2"]))

In [25]:
cols = ["title", "topic", "years", "term_frequency"]
terms = big_df[cols]

In [28]:
def flatten_all(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], str(a)) # name + ?
        elif type(x) is list:
            for a in x:
                flatten(a, a)
        else:
            out[str(name)] = str(x)

    flatten(y)
    return out

In [30]:
for col in ["term_frequency", "lsa_bow"]:
    big_df[col] = big_df[col].apply(lambda x: flatten_all(x))

In [None]:
big_df["terms"] = big_df["terms"].apply(lambda x: x[0])

In [31]:
big_df.head()

Unnamed: 0,year_1,year_2,title,topic,terms,term_frequency,lsa_bow,years
0,1870,1880,Worthy v. Commissioners,3,"[[authority exercise, jurisdiction, treaty, di...","{'authority exercise': '2750', 'jurisdiction':...","{'ship vessel': '31933', 'water': '36753', 'tr...","(1870, 1880)"
1,1870,1880,Osborn v. Nicholson,0,"[[slave, jurisdiction, cease, contract, pleadi...","{'slave': '32132', 'jurisdiction': '18601', 'c...","{'ship vessel': '31933', 'water': '36753', 'tr...","(1870, 1880)"
2,1870,1880,Bradwell v. State,2,"[[citizen, property, government, privilege, pe...","{'citizen': '5000', 'property': '27082', 'gove...","{'ship vessel': '31933', 'water': '36753', 'tr...","(1870, 1880)"
3,1870,1880,Slaughter-House Cases,2,"[[citizen, property, government, privilege, pe...","{'citizen': '5000', 'property': '27082', 'gove...","{'ship vessel': '31933', 'water': '36753', 'tr...","(1870, 1880)"
4,1870,1880,Bartemeyer v. Iowa,2,"[[citizen, property, government, privilege, pe...","{'citizen': '5000', 'property': '27082', 'gove...","{'ship vessel': '31933', 'water': '36753', 'tr...","(1870, 1880)"


### Plotly

In [32]:
#pd.DataFrame.from_dict(kmb)
#pd.DataFrame.from_dict(lsa)
#pd.DataFrame.from_dict(topic_and_data)
#pd.DataFrame.from_dict(lsat)

In [33]:
def flat(x):
    out = []
    for item in x:
        if isinstance(item, dict):
            out = flat([x[k] for k in x])
        elif isinstance(item, list):
            out = [flat[e] for e in x]
        else:
            out = x
    return out

flat(kdataframe.iloc[(0,0)])

{0: {'slave': 32132,
  'jurisdiction': 18601,
  'cease': 4381,
  'contract': 8023,
  'pleading': 25163,
  'warranty': 36717,
  'illinois': 16658,
  'order': 23343,
  'thomas': 34487,
  'raymond': 28678}}

In [34]:
kdataframe = pd.DataFrame.from_records(kmdict)

In [19]:
kdataframe.head(10)

Unnamed: 0,"(1870, 1880)","(1880, 1890)","(1890, 1900)","(1900, 1910)","(1910, 1920)","(1920, 1930)","(1930, 1940)","(1940, 1950)","(1950, 1960)","(1960, 1970)","(1970, 1980)","(1980, 1990)","(1990, 2000)","(2000, 2010)","(2010, 2020)"
0,"{0: {'slave': 32132, 'jurisdiction': 18601, 'c...","{0: {'injury': 38222, 'iowa': 39599, 'damage':...","{0: {'property': 127094, 'land': 91710, 'asses...","{0: {'depot': 70756, 'village': 245996, 'stati...","{0: {'tax': 226546, 'property': 182034, 'busin...","{0: {'commission': 38876, 'rate': 172836, 'cit...","{0: {'carrier': 37890, 'highway': 132409, 'veh...","{0: {'confession': 69537, 'ashcraft': 24832, '...","{0: {'trial': 274710, 'habeas': 124011, 'habea...","{0: {'search': 542864, 'arrest': 44212, 'offic...","{0: {'inmate': 546182, 'parole': 742535, 'pris...","{0: {'death': 280198, 'stay': 999911, 'executi...","{0: {'search': 654027, 'fourth amendment': 308...","{0: {'candidate': 79551, 'voter': 607598, 'pol...","{0: {'racial': 416204, 'voter': 557586, 'race'..."
1,"{1: {'passenger': 24168, 'commerce': 6094, 've...","{1: {'juror': 40788, 'jury': 40949, 'indictmen...","{1: {'railroad': 133114, 'corporation': 39519,...","{1: {'assessment': 17601, 'improvement': 11654...","{1: {'sale': 207664, 'liquor': 137159, 'sell':...","{1: {'trust': 213727, 'tax': 204902, 'trustee'...","{1: {'tax': 267406, 'property': 215727, 'trust...","{1: {'gas': 146161, 'commission': 61432, 'rate...","{1: {'tax': 265462, 'commerce': 49723, 'inters...","{1: {'apportionment': 38042, 'election': 20653...","{1: {'obscene': 708038, 'material': 647624, 'o...","{1: {'congress': 222862, 'tax': 1034136, 'acti...","{1: {'trial': 735475, 'defendant': 201855, 'ha...","{1: {'school': 511114, 'student': 546864, 'rel...","{1: {'trial': 534956, 'juror': 283056, 'defend..."
2,"{2: {'citizen': 5000, 'property': 27082, 'gove...","{2: {'congress': 15328, 'militia': 47326, 'yar...","{2: {'jury': 89505, 'verdict': 170504, 'eviden...","{2: {'power': 174652, 'contract': 55307, 'wate...","{2: {'employer': 82327, 'employee': 82147, 'ho...","{2: {'advocate': 4643, 'syndicalism': 204456, ...","{2: {'milk': 173852, 'power': 204669, 'contrac...","{2: {'jury': 185541, 'religious': 283968, 'pub...","{2: {'jury': 150797, 'confession': 56662, 'gra...","{2: {'trial': 612933, 'jury': 333411, 'counsel...","{2: {'school': 943066, 'child': 167786, 'publi...","{2: {'death': 280198, 'jury': 585792, 'death p...","{2: {'congress': 153567, 'government': 322948,...","{2: {'congress': 121567, 'government': 253978,...","{2: {'juvenile': 285439, 'parole': 364343, 'li..."
3,"{3: {'authority exercise': 2750, 'jurisdiction...","{3: {'railroad': 62070, 'tax': 73343, 'propert...","{3: {'citizen': 25268, 'power': 120990, 'comme...","{3: {'tax': 227479, 'property': 183387, 'taxat...","{3: {'land': 130494, 'water': 246470, 'canal':...","{3: {'public': 168424, 'business': 26481, 'pow...","{3: {'tax': 267406, 'city': 45247, 'business':...","{3: {'tax': 325712, 'property': 263246, 'estat...","{3: {'insurance': 142285, 'highway': 128494, '...","{3: {'confession': 124961, 'habeas': 274550, '...","{3: {'search': 948016, 'fourth amendment': 447...","{3: {'trial': 1065064, 'defendant': 291016, 'c...","{3: {'voting': 769286, 'election': 248940, 'ca...","{3: {'inmate': 296626, 'prison': 439672, 'osp'...","{3: {'cake': 74317, 'phillips': 374639, 'speec..."
4,"{4: {'trial': 34970, 'jury': 18737, 'suit comm...","{4: {'citizen': 11416, 'congress': 15328, 'cou...","{4: {'tax': 158087, 'taxation': 158911, 'bank'...","{4: {'insurance': 122522, 'policy': 172973, 'l...","{4: {'commission': 41955, 'railroad': 189513, ...","{4: {'tax': 204902, 'corporation': 52187, 'bus...","{4: {'tax': 267406, 'interstate': 146689, 'com...","{4: {'insurance': 174871, 'commerce': 60701, '...","{4: {'new': 178818, 'government': 120062, 'pub...","{4: {'school': 540403, 'congress': 127046, 'pu...","{4: {'jury': 585502, 'trial': 1067876, 'defend...","{4: {'school': 942679, 'religious': 890486, 'r...","{4: {'tax': 713438, 'speech': 682229, 'religio...","{4: {'jury': 317630, 'defendant': 158603, 'dea...","{4: {'jurisdiction': 281990, 'mcintyre': 31459..."
5,"{5: {'tax': 33814, 'tax duty': 33842, 'estate'...","{5: {'power': 55874, 'business': 9036, 'commer...","{5: {'writ': 175716, 'trial': 165412, 'jury': ...","{5: {'trial': 237278, 'jury': 129234, 'defenda...","{5: {'ordinance': 160260, 'city': 36912, 'stre...","{5: {'assessment': 15899, 'benefit': 21818, 'i...","{5: {'rate': 224726, 'commission': 51215, 'gas...","{5: {'counsel': 83817, 'habeas': 153443, 'habe...","{5: {'jeopardy': 147908, 'trial': 274710, 'pro...","{5: {'obscene': 405060, 'obscenity': 405285, '...","{5: {'jeopardy': 575240, 'double jeopardy': 34...","{5: {'search': 947391, 'police': 775935, 'chil...","{5: {'jury': 404410, 'death': 194186, 'mitigat...","{5: {'miranda': 363108, 'kaupp': 320695, 'arre...","{5: {'tax': 499905, 'taxpayer': 500138, 'flast..."


In [35]:
flatten_all(kdataframe[(1870, 1880)].iloc[3])["treaty"]

'34955'

In [38]:
for column in kdataframe.columns:
    for term in flatten_all(kdataframe[column].iloc[0]).keys():
        print(term)

slave
jurisdiction
cease
contract
pleading
warranty
illinois
order
thomas
raymond
injury
iowa
damage
railway
engineer
fence
negligence
corporation
railroad
liability
property
land
assessment
improvement
street
question
lot
tax
compensation
public
depot
village
station
emmons
passenger
railroad
fare
railway
road
depot station
tax
property
business
corporation
assessment
insurance
kentucky
taxation
question
policy
commission
rate
city
order
contract
water
public
service
ordinance
street
carrier
highway
vehicle
motor
common carrier
certificate
transportation
private carrier
public
motor vehicle
confession
ashcraft
officer
trial
police
evidence
mcnabb
jury
testimony
jail
trial
habeas
habeas corpus
transcript
corpus
appeal
illinois
writ
chessman
florida
search
arrest
officer
warrant
fourth amendment
fourth
evidence
police
probable cause
seizure
inmate
parole
prison
juvenile
prisoner
probation
transfer
hearing
probation officer
parolee
death
stay
execution
application stay
stay execution
dea

In [39]:
kdataframe[(1870, 1880)].iloc[0]

{0: {'slave': 32132,
  'jurisdiction': 18601,
  'cease': 4381,
  'contract': 8023,
  'pleading': 25163,
  'warranty': 36717,
  'illinois': 16658,
  'order': 23343,
  'thomas': 34487,
  'raymond': 28678}}

In [111]:
graph_df = pd.DataFrame(columns=["term","year_1","year_2","topic","frequency"])
for column in kdataframe.columns:
    for i in range(len(kdataframe)):
        terms = {"term":[],"year_1":[],"year_2":[],"topic":[],"frequency":[]}
        for key in flatten_all(kdataframe[column].iloc[i]).keys():
            terms["term"].append(key)
            terms["year_1"].append(column[0])
            terms["year_2"].append(column[1])
            terms["topic"].append(list(kdataframe[column].iloc[i].keys())[0])
            terms["frequency"].append(flatten_all(kdataframe[column].iloc[i])[key])
            
        term_df = pd.DataFrame.from_dict(terms)
        graph_df = pd.concat([graph_df, term_df], ignore_index=True)

In [114]:
graph_df.frequency.sort_values(ascending=False)

661     999911
804      99597
779      99527
38        9914
256      98830
752      98432
126      97723
878      97697
153      96602
636     955325
186      95518
708     951284
316      94920
630     948016
710     947391
620     943066
700     942679
658     931027
49        9294
855      92832
47        9244
121      91710
746       9131
628     905596
111       9036
534      90133
533      90125
140      89505
172      89505
701     890486
        ...   
588     113933
669    1137535
668    1137503
54       11273
489     112236
633    1121531
262     111986
719    1119186
348     111700
175     109662
33       10795
147     107047
653    1067876
641    1067876
690    1065064
606    1064183
649      10635
344     106261
58       10614
297     105428
352     105221
167     105004
169     104998
148     104261
629    1036415
376     103608
671    1034136
703    1007684
298     100708
664    1000003
Name: frequency, Length: 900, dtype: object

In [43]:
graph_df[graph_df.isna().any(axis=1)]

Unnamed: 0,term,year_1,year_2,topic,frequency


In [115]:
graph_df["frequency"] = graph_df["frequency"].apply(lambda x: int(x))

In [116]:
#graph_df["year_2"].apply(lambda x: int(x))
type(graph_df["term"].iloc[0])
graph_df["frequency"].sort_values(ascending=False)

669    1137535
668    1137503
633    1121531
719    1119186
641    1067876
653    1067876
690    1065064
606    1064183
629    1036415
671    1034136
703    1007684
664    1000003
661     999911
636     955325
708     951284
630     948016
710     947391
620     943066
700     942679
658     931027
628     905596
701     890486
702     889817
677     880340
622     842363
704     840292
678     840292
675     826944
608     812725
605     812666
        ...   
115      11416
100      11416
54       11273
33       10795
649      10635
58       10614
38        9914
49        9294
47        9244
746       9131
111       9036
3         8023
95        7971
15        6958
11        6094
13        5783
93        5601
35        5539
20        5000
84        4854
320       4643
2         4381
36        4348
17        4039
30        2750
34        2714
46        2225
239       2048
512       1414
513       1402
Name: frequency, Length: 900, dtype: int64

In [None]:
graph_df_2 = pd.DataFrame(columns=["year_1", "year_2", "title", "topic","terms", "term_frequency", "lsa_bow"])

for k in topic_and_data.keys():
    for i in range(len(topic_and_data[k][0])):
        terms = {"year_1":[],"year_2":[], "title":[],"topic":[],"terms":[], "term_frequency":[], "lsa_bow":[]}
        range_df.loc[i] = {"year_1":k[0],"year_2":k[1],"title":topic_and_data[k][0][i][0][0],
                     "topic":topic_and_data[k][0][i][1],"terms":[kmb[k][topic_and_data[k][0][i][1]]],
                     "term_frequency":[kmdict[k][topic_and_data[k][0][i][1]]],"lsa_bow":[lsat[k]]}
    # big_df_2 = pd.concat([big_df, range_df], axis=0)

In [129]:
# trim the dataframe
copy = graph_df.copy()
copy = copy[copy["frequency"] > 62259]
copy = copy[copy["frequency"] < 1137536]

In [135]:
##### import plotly.express as px
import plotly as py
import plotly.graph_objs as go

fig = px.bar(copy, x="term", y="frequency", text="term",hover_name="term", 
                 color="topic", animation_frame="year_1", animation_group="term",
             opacity=0.5, labels=None,template="plotly_dark",range_x=[-1,36], range_y=[0,1137535]) # barmode="overlay"
fig.update_xaxes(showticklabels=False)
fig.show()

import plotly.io as pio
pio.renderers.default = "notebook"
plotly.io.write_html(fig, 'freq_bar_graph.html')

In [None]:
with open("14th_am_kmeans.pik", "rb") as mf:
    cluster_14 = dill.load(mf)
    
