In [2]:
# Import tfidf and normalizer for bag of words and pre-processing
# Import LDA, K-means clustering, NMF, and LSA 
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score

# Import general use tools
from collections import defaultdict
from tqdm import tqdm_notebook
import pandas as pd
import numpy as np
import time
import dill

# Import visualization libraries
import matplotlib.pyplot as plt
import matplotlib
import plotly.express as px
import plotly
import random

import warnings 
warnings.filterwarnings('ignore')

In [3]:
# Import stop_words and create list to hold additions
# Corpus-specific stop words (legal boilerplate, reporter abbreviations, stray numbers).
# Every entry needs its own comma: adjacent string literals are silently concatenated
# by Python, which is how '419' and '____' previously collapsed into '419____'.
# NOTE(review): 'new york' is a two-word entry; if stop words are matched per token
# it can never match -- confirm whether it is needed.
my_additional_stop_words=['15','2628','446','2607','419','____','petition','supreme','rehearing','sugg','plaintiff',
                          'error','employés', '000','ch','said','company','united', 'federal', 'district', 'right',
                          'id', 'opinion','law', 'case', 'state', 'court','sentence','petitioner','pub','280','ch',
                          'statute','case','ct', 'mr', 'ถถ', 'งง', 'zzz','supra','infra','appellant','appellee', 'id',
                          '413', '93', '37','1973','act', 'make', 'ante', 'cite', 'claim', 'respondent','rule','shall',
                          'judgment','say', 'ed', '2d', 'ct','rev','sup','rep','new york','york']

# Update the in-built stopwords list
stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

In [4]:
df = pd.read_csv('Corpora/14th_cleaned_corpora.csv')

In [5]:
df.sort_values(by="year", inplace=True)

In [6]:
# one missing year
df.at[1807, 'year'] = 2019

In [9]:
df.resource_uri.iloc[977]

'https://www.courtlistener.com:80/api/rest/v3/clusters/99548/'

### Create a Bag of Words with TF-IDF

Get columns required for analysis

In [10]:
cols = ['id_x', 'year', 'case_name', 'corpora']

In [11]:
corp = df[cols].copy()

In [12]:
corp

Unnamed: 0,id_x,year,case_name,corpora
1523,88200,1870.0,Worthy v. Commissioners,wall worthyvthe commissionerssupreme court uni...
1789,88503,1872.0,Osborn v. Nicholson,wall osbornvnicholson et alsupreme court unite...
3851,88662,1873.0,Bradwell v. State,wall bradwellvthe statesupreme court united st...
725,88661,1873.0,Slaughter-House Cases,wall slaughter-house casesthe butcher benevole...
250,88800,1874.0,Bartemeyer v. Iowa,wall bartemeyerviowasupreme court united state...
2508,88998,1875.0,Minor v. Happersett,wall minorvhappersettsupreme court united stat...
1045,89115,1875.0,Scholey v. Rew,wall scholeyvrewsupreme court united state mr ...
3904,89233,1876.0,Raymond v. Thomas,raymondvthomassupreme court united statesmr p ...
4491,89245,1876.0,Walker v. Sauvinet,walkervsauvinetsupreme court united state mr c...
422,89266,1876.0,United States v. REESE,united statesvreese et alsupreme court united ...


In [13]:
# fit method creates bag of words. See them with .get_feature_names()
# This returns a sparse matrix. For a dense matrix, you could perform:
# pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())

# Create a bag of words using ngrams up to 3 words long

vect = CountVectorizer(ngram_range=(1,3))

In [14]:
def make_bow(df_col):
    """Build a bag-of-words document-term matrix for one text column.

    Parameters
    ----------
    df_col : pandas.Series
        Column holding one document (string) per row.

    Returns
    -------
    wm : sparse matrix
        Result of ``CountVectorizer.fit_transform`` on the documents.
    tokens : list of str
        Feature names reported by the fitted vectorizer.
    vocab : dict
        The fitted vectorizer's ``vocabulary_`` mapping.
    """
    # stop_words MUST be passed by keyword: the first positional parameter of
    # CountVectorizer is `input`, so the previous positional call never applied
    # the custom stop-word list (and is a TypeError on releases where the
    # parameters are keyword-only).  list() because recent scikit-learn
    # releases validate this parameter as a list, not a frozenset.
    custom_vec = CountVectorizer(stop_words=list(stop_words), strip_accents="unicode")
    corpora = df_col.values.flatten().tolist()
    wm = custom_vec.fit_transform(corpora)
    vocab = custom_vec.vocabulary_

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when present and keep returning a plain list either way.
    if hasattr(custom_vec, "get_feature_names_out"):
        tokens = custom_vec.get_feature_names_out().tolist()
    else:
        tokens = custom_vec.get_feature_names()

    return wm, tokens, vocab
# Create the matrix and get the features and vocab on the whole corpus
wordmatrix, features, vocab = make_bow(corp['corpora'])

In [15]:
# Create a dataframe of bag of words vocabulary
def bow_df(vocab):
    """Tabulate a vocabulary mapping as a two-column DataFrame.

    Parameters
    ----------
    vocab : dict
        Mapping of term -> integer.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Word'`` and ``'Count'``, sorted by ``'Count'`` descending.
        The original row index is kept (not reset).  An empty mapping yields
        an empty frame with the same two columns instead of raising.

    NOTE(review): when ``vocab`` is the ``vocabulary_`` returned by
    ``make_bow``, the values are presumably column indices of the
    document-term matrix rather than term frequencies, so the 'Count' label
    would be misleading -- confirm before interpreting this table.
    """
    # Passing `columns=` up front (instead of renaming afterwards) keeps the
    # frame well-formed even when the vocabulary is empty.
    count_df = pd.DataFrame(list(vocab.items()), columns=['Word', 'Count'])
    return count_df.sort_values(by='Count', ascending=False)

# count_df = bow_df(vocab)

## Perform Latent Semantic Analysis on the corpus as a whole

In [16]:
## Create a pipeline to perform LSA
## Create a vectorizer to convert raw documents to TF/IDF matrix
#vectorizer = TfidfVectorizer(stop_words=stop_words,
#                             strip_accents="unicode",
#                             ngram_range=(1,2),
#                             use_idf=True, 
#                             smooth_idf=True)
#
## This normalizes the vector (L2 norm of 1.0) to neutralize 
## the effect of document length on tf-idf.
#
#normalizer = Normalizer(copy=False)
#
## Perform singular value decomposition:
## Project the tfidf vectors onto the first N principal components.
#
#svd_model = TruncatedSVD(n_components=100,         # number of dimensions
#                         algorithm='randomized',
#                         n_iter=10)
#
#lsa_transformer = Pipeline([('tfidf', vectorizer), 
#                            ('svd', svd_model),
#                            ('norm', normalizer)])

In [17]:
def lsa_transform(corpora, transformer=None):
    """Fit an LSA pipeline on ``corpora`` and plot the strongest terms per component.

    Parameters
    ----------
    corpora : iterable of str
        Documents to fit on.
    transformer : sklearn.pipeline.Pipeline, optional
        Pipeline whose first step is a vectorizer and whose second step exposes
        ``components_`` (e.g. TruncatedSVD).  When omitted, a notebook-level
        ``lsa_transformer`` is used if one exists; otherwise the default
        TF-IDF -> TruncatedSVD(100) -> Normalizer pipeline is built here.

    Returns
    -------
    lsa_matrix : array
        Output of ``transformer.fit_transform(corpora)``.
    feat_names : list of str
        Feature names of the fitted vectorizer.
    vocab : dict
        The fitted vectorizer's ``vocabulary_``.

    Side effects: prints the vectorizer parameters, shows one bar chart per
    plotted component and saves each as ``terms_for_component_<n>``.
    """
    if transformer is None:
        try:
            # Honour a pipeline defined at notebook level, as before.
            transformer = lsa_transformer
        except NameError:
            # The cell that used to define `lsa_transformer` is commented out,
            # so build the same pipeline locally instead of failing.
            transformer = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words),
                                          strip_accents="unicode",
                                          ngram_range=(1,2),
                                          use_idf=True,
                                          smooth_idf=True)),
                ('svd', TruncatedSVD(n_components=100,
                                     algorithm='randomized',
                                     n_iter=10)),
                ('norm', Normalizer(copy=False)),
            ])

    lsa_matrix = transformer.fit_transform(corpora)
    vectorizer = transformer.steps[0][1]
    svd = transformer.steps[1][1]
    print(f"tf-idf params: {vectorizer.get_params()}")

    # get_feature_names() was removed in scikit-learn 1.2.
    if hasattr(vectorizer, "get_feature_names_out"):
        feat_names = vectorizer.get_feature_names_out().tolist()
    else:
        feat_names = vectorizer.get_feature_names()
    vocab = vectorizer.vocabulary_

    # Plot at most the first 10 components; small corpora can yield fewer.
    n_plotted = min(10, svd.components_.shape[0])
    for component_num in range(n_plotted):
        comp = svd.components_[component_num]

        # argsort is ascending, so the last 10 indices are the 10 largest
        # weights, already ordered so that barh draws the strongest on top.
        top = np.argsort(comp)[-10:]
        terms = [feat_names[weight_index] for weight_index in top]
        weights = [comp[weight_index] for weight_index in top]
        positions = np.arange(len(terms)) + .5    # Center the bar on the y axis.

        fig = plt.figure(component_num)
        plt.barh(positions, weights, align="center")
        plt.yticks(positions, terms)
        plt.xlabel("Weight")
        plt.title(f"Strongest terms for component {component_num+1}")
        plt.grid(True)
        plt.savefig(f"terms_for_component_{component_num+1}")
        plt.show()
        # Release the figure so repeated calls do not draw onto a stale one.
        plt.close(fig)

    return lsa_matrix, feat_names, vocab

# lsa_matrix, feat_names, lsa_vocab = lsa_transform(corp['corpora'])

## Rolling window topic modeling

In [18]:
# Create year ranges and bin the data accordingly.
# To track evolution over time, the cases will be binned
# in a rolling fashion, with overlap, to smooth out
# the effects of the groupings. 

# ------------------------

# Include left, exclude right; half-closed, half-open interval [a, b)
# cf. pandas rolling() function

def build_year_ranges(overlap, first=1785, last=2019, increment=20):
    """Return the rolling ``(start, end)`` year windows used to bin cases.

    Window starts run from ``first`` up to (but excluding) ``last`` in steps
    of ``overlap``, and every window spans ``increment`` years.  Note that
    ``overlap`` therefore acts as the stride between consecutive window
    starts, not as the number of shared years.

    Returns a list of ``(start, start + increment)`` tuples.
    """
    window_starts = range(first, last, overlap)
    return [(start, start + increment) for start in window_starts]

# warning: years must have the same index as data
def put_data_under_year_ranges(data, years, year_ranges):
    """Group ``data`` items under every year window that contains their year.

    Parameters
    ----------
    data : positionally indexable sequence
        Items to bin; ``data[i]`` belongs to year ``years[i]``.
    years : positionally indexable sequence
        Year of each item, aligned with ``data``.
    year_ranges : iterable of (start, end)
        Windows treated as half-open intervals ``[start, end)``.

    Returns
    -------
    collections.defaultdict(list)
        Maps each window that received at least one item to the list of its
        items.  With overlapping windows an item lands in several lists.
    """
    binned = defaultdict(list)

    for idx in range(len(data)):
        for window in year_ranges:
            lower, upper = window[0], window[1]
            # Skip windows that do not contain this item's year.
            if not (lower <= years[idx] < upper):
                continue
            binned[window].append(data[idx])

    return binned
# ------------
# main
              
# bins = build_year_ranges(first_year, last_year, increment, overlap)
# binned_data = put_data_under_year_ranges(cases, years, bins)

In [19]:
# Convert DataFrame columns to lists (warning: keep indices aligned, 
# and make sure cases are sorted by year)

# Pull the four working columns out of `corp` as plain Python lists, in one pass.
corp_list, year_list, names_list, id_list = (
    corp[column].to_numpy().ravel().tolist()
    for column in ("corpora", "year", "case_name", "id_x")
)

In [20]:
# Pair every case name with its opinion text in a two-column numpy array
# (despite the name, `case_dict` is an ndarray, not a dict):
#   case_dict[:, 0] -> all case names
#   case_dict[:, 1] -> all case texts
# NOTE(review): np.array over (str, str) tuples presumably produces a
# fixed-width unicode array sized to the longest opinion, which can be very
# memory hungry -- confirm, and consider dtype=object if so.

case_dict = np.array(list(zip(names_list, corp_list)))

In [21]:
# Entire corpus paired with case ids, as a two-column numpy array:
#   case_14[:, 0] -> case id, case_14[:, 1] -> case text
# NOTE(review): mixing the ids with text in a single np.array presumably
# coerces the ids to strings -- confirm before comparing case_14[:, 0]
# against numeric id columns.
case_14 = np.array(list(zip(id_list, corp_list)))

In [31]:
year_ranges = build_year_ranges(overlap=5, first=1860, last=2019, increment=20)
# binned_data = put_data_under_year_ranges(corp_list, year_list, year_ranges) # dict of lists
binned_data = put_data_under_year_ranges(case_dict, year_list, year_ranges) # dict of arrays

In [32]:
# One (year_range, case_count) pair per bin, in the bins' insertion order.
years_k = [year_range for year_range in binned_data]
values_v = [len(cases) for cases in binned_data.values()]
num_cases = [(year_range, count) for year_range, count in zip(years_k, values_v)]

In [33]:
for k,v in binned_data.items():
    print(f"Number of cases decided circa {k}: {len(v)}")

Number of cases decided circa (1860, 1880): 18
Number of cases decided circa (1865, 1885): 39
Number of cases decided circa (1870, 1890): 80
Number of cases decided circa (1875, 1895): 142
Number of cases decided circa (1880, 1900): 226
Number of cases decided circa (1885, 1905): 344
Number of cases decided circa (1890, 1910): 460
Number of cases decided circa (1895, 1915): 579
Number of cases decided circa (1900, 1920): 710
Number of cases decided circa (1905, 1925): 761
Number of cases decided circa (1910, 1930): 771
Number of cases decided circa (1915, 1935): 729
Number of cases decided circa (1920, 1940): 648
Number of cases decided circa (1925, 1945): 576
Number of cases decided circa (1930, 1950): 537
Number of cases decided circa (1935, 1955): 488
Number of cases decided circa (1940, 1960): 443
Number of cases decided circa (1945, 1965): 505
Number of cases decided circa (1950, 1970): 602
Number of cases decided circa (1955, 1975): 877
Number of cases decided circa (1960, 1980):

In [34]:
# Use a dictionary comprehension to convert a list of lists
# to a numpy array for easier access to values

binned_data = {k: np.asarray(v) for k,v in binned_data.items()}

In [35]:
# with open ("binned_data.pik", "wb") as bdf:
#    dill.dump(binned_data, bdf)

In [36]:
# Bar chart of how many opinions fall in each rolling window.
num_df = pd.DataFrame(num_cases)
num_df.columns = ["Years", "Approx. No. Opinions Issued"]

def _window_midpoint(window):
    """Return the midpoint year of a (start, end) window."""
    return (window[0] + window[1]) / 2

# Plot each window at its midpoint year rather than as a tuple.
num_df["Years"] = num_df["Years"].apply(_window_midpoint)
fig = px.bar(num_df, x='Years', y='Approx. No. Opinions Issued')
fig.show()

In [None]:
# rewrite to use only binned_data keys
# need an assert statement to handle edge cases of too-small buckets
# make rolling a class

def LSA_per_bin(corpora, ngrams=(1,2), ntop=50):
    """Run LSA (TF-IDF -> TruncatedSVD -> L2 normalisation) on one bin of documents.

    Parameters
    ----------
    corpora : sequence of str
        Documents of the bin; must support ``len()``.
    ngrams : tuple of (int, int)
        ``ngram_range`` passed to the TF-IDF vectorizer.
    ntop : int
        Upper bound on the number of terms returned.

    Returns
    -------
    dict
        ``"terms"`` / ``"weights"``: the ``ceiling`` highest-weighted terms
        (ascending by weight) of SVD component ``ceiling - 1``, where
        ``ceiling = min(len(corpora), ntop, number of fitted components)``;
        ``"matrix"``: the pipeline output for the bin;
        ``"feat_names"``: vectorizer feature names;
        ``"vocab"``: the vectorizer's ``vocabulary_``.

    Side effects: shows and saves one bar chart for each of the first (up to)
    10 components.  File names do not include the bin, so successive bins
    overwrite each other's ``terms_for_component_<n>`` images.
    """
    vectorizer = TfidfVectorizer(stop_words=list(stop_words),  # list: newer sklearn rejects a frozenset
                                 strip_accents="unicode",
                                 ngram_range=ngrams,
                                 use_idf=True,
                                 smooth_idf=True)
    svd_model = TruncatedSVD(n_components=100,
                             algorithm='randomized',
                             n_iter=10)
    normalizer = Normalizer(copy=False)

    lsa_transformer = Pipeline([('tfidf', vectorizer),
                                ('svd', svd_model),
                                ('norm', normalizer)])

    lsa_matrix = lsa_transformer.fit_transform(corpora)
    fitted_vectorizer = lsa_transformer.named_steps['tfidf']
    components = lsa_transformer.named_steps['svd'].components_

    # get_feature_names() was removed in scikit-learn 1.2.
    if hasattr(fitted_vectorizer, "get_feature_names_out"):
        feat_names = fitted_vectorizer.get_feature_names_out().tolist()
    else:
        feat_names = fitted_vectorizer.get_feature_names()
    vocab = fitted_vectorizer.vocabulary_

    # Never index past the components that were actually fitted.
    ceiling = min(len(corpora), ntop, components.shape[0])

    # NOTE(review): the previous loop overwrote its result on every pass, so
    # only the LAST examined component (index ceiling - 1) was ever returned.
    # That behaviour is kept so stored results stay comparable, but the first
    # (strongest) component may have been the intent -- confirm.
    if ceiling > 0:
        comp = components[ceiling - 1]
        # argsort is ascending: the last `ceiling` indices are the largest
        # weights, in the same ascending order the old reverse/slice/reverse gave.
        top = np.argsort(comp)[-ceiling:]
        bin_terms = [feat_names[weight_index] for weight_index in top]
        bin_weights = [comp[weight_index] for weight_index in top]
    else:
        bin_terms, bin_weights = [], []

    # Plot the top 10 terms of each of the first (up to) 10 components.  This
    # loop used to be nested with a broken indent (IndentationError).
    for component_num in range(min(10, components.shape[0])):
        comp = components[component_num]
        top = np.argsort(comp)[-10:]
        terms = [feat_names[weight_index] for weight_index in top]
        weights = [comp[weight_index] for weight_index in top]
        positions = np.arange(len(terms)) + .5    # Center the bar on the y axis.

        fig = plt.figure(component_num)
        plt.barh(positions, weights, align="center")
        plt.yticks(positions, terms)
        plt.xlabel("Weight")
        plt.title(f"Strongest terms for component {component_num+1}")
        plt.grid(True)
        plt.savefig(f"terms_for_component_{component_num+1}")
        plt.show()
        # Close so the next bin does not draw onto this figure number.
        plt.close(fig)

    return {"terms": bin_terms, "weights": bin_weights, "matrix": lsa_matrix,
            "feat_names": feat_names, "vocab": vocab}
        
def rolling_LSA(binned_data):
    """Run ``LSA_per_bin`` on every bin and collect the results per year range.

    ``binned_data`` maps each year range to a 2-D array; column 1 of that
    array is what gets passed to ``LSA_per_bin``.  Returns a
    ``defaultdict(dict)`` keyed by the same year ranges.
    """
    results = defaultdict(dict)

    for year_range in tqdm_notebook(binned_data.keys()):
        bin_texts = binned_data[year_range][:, 1]
        results[year_range] = LSA_per_bin(bin_texts)
        print(f"Running cases from: {year_range}")

    return results

In [74]:
lsa = rolling_LSA(binned_data)

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Running cases from: (1860, 1880)
Running cases from: (1870, 1890)
Running cases from: (1880, 1900)
Running cases from: (1890, 1910)
Running cases from: (1900, 1920)
Running cases from: (1910, 1930)
Running cases from: (1920, 1940)
Running cases from: (1930, 1950)
Running cases from: (1940, 1960)
Running cases from: (1950, 1970)
Running cases from: (1960, 1980)
Running cases from: (1970, 1990)
Running cases from: (1980, 2000)
Running cases from: (1990, 2010)
Running cases from: (2000, 2020)
Running cases from: (2010, 2030)



In [78]:
def show_top_terms(lsa):
    """Print the stored ``'terms'`` entry for every key of ``lsa``."""
    for year_range, result in lsa.items():
        top_terms = result['terms']
        print(f"Top terms for {year_range}:\n{top_terms}")
          
# N.B.: Proper indexing is lsa[k][0]['terms'] if model_ranges is dict rather than defaultdict

In [173]:
show_top_terms(lsa)

Top terms for (1860, 1880):
['section', 'prerequisite', 'stock', 'slaughter houses', 'houses', 'immunity', 'cattle', 'privilege immunity', 'slavery', 'monopoly', 'animal', 'city', 'corporation', 'exclusive', 'exclusive privilege', 'citizen', 'slaughter', 'privilege']
Top terms for (1870, 1890):
['nuisance', 'gas', 'safety', 'care', 'voter', 'year', 'tax claimed', 'military code', 'qualification', 'grain', 'denies equal', 'conduct business', 'special character', 'clause fourteenth', 'treat alike', 'use steam', 'suffrage', 'railroad corporation', 'intoxicate liquor', 'legislation special', 'berry', 'berry saunders', 'injustice', 'use', 'military', 'intoxicate', 'property', 'liability impose', 'contention', 'injury subsequently', 'hardship', 'legislation', 'employment immediate', 'wrong negligence', 'kind legislation', 'immediate direction', 'instance kind', 'militia', 'election', 'member congress', 'vote', 'kansa', 'servant', 'negligence', 'hardship injustice', 'liquor', 'incompetency', 

What is the optimal number of clusters for the whole 14th corpus?

In [None]:
# Scan k = 2 .. k_max-1 and record inertia ("distorsion") and silhouette score
# for each k, to help choose the number of clusters for the whole corpus.
from sklearn.metrics import silhouette_score
from tqdm import tqdm_notebook
distorsions = []
sil_scores = []
k_max = 25
vectorizer = TfidfVectorizer(max_df=0.95,
                             min_df=1, ngram_range=(1,2),
                             stop_words=list(stop_words),  # list: newer sklearn rejects a frozenset
                             strip_accents="unicode",
                             use_idf=True, smooth_idf=True)

# `normalizer` was never defined at notebook level (only inside LSA_per_bin),
# so the pipeline below used to fail with a NameError.
normalizer = Normalizer(copy=False)

# The TF-IDF matrix does not depend on k: vectorise once instead of
# refitting the vectorizer on the whole corpus for every k.
vz = vectorizer.fit_transform(case_dict[:,1])

for k in tqdm_notebook(range(2, k_max)):
    kmeans_model = MiniBatchKMeans(n_clusters=k, init='k-means++', n_init=1, random_state=42,
                                   init_size=500, verbose=True, max_iter=1000)

    # Same computation as Pipeline(tfidf -> km -> norm).fit_transform:
    # cluster, map each document to its distances from the centroids, then
    # L2-normalise those distance vectors.
    km = normalizer.fit_transform(kmeans_model.fit_transform(vz))

    sil_scores.append(silhouette_score(km, kmeans_model.labels_))
    distorsions.append(kmeans_model.inertia_)

f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(15, 10))

ax1.plot(range(2, k_max), distorsions)
ax1.set_title('Distorsion vs num of clusters')
ax1.grid(True)

ax2.plot(range(2, k_max), sil_scores)
ax2.set_title('Silhouette score vs num of clusters')
ax2.grid(True)

In [29]:
def kmeans_cluster(corpora, n_clusters=8, ngrams=(1,2), random_state=None):
    """Cluster documents with TF-IDF + MiniBatchKMeans and report the top terms.

    Parameters
    ----------
    corpora : sequence of str
        Documents to cluster; must support ``len()``.
    n_clusters : int
        Number of clusters.
    ngrams : tuple of (int, int)
        ``ngram_range`` for the TF-IDF vectorizer.
    random_state : int or None, optional
        Seed forwarded to MiniBatchKMeans.  The default ``None`` keeps the
        previous unseeded behaviour; pass an int for reproducible clusters.

    Returns
    -------
    dict
        Integer keys ``0 .. n_clusters-1`` map to the 10 highest-weighted
        terms of each cluster centre; ``"labels"`` holds ``km.labels_``,
        ``"vocab"`` the vectorizer's ``vocabulary_`` and ``"matrix"`` the
        TF-IDF matrix ``X``.

    Side effects: prints timings, the silhouette coefficient and the top terms.
    """
    print(f"{len(corpora)} documents")

    print("Extracting features from the training dataset "
          "using a sparse vectorizer")
    t0 = time.time()

    vectorizer = TfidfVectorizer(max_df=0.95,
                                 min_df=1, ngram_range=ngrams,
                                 stop_words=list(stop_words),  # list: newer sklearn rejects a frozenset
                                 use_idf=True)
    km = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++', n_init=100,
                         init_size=500, batch_size=1000,
                         random_state=random_state)
    X = vectorizer.fit_transform(corpora)

    print(f"done in {(time.time() - t0)}")
    print("n_samples: %d, n_features: %d" % X.shape)
    print()

    print(f"Clustering sparse data with {km}")
    t0 = time.time()
    km.fit(X)
    print(f"done in {(time.time() - t0)}")
    print()

    print(f"Silhouette Coefficient: {silhouette_score(X, km.labels_)}") # sample size=5000

    print()
    print(f"Top terms per cluster:")
    # Feature indices of every centre, highest weight first.
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    # get_feature_names() was removed in scikit-learn 1.2.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out().tolist()
    else:
        terms = vectorizer.get_feature_names()

    # Collect and print the top terms in a single pass (previously the same
    # loop was written twice).  Printed output is unchanged.
    clusters = {}
    for i in range(n_clusters):
        print(f"Cluster {i}:")
        clusters[i] = [terms[ind] for ind in order_centroids[i, :10]]
        for term in clusters[i]:
            print(f'{term}')
            print()

    clusters["labels"] = km.labels_
    clusters["vocab"] = vectorizer.vocabulary_
    clusters["matrix"] = X

    return clusters

def rolling_kmeans(binned_data, n_clusters=8, ngrams=(1,2)):
    """Run ``kmeans_cluster`` on every bin and collect the results per year range.

    ``binned_data`` maps each year range to a 2-D array; column 1 of that
    array is what gets clustered.  ``n_clusters`` and ``ngrams`` are passed
    through to ``kmeans_cluster``.  Returns a ``defaultdict(dict)`` keyed by
    the same year ranges.
    """
    results = defaultdict(dict)

    for year_range in tqdm_notebook(binned_data.keys()):
        bin_texts = binned_data[year_range][:, 1]
        results[year_range] = kmeans_cluster(bin_texts, n_clusters, ngrams)
        print(f"Running cases from: {year_range}")
        print()

    return results

## Run K means analysis on the corpus as a whole

In [None]:
cluster_14 = kmeans_cluster(case_14[:,1], n_clusters=10)

In [None]:
dict_to_df = cluster_14.copy() 

In [170]:
#with open("14th_am_kmeans.pik", "wb") as kfl:
#    dill.dump(dict_to_df, kfl)

In [168]:
### Export ###
# One row per case: id, name, k-means topic label and year.
df_14 = pd.DataFrame(data={"case_id":case_14[:,0],"name":case_dict[:,0],"topics": dict_to_df["labels"],"year":year_list})
# 20-year bin edges starting at 1869.
bins = list(range(1869, 2040, 20))
# The frame must NOT be rebuilt from `dict_to_df` here: that dict mixes
# per-cluster term lists, the label array, the vocabulary and the TF-IDF
# matrix, and it has no "year" column for the cut below.
df_14["bins"] = pd.cut(x=df_14["year"],bins=bins)

# with open("bin_topic.pik", "wb") as dfp:
#    dill.dump(df_14, dfp)

### Perform K means analysis with a rolling window

In [None]:
kmb = rolling_kmeans(binned_data, n_clusters=5, ngrams=(1,2))

#### TSNE 

In [None]:
# NOTE(security): dill.load can execute arbitrary code from the file -- only
# load pickles you created yourself.
# NOTE(review): the cell that writes "14th_am_kmeans.pik" is commented out, so
# this load depends on a file produced by an earlier session.
with open("14th_am_kmeans.pik", "rb") as kfl:
    cluster_14 = dill.load(kfl)
from sklearn.manifold import TSNE

# Calling .toarray() on the full TF-IDF matrix materialises a dense
# documents x vocabulary array.  Reduce the sparse matrix with TruncatedSVD
# first (the approach scikit-learn recommends for sparse input) and embed that.
tfidf_matrix = cluster_14["matrix"]
n_svd_components = max(1, min(50, tfidf_matrix.shape[1] - 1))
X_reduced = TruncatedSVD(n_components=n_svd_components,
                         random_state=42).fit_transform(tfidf_matrix)

# Seeded so the embedding is reproducible across runs.
X_embedded = TSNE(n_components=2, random_state=42).fit_transform(X_reduced)

In [None]:
with open("X_embedded.pik", "wb") as xfl:
    dill.dump(X_embedded, xfl)

In [None]:
# Scatter plot of the 2-D t-SNE embedding.
# `go`, `py`, `colors` and `cmap` were never defined in this notebook, so the
# cell could not run; they are made explicit here.
import plotly.graph_objects as go

X_plot = X_embedded.T
# NOTE(review): colouring by k-means label is an assumption -- the embedding
# was computed from cluster_14["matrix"], so its labels are the natural choice.
colors = cluster_14["labels"]
# NOTE(review): the intended colour scale is unknown; "Viridis" is a placeholder.
cmap = "Viridis"

fig = go.Figure(go.Scatter(x=X_plot[0], y=X_plot[1],
                           mode='markers',
                           marker=dict(color=colors,
                                       colorscale=cmap,
                                       showscale=False,
                                       line=dict(color='black', width=1))))
fig.show()

In [75]:
with open('kmb-2.pik', 'wb') as k:
    dill.dump(kmb, k)
#with open('lsa-2.pik', 'wb') as ll:
#    dill.dump(lsa, ll)

In [None]:
#with open("kmb.pik", "rb") as rk:
#    kmb = dill.load(rk)
#with open("lsa.pik", "rb") as ll:
#    lsa = dill.load(ll)
#with open("binned_data.pik", "rb") as bdk:
#    binned_data = dill.load(bdk)

In [None]:
# get all case names 
# For every year range, store column 0 of that bin's array (wrapped in a
# one-element list, since each key is appended to exactly once).
case_names = defaultdict(list)
for year_range, bin_rows in binned_data.items():
    case_names[year_range].append(bin_rows[:, 0])

In [None]:
def topic_add(kmb, data):
    """Pair every item of each bin with its cluster label.

    For each key of ``kmb``, zips ``data[key]`` with ``kmb[key]["labels"]``
    into ``[item, label]`` lists.  Returns a ``defaultdict(list)`` where each
    key holds a one-element list containing that list of pairs.
    """
    paired = defaultdict(list)
    for year_range in kmb.keys():
        items_with_labels = [
            list(pair)
            for pair in zip(data[year_range], kmb[year_range]["labels"])
        ]
        paired[year_range].append(items_with_labels)
    return paired
topic_and_data = topic_add(kmb, binned_data)

In [None]:
# names only? names with corpora
# `topic_zip` is not defined anywhere in this notebook; the helper that pairs
# bin data with cluster labels is `topic_add`.
topic_and_data = topic_add(kmb, binned_data)

### Get geographic data from citations

In [None]:
import re
na_cites = df[df.citations.isna()]

USCITE = df.citations.str.findall(re.compile(r"'reporter'\: 'U\.S\.'"))
USCITE[USCITE.isna()] # df.iloc[1807]

In [None]:
df["USR"] = df.citations.str.findall(r"\'volume\'\:\s(\d{2,3})\,\s\'reporter\'\:\s\'(U\.S\.)\'\,\s\'page\'\:\s\'(\d{2,4})\'")
df["USR"].isna().any()

In [None]:
# Each "USR" entry comes from str.findall with three capture groups, i.e. a
# list of (volume, reporter, page) tuples.  Build "<volume>/<page>" from the
# FIRST citation; rows with no match (empty list or NaN) yield None.
urls = df["USR"].apply(
    lambda cites: f"{cites[0][0]}/{cites[0][2]}" if isinstance(cites, list) and cites else None
)

In [None]:
df.citations.str.replace(r"((?<=\'volume\'\:\s)(\d{2,3}).*?(?<=\'reporter\'\:\s\')(U\.S\.)(?=\'\,\s\'page\'\:\s\').*?(\d{3})(?=\'\,))", 
                                                       r"\2\3\4", regex=True)

#usrep = df.citations.str.extractall(r"(\d{2,3}U\.S\.\d{2,4})")


## Object oriented / classes for rolling, frequency dictionaries

In [None]:
from inspect import Parameter, signature


class RollingFunc:
    """Apply one modelling function to every bin of year-binned data.

    Parameters
    ----------
    function : callable
        Called once per bin as ``function(texts, **function_kwargs)``.
    binned_data : mapping
        Maps each year range to a 2-D array; column 1 is passed to ``function``.
    **kwargs
        Extra keyword arguments for ``function``.  Only names that
        ``function`` accepts are kept (all of them if it takes ``**kwargs``).

    Attributes
    ----------
    model_ranges : collections.defaultdict(dict)
        Results per year range, filled in by ``roll``.
    function_kwargs : dict
        The keyword arguments that will be forwarded to ``function``.
    """

    def __init__(self, function, binned_data, **kwargs):
        self.model_ranges = defaultdict(dict)
        self.binned_data = binned_data
        self.function = function

        takes_var_keyword = any(
            p.kind is Parameter.VAR_KEYWORD
            for p in signature(function).parameters.values()
        )
        accepted = set(self.get_kwargs(function))
        # Drop keywords the function cannot receive instead of failing later
        # with a TypeError in the middle of a long run.
        self.function_kwargs = {
            name: value for name, value in kwargs.items()
            if takes_var_keyword or name in accepted
        }

    @staticmethod
    def get_kwargs(function, required_only=False):
        """Get names of callable arguments.

        Special arguments (like ``*args`` and ``**kwargs``) are not included
        in the output.

        If ``required_only`` is True, optional arguments (with default values)
        are not included in the output.
        """
        return [
            name
            for name, p in signature(function).parameters.items()
            if p.kind not in (Parameter.VAR_POSITIONAL, Parameter.VAR_KEYWORD)
            and not (required_only and p.default is not Parameter.empty)
        ]

    def roll(self, binned_data=None):
        """Run the function on every bin and return ``model_ranges``.

        ``binned_data`` defaults to the data given at construction time.
        """
        data = self.binned_data if binned_data is None else binned_data
        for y in tqdm_notebook(data.keys()):
            # [:,1] selects the second column of each bin's array.
            self.model_ranges[y] = self.function(data[y][:, 1], **self.function_kwargs)
            print(f"Running cases from: {y}")

        return self.model_ranges