In [2]:
import numpy as np
import pandas as pd

# Load the JSON file into a pandas DataFrame; lines=True reads one record per line.
proj_df = pd.read_json(path_or_buf='combined.json', lines=True)

In [27]:
# Preview the first five records of the original data, as loaded from the JSON file.
proj_df.head(n=5)

Unnamed: 0,components,contents,date,id,title,topics
0,[National Security Division (NSD)],"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01 04:00:00,,Convicted Bomb Plotter Sentenced to 30 Years,[]
1,[Environment and Natural Resources Division],WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25 04:00:00,12-919,$1 Million in Restitution Payments Announced t...,[]
2,[Environment and Natural Resources Division],BOSTON– A $1-million settlement has been...,2011-08-03 04:00:00,11-1002,$1 Million Settlement Reached for Natural Reso...,[]
3,[Environment and Natural Resources Division],WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08 05:00:00,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,[]
4,[Environment and Natural Resources Division],"The U.S. Department of Justice, the U.S. Envir...",2018-07-09 04:00:00,18-898,$100 Million Settlement Will Speed Cleanup Wor...,[Environment]


In [26]:
# Size of the full, unfiltered dataset as (rows, columns):
# there were originally 13,087 rows and 6 columns.
proj_df.shape

(13087, 6)

**We are focusing only on the last two years of the data (2017 and 2018), so the next step is to filter the dataset down to those two years.**

In [23]:
# Filter by last two years
import datetime as dt

# Make sure 'date' is a datetime column so the .dt accessor below is available.
proj_df['date'] = pd.to_datetime(proj_df['date'])

# Keep 2017 and 2018 only (both bounds inclusive). An open-ended `>= 2017` would
# silently pull in later years if the dataset ever contains them.
# .copy() makes df_1718 an independent frame rather than a slice of proj_df, so
# later column assignments on it cannot trigger SettingWithCopyWarning.
# NOTE(review): rows 15 and 16 in the preview appear to have identical
# id/title/contents -- check whether the data contains duplicate records.
in_scope = proj_df['date'].dt.year.between(2017, 2018)
df_1718 = proj_df[in_scope].copy() # 2208 rows, nice!
df_1718.head()

Unnamed: 0,components,contents,date,id,title,topics
4,[Environment and Natural Resources Division],"The U.S. Department of Justice, the U.S. Envir...",2018-07-09 04:00:00,18-898,$100 Million Settlement Will Speed Cleanup Wor...,[Environment]
13,"[Criminal Division, USAO - Tennessee, Middle]",A 62-count second-superseding indictment was r...,2018-03-08 05:00:00,18-285,19 Members and Associates of Tennessee Mongols...,[]
15,"[National Security Division (NSD), USAO - Cali...","Federal authorities arrested Yi-Chi Shih, 62, ...",2018-01-23 05:00:00,18-78,2 Men Charged With Conspiring to Illegally Obt...,[]
16,"[National Security Division (NSD), USAO - Cali...","Federal authorities arrested Yi-Chi Shih, 62, ...",2018-01-23 05:00:00,18-78,2 Men Charged With Conspiring to Illegally Obt...,[]
19,"[Environment and Natural Resources Division, U...",The United States Attorney’s Office for the Mi...,2017-12-14 05:00:00,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,[Environment]


**Now we have 2208 rows and 6 columns. What we wish to do next is merge the 'title' and 'contents' columns to represent the content of each "document". The 'components' and/or 'topics' columns will be considered our true y labels. The 'id' column will be disregarded, as we do not believe it adds any value to our analysis.**

In [53]:
# Build one text per press release: the title, a single space, then the contents.
# Despite its name, documents_df is a pandas Series (one string per row).
documents_df = df_1718["title"].add(' ').add(df_1718["contents"])

documents_df.head()

4     $100 Million Settlement Will Speed Cleanup Wor...
13    19 Members and Associates of Tennessee Mongols...
15    2 Men Charged With Conspiring to Illegally Obt...
16    2 Men Charged With Conspiring to Illegally Obt...
19    2017 Southeast Regional Animal Cruelty Prosecu...
dtype: object

In [56]:
# Unpack the Series into a plain Python list of strings to run topic modeling on.
documents_list = documents_df.tolist()

# Sanity check: the number of documents should be 2208.
len(documents_list)

2208

In [68]:
# how do we want to set max_df, min_df, etc?

from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer tokenizes the documents and builds a document-term count matrix.
# Document frequency (df) is the number of documents a word appears in:
#   max_df=0.95          -> drop words that occur in more than 95% of documents
#   min_df=2             -> drop words that occur in fewer than 2 documents
#   stop_words='english' -> drop common English stop words
# These values were taken from an LDA demo on news reports.
tf_vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
tf = tf_vectorizer.fit_transform(documents_list)

In [69]:
# Preview the learned vocabulary (word -> integer index).
# Only the first 50 entries are shown: the full mapping holds thousands of words,
# and dumping all of it floods the notebook output and bloats the saved file.
# From this sample it doesn't look like any cleaning was required, although
# purely numeric tokens such as '100' are kept.
dict(list(tf_vectorizer.vocabulary_.items())[:50])

{'100': 35,
 'million': 9754,
 'settlement': 13374,
 'speed': 13847,
 'cleanup': 3287,
 'work': 15986,
 'manor': 9355,
 'superfund': 14306,
 'site': 13640,
 'north': 10332,
 'providence': 11787,
 'justice': 8374,
 'environmental': 5560,
 'protection': 11768,
 'agency': 1142,
 'epa': 5567,
 'rhode': 12695,
 'island': 8114,
 'management': 9326,
 'announced': 1462,
 'today': 14787,
 'subsidiaries': 14217,
 'stanley': 13961,
 'black': 2304,
 'decker': 4351,
 'industries': 7769,
 'agreed': 1159,
 'clean': 3282,
 'contaminated': 3763,
 'sediment': 13244,
 'soil': 13732,
 'restoration': 12594,
 'project': 11695,
 'johnston': 8295,
 'pleased': 11276,
 'reach': 12075,
 'resolution': 12559,
 'collaborative': 3396,
 'responsible': 12585,
 'parties': 10873,
 'stakeholders': 13943,
 'said': 12991,
 'acting': 964,
 'assistant': 1751,
 'general': 6639,
 'jeffrey': 8224,
 'wood': 15971,
 'environment': 5559,
 'natural': 10136,
 'resources': 12568,
 'division': 4964,
 'ends': 5463,
 'protracted': 11778

In [70]:
# Iterating a dict yields its keys, so this collects every word the vectorizer kept.
unique_vocabulary = list(tf_vectorizer.vocabulary_)

print("The number of unique words in the vocabulary is:", len(unique_vocabulary))

The number of unique words in the vocabulary is: 16157


**TODO: Before this next part we should get a list of the top 10 topics in the labelled data: run a counter over the 'components' and/or 'topics' columns and plot the result. That way we have a baseline to compare the LDA topics in the next part against.**

In [71]:
num_topics = 10

from sklearn.decomposition import LatentDirichletAllocation

# Fit an LDA topic model with num_topics topics on the document-term counts.
# random_state is fixed so that re-running the cell gives the same topics.
lda = LatentDirichletAllocation(
    random_state=94775,
    learning_method='online',
    n_components=num_topics,
)
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=94775, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [72]:
# lda.components_ has one row per topic and one column per vocabulary word,
# so this is confirming that we have 10 topics and 16,157 words per topic.
lda.components_.shape

(10, 16157)

In [75]:
# Obtaining the distribution of words for each topic.
# Each row of lda.components_ holds one topic's per-word pseudocounts; dividing a
# row by its own sum turns it into a probability distribution over the vocabulary.
topic_word_distributions = lda.components_ / lda.components_.sum(axis=1, keepdims=True)

num_top_words = 10

# Build the index -> word lookup once, outside the loops: the original called
# get_feature_names() for every printed word, rebuilding the full list each time.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installs.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    feature_names = tf_vectorizer.get_feature_names_out()
else:
    feature_names = tf_vectorizer.get_feature_names()

print('Displaying the top %d words per topic and their probabilities within the topic...' % num_top_words)
print()

for topic_idx in range(num_topics):
    print('[Topic ', topic_idx, ']', sep='')
    # argsort is ascending, so reverse it to walk from the most probable word down.
    sort_indices = np.argsort(topic_word_distributions[topic_idx])[::-1]
    for rank in range(num_top_words):
        word_idx = sort_indices[rank]
        print(feature_names[word_idx], ':', round(topic_word_distributions[topic_idx, word_idx]*100,4),"%")
    print()

Displaying the top 10 words per topic and their probabilities within the topic...

[Topic 0]
tax : 3.4095 %
division : 1.6896 %
irs : 1.4219 %
assistant : 1.2878 %
prison : 1.1611 %
district : 1.0952 %
returns : 1.0229 %
acting : 0.9573 %
general : 0.9299 %
years : 0.8351 %

[Topic 1]
division : 1.2333 %
justice : 1.0037 %
antitrust : 0.9164 %
court : 0.8325 %
customers : 0.6883 %
federal : 0.6688 %
financial : 0.64 %
today : 0.5741 %
complaint : 0.5696 %
district : 0.5426 %

[Topic 2]
justice : 1.4477 %
civil : 1.1239 %
settlement : 1.0175 %
district : 0.9463 %
act : 0.7652 %
states : 0.7465 %
division : 0.7119 %
said : 0.7093 %
united : 0.6417 %
child : 0.6393 %

[Topic 3]
servicemembers : 3.1182 %
religious : 1.841 %
rabobank : 1.4174 %
scra : 1.4088 %
rights : 1.1485 %
odometer : 1.0182 %
vehicles : 0.9531 %
bahn : 0.8375 %
vehicle : 0.8351 %
edwards : 0.8143 %

[Topic 4]
justice : 1.3719 %
general : 1.0648 %
enforcement : 1.0625 %
law : 1.0347 %
victims : 0.7273 %
states : 0.6609 