### Import Packages and py Files

In [31]:
from __future__ import absolute_import, division, print_function, unicode_literals

import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from tokenizer_function import spacy_tokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import pyLDAvis
import pyLDAvis.sklearn

import pandas as pd
import numpy as np
import requests
import json

from pandas.io.json import json_normalize
import pickle
from collections import ChainMap

### Get Patent Dataframe

In [2]:
df = pd.read_pickle("data/df.pkl")
df.head()

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind,patent_title_and_abstract
0,10603498,2020-03-31,Systems and methods for closed-loop determinat...,A method or system for facilitating the determ...,org_5cFCcVidnLqkMwKWc9s4,2020,utility,B2,Systems and methods for closed-loop determinat...
1,10603793,2020-03-31,Work assisting system including machine learni...,A work assisting system includes a sensor unit...,org_RQZono9Ir8KVdgNAaglV,2020,utility,B2,Work assisting system including machine learni...
2,10603797,2020-03-31,"Machine learning device, robot system, and mac...",A machine learning device for learning a motio...,org_RQZono9Ir8KVdgNAaglV,2020,utility,B2,"Machine learning device, robot system, and mac..."
3,10605228,2020-03-31,Method for controlling operation of a wind tur...,A method for controlling operation of a wind t...,org_VIvs7w0sts1aCjlrKaiG,2020,utility,B2,Method for controlling operation of a wind tur...
4,10605702,2020-03-31,Fluid analysis and monitoring using optical sp...,"Systems, methods, and computer-program product...",org_aHdfa1XsbUURjnXmlGyp,2020,utility,B2,Fluid analysis and monitoring using optical sp...


### Tokenize, Vectorize Text

In [3]:
cv = CountVectorizer(tokenizer=spacy_tokenizer, max_df=0.7, min_df=0.02, ngram_range = (1,1), stop_words='english')

In [4]:
dtm = cv.fit_transform(df['patent_title_and_abstract'])

### LDA GridSearch to Identify Best Model Params

In [7]:
# Define Search Param
search_params = {'n_components': [5, 10, 15, 20, 25, 30],'learning_decay': [.3, .5, .7]}

# Init the Model
lda = LatentDirichletAllocation(max_iter=50, batch_size=500, learning_method='online')

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params, cv=3, verbose=2, n_jobs = -1)

# Do the Grid Search
model.fit(dtm)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 15.4min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed: 25.3min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=LatentDirichletAllocation(batch_size=500,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='online',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=50,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                                 total_

In [8]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(dtm))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 20}
Best Log Likelihood Score:  -292667.86477230635
Model Perplexity:  194.86776187864803


### Fit and Transform Vectorized Ngrams with Cross Validated LDA Model

In [9]:
# Build LDA Model with GridSearch params
lda_model = LatentDirichletAllocation(n_components=20,            # Number of topics
                                      learning_decay=0.5,         
                                      max_iter=50,                # Max learning iterations
                                      learning_method='online',   
                                      random_state=42,            # Random state
                                      batch_size=500,             # n docs in each learning iter
                                      evaluate_every = -1,        # compute perplexity every n iters, default: Don't
                                      n_jobs = -1)                # Use all available CPU

print(lda_model)  # Model attributes

LatentDirichletAllocation(batch_size=500, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.5,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=50,
                          mean_change_tol=0.001, n_components=20, n_jobs=-1,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)


In [10]:
lda_output = lda_model.fit_transform(dtm)

In [11]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(dtm))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(dtm))

# See model parameters
print(lda_model.get_params())

Log Likelihood:  -837942.7257685919
Perplexity:  194.92215483566403
{'batch_size': 500, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.5, 'learning_method': 'online', 'learning_offset': 10.0, 'max_doc_update_iter': 100, 'max_iter': 50, 'mean_change_tol': 0.001, 'n_components': 20, 'n_jobs': -1, 'perp_tol': 0.1, 'random_state': 42, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


## Explore Topic Modeling Outputs

In [12]:
lda_model.components_

array([[5.00000006e-02, 8.55271582e+00, 5.00000026e-02, ...,
        5.00000008e-02, 5.00000011e-02, 5.00000005e-02],
       [5.00788301e-02, 3.42812520e+01, 6.17153830e+00, ...,
        5.00000002e-02, 1.38086216e+02, 5.00000001e-02],
       [4.23359265e+01, 5.00000011e-02, 1.33980879e+01, ...,
        1.07374758e+02, 4.38622462e+01, 5.00000034e-02],
       ...,
       [5.00000011e-02, 5.00001076e-02, 5.00000013e-02, ...,
        5.00000005e-02, 5.00000007e-02, 5.00000002e-02],
       [5.00000011e-02, 4.42494839e-01, 5.00000011e-02, ...,
        5.00000007e-02, 5.00000008e-02, 5.00000002e-02],
       [8.43268390e+00, 4.10313932e+01, 5.00000005e-02, ...,
        5.00000005e-02, 1.14252622e+01, 5.00000003e-02]])

In [13]:
# Number of words/tokens in first topic
len(lda_model.components_[0])

433

In [14]:
single_topic = lda_model.components_[0]

In [15]:
# Returns the indices that would sort this array.
single_topic.argsort()

array([352, 294, 236,  62, 194, 393, 286, 134, 271, 262, 288, 248, 367,
        27, 421, 345, 160, 182,  68, 270, 115,  18, 181, 112, 113, 128,
       313,  73, 386, 136, 320, 205, 240, 114,  44, 196, 398, 146,  70,
       373,  86, 429, 361, 310, 341,  74, 149, 176, 324, 165, 280, 427,
       226, 428, 253, 155, 197, 192, 300,  40, 158, 140,  29, 199, 231,
       127, 235,  43,  26, 183, 358, 175, 346, 187,  57, 303, 308,  88,
        39, 243, 424, 124, 409, 111, 171, 219, 261, 267, 275,  89, 351,
         7, 259, 396, 317, 377, 292,  97, 376,  15, 283,  80, 218, 241,
       117, 143,   4, 153, 269,  96, 103, 378, 336,  92, 318,   5, 100,
       123, 363,  56,  81, 420,  93, 276, 411,  72, 432, 330,  42,  45,
        63, 233, 201,  20, 338, 379, 162,  21,  55, 263, 130,   0, 395,
        65, 314, 159, 118, 195, 138, 381, 347, 141, 244, 120, 312, 122,
       375, 357,   6, 385, 290, 191, 391, 340,  85, 142, 392, 121, 304,
       227, 210, 407, 368, 133, 148, 311, 355, 132, 225, 277, 17

In [18]:
# Word least representative of this topic
single_topic[352]

0.05000000005160503

In [19]:
# Word most representative of this topic
single_topic[249]

1334.8905824366063

In [20]:
# Top 10 words for this topic:
single_topic.argsort()[-10:]

array([403, 305,  54, 399, 152,  53,  52, 125, 413, 249], dtype=int64)

In [21]:
top_word_indices = single_topic.argsort()[-10:]

In [22]:
for index in top_word_indices:
    print(cv.get_feature_names()[index])

training
problem
classify
train
extraction
classifier
classification
document
use
model


In [23]:
for index,topic in enumerate(lda_model.components_):
    print(f'THE TOP 10 NGRAMS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

THE TOP 10 NGRAMS FOR TOPIC #0
['training', 'problem', 'classify', 'train', 'extraction', 'classifier', 'classification', 'document', 'use', 'model']


THE TOP 10 NGRAMS FOR TOPIC #1
['use', 'label', 'model', 'learn', 'training datum', 'function', 'set', 'data', 'training', 'datum']


THE TOP 10 NGRAMS FOR TOPIC #2
['determine', 'category', 'score', 'computer program', 'level', 'product', 'entity', 'content', 'program', 'computer']


THE TOP 10 NGRAMS FOR TOPIC #3
['route', 'individual', 'flow', 'base', 'request', 'estimate', 'service', 'characteristic', 'parameter', 'model']


THE TOP 10 NGRAMS FOR TOPIC #4
['learn', 'identify', 'sensor', 'memory', 'node', 'configure', 'sequence', 'communication', 'processor', 'video']


THE TOP 10 NGRAMS FOR TOPIC #5
['receive', 'mobile', 'determine', 'compute device', 'computing', 'datum', 'compute', 'location', 'item', 'device']


THE TOP 10 NGRAMS FOR TOPIC #6
['field', 'learn', 'metric', 'score', 'quality', 'result', 'generate', 'candidate', 'use

## Document-Topic Matrix

In [25]:
# Create Document - Topic Matrix
lda_output = lda_model.transform(dtm)

# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(df))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [26]:
df_document_topic.shape

(3147, 21)

In [27]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,10,328
1,18,299
2,17,276
3,1,256
4,0,197
5,11,194
6,13,188
7,16,188
8,19,185
9,12,183


## Interactive Visualization: Multidimensional-Scaled Topic Mapping

In [32]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, dtm, cv, mds='tsne')
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [93]:
# Show top n keywords for each topic
def show_topics(vectorizer=cv, lda_model=lda_model, n_words=10):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=cv, lda_model=lda_model, n_words=10)        

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,officer,executive,president,chief,company,executive officer,vice,vice president,effective,president chief
Topic 1,press release,press,release,exhibit,attach,attach exhibit,issue,hereto,financial,incorporated
Topic 2,form,report,report form,file,current report,current,company,information,statement,exhibit
Topic 3,meeting,share,stockholder,vote,company,shareholder,stock,proxy,common stock,hold
Topic 4,form,information current,pursuant,item,furnish,pursuant item,information,report,regulation,current
Topic 5,net,release,sale,este,relate,share,este company,include,earning,estimate
Topic 6,result,release,press,press release,conference,company,presentation,copy,exhibit,item
Topic 7,note,merger,company,agreement,principal,senior,aggregate,aggregate principal,date,corporation
Topic 8,subsidiary,report,agreement,file,amendment,plan,previously,collectively,date,amend
Topic 9,agreement,partnership,enter,plan,company,energy,transaction,partner,term,purchase


In [95]:
df_topics = df_document_topic.reset_index(drop=True)
df_topics.head()

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,dominant_topic
0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.64,0.0,13
1,0.0,0.0,0.0,0.08,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.68,14
2,0.0,0.0,0.0,0.98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
3,0.55,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.29,0.0,0.0,0.0,0.0,0.0,0
4,0.01,0.01,0.01,0.01,0.01,0.08,0.01,0.01,0.01,0.77,0.08,0.01,0.01,0.01,0.01,9


## Merging Document Topic Weights with Date, Item, & Original Text

In [96]:
df_combined = pd.merge(df_A, df_topics, left_index=True, right_index=True)

In [97]:
df_combined.head()

Unnamed: 0,date,file,item,body,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,dominant_topic
0,2010-05-25,909518,8,"On May\n25, 2010, The Estée Lauder Companies I...",0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.64,0.0,13
1,2010-11-09,909518,5,"(e) On November 9, 2010, the stockho...",0.0,0.0,0.0,0.08,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.68,14
2,2010-11-09,909518,5,We held our Annual Meeting of Stockholders on ...,0.0,0.0,0.0,0.98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
3,2010-11-09,909518,5,"On February 9, 2011, The Estée Lauder Companie...",0.55,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.29,0.0,0.0,0.0,0.0,0.0,0
4,2011-04-05,909518,1,"On April6, 2011, Estee Lauder Inc. (“ELI”), a ...",0.01,0.01,0.01,0.01,0.01,0.08,0.01,0.01,0.01,0.77,0.08,0.01,0.01,0.01,0.01,9


## Manually Inspecting Topic Features & Attributing Labels

In [98]:
topic_names_list = list(df_combined.columns[4:-1])

In [155]:
topic_numbers = {0:'Earnings Press Release', 1:'Annual Fin-Results', 2:'Quarterly Fin-Results', 3:'Annual Shareholder Meeting', 4:'Debt Event', 5:'Financial Reporting Event',
                 6:'Board Member Appointment', 7:'Quarterly Fin-Results', 8:'Change in Operations', 9:'Sale of Assets', 10:'Senior Personnel Change', 
                 11:'Information Disclosure', 12:'Joint Venture and Merger Events', 13:'Conference and Presentation', 14:'Credit Agreement',
                 15:'Press Release Other', 16:'Dividend Event', 17:'Lease and Service Agreement', 18:'Note Issuance', 19:'Senior Executive Compensation'}

In [148]:
df_combined.rename(columns=topic_names, inplace=True)

In [154]:
df_combined.dominant_topic.astype(int)

0        12
1        15
2        15
3         1
4        15
         ..
29223    14
29224     3
29225    10
29226     7
29227    14
Name: dominant_topic, Length: 29228, dtype: int32

In [156]:
df_combined['dominant_topic'] = [topic_numbers[number] for number in df_combined.dominant_topic]

In [159]:
df_combined.sort_values('date')

Unnamed: 0,date,file,item,body,Stock Issuance,Annaul Fin-Results,Operations Event,Annual Shareholder Meeting,Debt Event,Financial Reporting Event,...,Information Disclosure,Joint Venture and Merger Events,Conference and Presentation,Credit Agreement,Press Release Other,Dividend Event,Lease and Service Agreement,Note Issuance,Senior Executive Compensation,dominant_topic
19176,2002-04-03,0000891020,9,In accordance with General Instruction B.2 of ...,0.00,0.00,0.00,0.00,0.0,0.10,...,0.87,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,Information Disclosure
13160,2002-04-27,0001483096,8,"On April 27, 2012, Chesapeake Midstream Partne...",0.00,0.00,0.00,0.00,0.0,0.00,...,0.00,0.15,0.0,0.00,0.48,0.33,0.00,0.0,0.00,Press Release Other
28433,2002-05-14,0000950135,5,"On May 14, 2002, Brooks Automation, Inc. (“Bro...",0.00,0.00,0.00,0.00,0.0,0.00,...,0.00,0.00,0.0,0.15,0.00,0.26,0.00,0.0,0.00,Change in Operations
7977,2002-06-15,0000950133,5,This Form 8-K 12g-3/A is being filed to correc...,0.00,0.00,0.00,0.09,0.0,0.21,...,0.38,0.16,0.0,0.00,0.00,0.00,0.00,0.0,0.04,Information Disclosure
7978,2002-06-15,0000950133,5,"Effective June 15, 2002, pursuant to an Agreem...",0.36,0.00,0.00,0.08,0.0,0.00,...,0.12,0.20,0.0,0.00,0.00,0.00,0.00,0.0,0.04,Stock Issuance
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12138,2020-03-12,0001558370,7,"On March 12, 2020, APLP Holdings Limited Part...",0.00,0.00,0.31,0.00,0.0,0.00,...,0.42,0.10,0.0,0.15,0.00,0.00,0.00,0.0,0.00,Information Disclosure
9867,2020-03-12,0001305323,5,"On March 12, 2020, Zovio Inc (the “Company”) t...",0.00,0.00,0.00,0.00,0.0,0.30,...,0.00,0.00,0.0,0.00,0.00,0.00,0.23,0.0,0.00,Senior Personnel Change
24440,2020-03-12,0000884144,2,"On March 12, 2020, Asure Software, Inc. (the “...",0.00,0.94,0.00,0.00,0.0,0.00,...,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,Annaul Fin-Results
16345,2020-03-13,0000003545,8,"On March 13, 2020, the Board of Directors of A...",0.00,0.00,0.00,0.00,0.0,0.00,...,0.00,0.00,0.0,0.00,0.00,0.93,0.00,0.0,0.00,Dividend Event


In [164]:
df_combined.to_pickle('df_combined.pkl')

## Reconfirming Topic Labels via Manual Text Inspection