### Import Packages and py Files

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from tokenizer_function import spacy_tokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import pyLDAvis
import pyLDAvis.sklearn

import pandas as pd
import numpy as np
import requests
import json

from pandas.io.json import json_normalize
import pickle
from collections import ChainMap

  from collections import Iterable
  from collections import Mapping


### Get Patent Dataframe

In [2]:
df = pd.read_pickle("data/df.pkl")
df.head()

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind,patent_title_and_abstract
0,10603498,2020-03-31,Systems and methods for closed-loop determinat...,A method or system for facilitating the determ...,org_5cFCcVidnLqkMwKWc9s4,2020,utility,B2,Systems and methods for closed-loop determinat...
1,10603793,2020-03-31,Work assisting system including machine learni...,A work assisting system includes a sensor unit...,org_RQZono9Ir8KVdgNAaglV,2020,utility,B2,Work assisting system including machine learni...
2,10603797,2020-03-31,"Machine learning device, robot system, and mac...",A machine learning device for learning a motio...,org_RQZono9Ir8KVdgNAaglV,2020,utility,B2,"Machine learning device, robot system, and mac..."
3,10605228,2020-03-31,Method for controlling operation of a wind tur...,A method for controlling operation of a wind t...,org_VIvs7w0sts1aCjlrKaiG,2020,utility,B2,Method for controlling operation of a wind tur...
4,10605702,2020-03-31,Fluid analysis and monitoring using optical sp...,"Systems, methods, and computer-program product...",org_aHdfa1XsbUURjnXmlGyp,2020,utility,B2,Fluid analysis and monitoring using optical sp...


### Additional Stopwords To Improve Topic Modelling Accuracy

In [3]:
# Add terms that are too generic for this topic modelling task, like "machine, learning, model" etc
additional_stopwords = ['training', 'problem', 'use', 'model', 'machine', 'learning', 'function', 'set', 'data', 'learn',
                        'determine', 'program', 'computer', 'parameter', 'memory', 'node', 'configure', 'sequence',
                        'computing', 'datum', 'compute', 'result', 'generate', 'field', 'file', 'code', 'method', 'test',
                        'interface', 'pattern', 'user', 'employ', 'operation', 'technique', 'analysis', 'neural', 'network',
                        'invention', 'feature', 'object']

### Tokenize, Vectorize Text

In [26]:
cv = CountVectorizer(tokenizer=spacy_tokenizer, max_df=0.3, ngram_range = (1,2), stop_words=additional_stopwords)

In [27]:
dtm = cv.fit_transform(df['patent_title_and_abstract'])

### LDA GridSearch to Identify Best Model Params

In [28]:
# Define Search Param
search_params = {'n_components': [20, 25, 30, 35, 40, 45]}

# Init the Model
lda = LatentDirichletAllocation(max_iter=50, batch_size=500, learning_method='online')

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params, cv=3, verbose=2, n_jobs = -1)

# Do the Grid Search
model.fit(dtm)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed: 18.9min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=LatentDirichletAllocation(batch_size=500,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='online',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=50,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                                 total_

In [29]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(dtm))

Best Model's Params:  {'n_components': 20}
Best Log Likelihood Score:  -2037467.0672747653
Model Perplexity:  20773.241424839314


### Fit and Transform Vectorized Ngrams with Cross Validated LDA Model

In [30]:
# Build LDA Model with GridSearch params
lda_model = LatentDirichletAllocation(n_components=20,            # Number of topics
                                      learning_decay=0.7,         
                                      max_iter=50,                # Max learning iterations
                                      learning_method='online',   
                                      random_state=42,            # Random state
                                      batch_size=500,             # n docs in each learning iter
                                      evaluate_every = -1,        # compute perplexity every n iters, default: Don't
                                      n_jobs = -1)                # Use all available CPU

print(lda_model)  # Model attributes

LatentDirichletAllocation(batch_size=500, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=50,
                          mean_change_tol=0.001, n_components=20, n_jobs=-1,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)


In [31]:
lda_output = lda_model.fit_transform(dtm)

In [32]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(dtm))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(dtm))

# See model parameters
print(lda_model.get_params())

Log Likelihood:  -3472310.1731475024
Perplexity:  19753.77454451303
{'batch_size': 500, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.7, 'learning_method': 'online', 'learning_offset': 10.0, 'max_doc_update_iter': 100, 'max_iter': 50, 'mean_change_tol': 0.001, 'n_components': 20, 'n_jobs': -1, 'perp_tol': 0.1, 'random_state': 42, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


## Explore Topic Modeling Outputs

In [33]:
lda_model.components_

array([[0.05000196, 0.05000183, 0.05000181, ..., 0.05000165, 0.05000203,
        0.05000137],
       [1.87704519, 1.8770441 , 0.05000157, ..., 0.0500019 , 0.0500018 ,
        0.05000156],
       [0.05000215, 0.0500017 , 0.05000186, ..., 0.05000228, 0.05000158,
        0.0500018 ],
       ...,
       [0.05000182, 0.05000184, 0.05000178, ..., 0.05000218, 0.05000196,
        0.0500018 ],
       [0.05000211, 0.05000158, 0.05000199, ..., 0.05000197, 0.05000218,
        0.05000169],
       [0.05000192, 0.05000176, 0.05000183, ..., 0.05000219, 0.05000192,
        0.05000177]])

In [34]:
# Number of words/tokens in first topic
len(lda_model.components_[0])

95890

In [35]:
single_topic = lda_model.components_[0]

In [36]:
# Returns the indices that would sort this array.
single_topic.argsort()

array([18204, 42242, 33431, ..., 35018, 74919, 24950], dtype=int64)

In [37]:
# Word least representative of this topic
single_topic[352]

0.050001941242341454

In [38]:
# Word most representative of this topic
single_topic[249]

0.050002223765839286

In [39]:
# Top 10 words for this topic:
single_topic.argsort()[-10:]

array([23975, 58669, 79420, 89043, 36857,  7574, 89008, 35018, 74919,
       24950], dtype=int64)

In [40]:
top_word_indices = single_topic.argsort()[-10:]

In [41]:
for index in top_word_indices:
    print(cv.get_feature_names()[index])

detect
packet
signal
traffic flow
gesture
audio
traffic
flow
robot
device


In [42]:
for index,topic in enumerate(lda_model.components_):
    print(f'THE TOP 10 NGRAMS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

THE TOP 10 NGRAMS FOR TOPIC #0
['detect', 'packet', 'signal', 'traffic flow', 'gesture', 'audio', 'traffic', 'flow', 'robot', 'device']


THE TOP 10 NGRAMS FOR TOPIC #1
['image sensor', 'job search', 'xray', 'networking system', 'claim', 'networking', 'social networking', 'social', 'reputation', 'job']


THE TOP 10 NGRAMS FOR TOPIC #2
['radio', 'master', 'alternative', 'access', 'domain', 'knowledge', 'stroke', 'process', 'trajectory', 'task']


THE TOP 10 NGRAMS FOR TOPIC #3
['characteristic', 'representation', 'profile', 'quantity', 'reward', 'entry', 'value', 'product', 'video', 'block']


THE TOP 10 NGRAMS FOR TOPIC #4
['pixel intensity', 'intensity value', 'intensity', 'fraud', 'profile', 'pool', 'fraud detection', 'component', 'configuration', 'advertisement']


THE TOP 10 NGRAMS FOR TOPIC #5
['target', 'domain', 'state', 'summary', 'select', 'record', 'vehicle', 'content', 'word', 'vector']


THE TOP 10 NGRAMS FOR TOPIC #6
['storage', 'individual', 'flow characteristic', 'digita

## Document-Topic Matrix

In [43]:
# Create Document - Topic Matrix
lda_output = lda_model.transform(dtm)

# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(df))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [44]:
df_document_topic.shape

(3147, 21)

In [45]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,11,1341
1,5,225
2,6,188
3,8,154
4,17,154
5,12,134
6,9,121
7,0,109
8,14,102
9,15,97


## Interactive Visualization: Multidimensional-Scaled Topic Mapping

In [46]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, dtm, cv, mds='tsne')
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [47]:
# Show top n keywords for each topic
def show_topics(vectorizer=cv, lda_model=lda_model, n_words=10):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=cv, lda_model=lda_model, n_words=10)        

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,device,robot,flow,traffic,audio,gesture,traffic flow,signal,packet,detect
Topic 1,job,reputation,social,social networking,networking,claim,networking system,xray,job search,image sensor
Topic 2,task,trajectory,process,stroke,knowledge,domain,access,alternative,master,radio
Topic 3,block,video,product,value,entry,reward,quantity,profile,representation,characteristic
Topic 4,advertisement,configuration,component,fraud detection,pool,profile,fraud,intensity,intensity value,pixel intensity
Topic 5,vector,word,content,vehicle,record,select,summary,state,domain,target
Topic 6,image,flow,service,plurality,characteristic,report,digital,flow characteristic,individual,storage
Topic 7,sentence,label,incident,dependence,select message,change,message transmit,security incident,rewrite,lowlevel
Topic 8,value,plurality,energy,cluster,service,driver,hyperparameter,workload,time,event
Topic 9,document,classification,application,hierarchy,classify,label,detect,segment,provider,time


In [48]:
df_topics = df_document_topic.reset_index(drop=True)
df_topics.head()

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,...,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,dominant_topic
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.99,0.0,0.0,17
2,0.61,0.0,0.04,0.08,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.26,0.0,0.0,0
3,0.0,0.0,0.0,0.7,0.0,0.0,0.0,0.0,0.0,0.0,...,0.1,0.0,0.19,0.0,0.0,0.0,0.0,0.0,0.0,3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.68,19


## Merging Document Topic Weights with Date, Item, & Original Text

In [49]:
df_combined = pd.merge(df, df_topics, left_index=True, right_index=True)

In [50]:
df_combined.head()

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind,patent_title_and_abstract,Topic0,...,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,dominant_topic
0,10603498,2020-03-31,Systems and methods for closed-loop determinat...,A method or system for facilitating the determ...,org_5cFCcVidnLqkMwKWc9s4,2020,utility,B2,Systems and methods for closed-loop determinat...,0.0,...,0.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11
1,10603793,2020-03-31,Work assisting system including machine learni...,A work assisting system includes a sensor unit...,org_RQZono9Ir8KVdgNAaglV,2020,utility,B2,Work assisting system including machine learni...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.99,0.0,0.0,17
2,10603797,2020-03-31,"Machine learning device, robot system, and mac...",A machine learning device for learning a motio...,org_RQZono9Ir8KVdgNAaglV,2020,utility,B2,"Machine learning device, robot system, and mac...",0.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.26,0.0,0.0,0
3,10605228,2020-03-31,Method for controlling operation of a wind tur...,A method for controlling operation of a wind t...,org_VIvs7w0sts1aCjlrKaiG,2020,utility,B2,Method for controlling operation of a wind tur...,0.0,...,0.1,0.0,0.19,0.0,0.0,0.0,0.0,0.0,0.0,3
4,10605702,2020-03-31,Fluid analysis and monitoring using optical sp...,"Systems, methods, and computer-program product...",org_aHdfa1XsbUURjnXmlGyp,2020,utility,B2,Fluid analysis and monitoring using optical sp...,0.0,...,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.68,19


## Manually Inspecting Topic Features & Attributing Labels

In [98]:
topic_names_list = list(df_combined.columns[4:-1])

In [155]:
topic_numbers = {}

In [148]:
df_combined.rename(columns=topic_names, inplace=True)

In [154]:
df_combined.dominant_topic.astype(int)

0        12
1        15
2        15
3         1
4        15
         ..
29223    14
29224     3
29225    10
29226     7
29227    14
Name: dominant_topic, Length: 29228, dtype: int32

In [156]:
df_combined['dominant_topic'] = [topic_numbers[number] for number in df_combined.dominant_topic]

In [159]:
df_combined.sort_values('date')

Unnamed: 0,date,file,item,body,Stock Issuance,Annaul Fin-Results,Operations Event,Annual Shareholder Meeting,Debt Event,Financial Reporting Event,...,Information Disclosure,Joint Venture and Merger Events,Conference and Presentation,Credit Agreement,Press Release Other,Dividend Event,Lease and Service Agreement,Note Issuance,Senior Executive Compensation,dominant_topic
19176,2002-04-03,0000891020,9,In accordance with General Instruction B.2 of ...,0.00,0.00,0.00,0.00,0.0,0.10,...,0.87,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,Information Disclosure
13160,2002-04-27,0001483096,8,"On April 27, 2012, Chesapeake Midstream Partne...",0.00,0.00,0.00,0.00,0.0,0.00,...,0.00,0.15,0.0,0.00,0.48,0.33,0.00,0.0,0.00,Press Release Other
28433,2002-05-14,0000950135,5,"On May 14, 2002, Brooks Automation, Inc. (“Bro...",0.00,0.00,0.00,0.00,0.0,0.00,...,0.00,0.00,0.0,0.15,0.00,0.26,0.00,0.0,0.00,Change in Operations
7977,2002-06-15,0000950133,5,This Form 8-K 12g-3/A is being filed to correc...,0.00,0.00,0.00,0.09,0.0,0.21,...,0.38,0.16,0.0,0.00,0.00,0.00,0.00,0.0,0.04,Information Disclosure
7978,2002-06-15,0000950133,5,"Effective June 15, 2002, pursuant to an Agreem...",0.36,0.00,0.00,0.08,0.0,0.00,...,0.12,0.20,0.0,0.00,0.00,0.00,0.00,0.0,0.04,Stock Issuance
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12138,2020-03-12,0001558370,7,"On March 12, 2020, APLP Holdings Limited Part...",0.00,0.00,0.31,0.00,0.0,0.00,...,0.42,0.10,0.0,0.15,0.00,0.00,0.00,0.0,0.00,Information Disclosure
9867,2020-03-12,0001305323,5,"On March 12, 2020, Zovio Inc (the “Company”) t...",0.00,0.00,0.00,0.00,0.0,0.30,...,0.00,0.00,0.0,0.00,0.00,0.00,0.23,0.0,0.00,Senior Personnel Change
24440,2020-03-12,0000884144,2,"On March 12, 2020, Asure Software, Inc. (the “...",0.00,0.94,0.00,0.00,0.0,0.00,...,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,Annaul Fin-Results
16345,2020-03-13,0000003545,8,"On March 13, 2020, the Board of Directors of A...",0.00,0.00,0.00,0.00,0.0,0.00,...,0.00,0.00,0.0,0.00,0.00,0.93,0.00,0.0,0.00,Dividend Event


In [164]:
df_combined.to_pickle('df_combined.pkl')

## Reconfirming Topic Labels via Manual Text Inspection