### Import Packages and py Files

In [5]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from tokenizer_function import spacy_tokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

### Get Patent Dataframe

In [2]:
df = pd.read_pickle("data/df.pkl")
df.head()

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind,patent_title_and_abstract
0,10603498,2020-03-31,Systems and methods for closed-loop determinat...,A method or system for facilitating the determ...,org_5cFCcVidnLqkMwKWc9s4,2020,utility,B2,Systems and methods for closed-loop determinat...
1,10603793,2020-03-31,Work assisting system including machine learni...,A work assisting system includes a sensor unit...,org_RQZono9Ir8KVdgNAaglV,2020,utility,B2,Work assisting system including machine learni...
2,10603797,2020-03-31,"Machine learning device, robot system, and mac...",A machine learning device for learning a motio...,org_RQZono9Ir8KVdgNAaglV,2020,utility,B2,"Machine learning device, robot system, and mac..."
3,10605228,2020-03-31,Method for controlling operation of a wind tur...,A method for controlling operation of a wind t...,org_VIvs7w0sts1aCjlrKaiG,2020,utility,B2,Method for controlling operation of a wind tur...
4,10605702,2020-03-31,Fluid analysis and monitoring using optical sp...,"Systems, methods, and computer-program product...",org_aHdfa1XsbUURjnXmlGyp,2020,utility,B2,Fluid analysis and monitoring using optical sp...


### Additional Stopwords To Improve Topic Modelling Accuracy

In [3]:
# Add terms that are too generic for this topic modelling task, like "machine, learning, model" etc
additional_stopwords = ['training', 'problem', 'use', 'model', 'machine', 'learning', 'function', 'set', 'data', 'learn',
                        'determine', 'program', 'computer', 'parameter', 'memory', 'node', 'configure', 'sequence',
                        'computing', 'datum', 'compute', 'result', 'generate', 'field', 'file', 'code', 'method', 'test',
                        'interface', 'pattern', 'user', 'employ', 'operation', 'technique', 'analysis', 'neural', 'network',
                        'invention', 'feature', 'object']

### Tokenize, Vectorize Text

In [26]:
cv = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range = (1,2), stop_words=additional_stopwords)
dtm = cv.fit_transform(df['patent_title_and_abstract'])
dtm.to_pickle("data/dtm.pkl")

In [27]:
tfidf_cv = TfidfVectorizer(tokenizer=spacy_tokenizer, ngram_range = (1,2), stop_words=additional_stopwords)
tfidf_dtm = tfidf_cv.fit_transform(df['patent_title_and_abstract'])
tfidf_dtm.to_pickle("data/tfidf_dtm.pkl")

In [None]:
dtm = pd.read_pickle("data/dtm.pkl")
tfidf_dtm = pd.read_pickle("data/tfidf_dtm.pkl")

In [None]:
dense_dtm = dtm.todense()
dense_tfidf_dtm = tfidf_dtm.todense()

### DBSCAN GridSearch to Identify Best Model Params

In [28]:
# Define Search Param
search_params = {'eps': [0.2, 0.4, 0.6, 0.8], 'min_samples': [5, 15, 25, 35, 45]
                 'metric': ['euclidean', 'precomputed']}

# Init the Model
dbs = DBSCAN(n_jobs = -1)

# Init Grid Search Class
model = GridSearchCV(dbs, param_grid=search_params, verbose=2, n_jobs = -1)

# Do the Grid Search
model.fit(dtm)
#model.fit(tfidf_dtm)


Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed: 18.9min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=LatentDirichletAllocation(batch_size=500,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='online',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=50,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                                 total_

In [29]:
# Best Model
best_dbs_model = model.best_estimator_

core_samples_mask = np.zeros_like(dbs.labels_, dtype=bool)

core_samples_mask[dbs.core_sample_indices_] = True

labels = dbs.labels_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Performance Score
print("Best Score: ", model.best_score_)

Best Model's Params:  {'n_components': 20}
Best Log Likelihood Score:  -2037467.0672747653
Model Perplexity:  20773.241424839314


In [None]:
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

# Plot result
import matplotlib.pyplot as plt
%matplotlib inline

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()