### Document Similarity Via Various Methods ###

In [1]:
import linecache
from typing import List, Tuple
import spacy
import gensim
from gensim import corpora, models
from gensim.similarities.docsim import SparseMatrixSimilarity, MatrixSimilarity

#### Common Code ####

In [2]:
def querytransform(query: str, nlp) -> str:
    doc = nlp(query)
    return ' '.join([token.lemma_ for token in doc if not token.is_punct])

In [3]:
def querydoc2tfidf(query: str, 
                 dictname: gensim.corpora.dictionary.Dictionary, 
                 model: gensim.models) -> List[Tuple[int, float]]:
    return model[dictname.doc2bow(query.split())]

In [4]:
def queryresults(sims: List[Tuple[int, float]], source: str) -> None:
    print('Document', '\t', 'Score', '\t', 'Text')
    for doc, score in sims:
        print(doc, '\t\t{:.4f}'.format(score), '\t', linecache.getline(source, doc + 1)[0:500], '\n')

In [5]:
# File String Constants
TERMDICTFILE = '../models/titleabstract.dict'
TERMTFIDFMODELFILE = '../models/titleabstract.tfidf_model'
TERMTFIDFFILE = '../models/titleabstract_tfidf.mm'
TERMTFIDFSIM = '../models/tfidf_index.sim'
DOCSOURCE = '../intermediate/titleabstract.txt'

In [6]:
nlp = spacy.load('en')

In [7]:
term_dict = corpora.Dictionary.load(TERMDICTFILE)

#### Similarities from Tf-Idf Vectors ####

In [8]:
term_tfidf_model = models.TfidfModel.load(TERMTFIDFMODELFILE)

In [9]:
# Uncomment when new TF-IDF vectors are generated for corpus and similarities need to be recomputed 
#tfidf_index = SparseMatrixSimilarity(corpora.MmCorpus(TERMTFIDFFILE))
# Uncomment when new TF-IDF vectors are generated for corpus and similarities need to be saved
#tfidf_index.save('../models/tfidf_index.sim')

In [10]:
tfidf_index = SparseMatrixSimilarity.load(TERMTFIDFSIM)

In [26]:
tfidf_index.num_best = 15

In [12]:
query = 'multilayer ceramic capacitor'

In [13]:
query_transform = querytransform(query, nlp)

In [14]:
query_transform

'multilayer ceramic capacitor'

In [27]:
tfidf_sims = tfidf_index[querydoc2tfidf(query_transform, term_dict, term_tfidf_model)]

In [28]:
queryresults(tfidf_sims, DOCSOURCE)

Document 	 Score 	 Text
157897 		0.7281 	 Method of identifying direction of multilayer ceramic capacitor, apparatus identifying direction of multilayer ceramic capacitor, and method of manufacturing multilayer ceramic capacitor A method of identifying a direction of a multilayer ceramic capacitor includes the steps of transporting a plurality of multilayer ceramic capacitors in one line before each of a magnetism generator and a magnetic flux density measurement instrument, measuring a magnetic flux density with the magnetic flux densi 

235131 		0.7108 	 Multilayer ceramic capacitor having terminal electrodes and board having the same A multilayer ceramic capacitor and a board having the same are provided. The multilayer ceramic capacitor includes a ceramic body including internal electrodes and having lead-out portions exposed to end surfaces thereof, and external electrodes disposed on portions of the end surfaces of the ceramic body to be connected to the lead-out portions of the 

#### Similarities for LSI Vectors ####

In [17]:
# File String Constants
TERMLSIMODELFILE = '../models/titleabstract.lsi_model'
TERMLSIFILE = '../models/titleabstract_lsi.mm'
TERMLSISIM = '../models/lsi_index.sim'

In [18]:
term_lsi_model = models.LsiModel.load(TERMLSIMODELFILE)

In [19]:
# Uncomment when new LSI vectors are generated from corpus and similarities need to be recomputed
#term_tfidf = corpora.MmCorpus(TERMTFIDFFILE)
#lsi_index = MatrixSimilarity(term_lsi_model[term_tfidf])
# Uncomment when new LSI vectors are generated from corpus and similarities need to be saved
#lsi_index.save('../models/lsi_index.sim')

In [20]:
lsi_index = MatrixSimilarity.load(TERMLSISIM)

In [32]:
lsi_index.num_best = 15

In [33]:
lsi_sims = lsi_index[querydoc2tfidf(query_transform, term_dict, term_lsi_model)]

In [34]:
queryresults(lsi_sims, DOCSOURCE)

Document 	 Score 	 Text
190753 		0.3243 	 Removing faults from a self-healing film capacitor A method for treating a capacitor for healing faults therein includes applying a voltage between a first electrode and a second electrode of the capacitor to charge the capacitor at a start pressure; and pressurizing a non-conductive fluid surrounding the capacitor to a target pressure while the voltage is applied.
 

284322 		0.3046 	 Multilayer ceramic capacitor having a moisture resistant protective film Disclosed herein is a multilayer ceramic capacitor including a ceramic body in which internal electrodes and dielectric layers are alternately stacked; a pair of external electrodes covering both end portions of the ceramic body; and a moisture resistant protective film formed on surfaces of the dielectric layers between the pair of external electrodes and having a hydrophobic functional group.
 

92648 		0.3046 	 Multilayer ceramic capacitor having a moisture resistant protective film Discl

#### Similarities from Classification Codes ####

In [22]:
# File String Constants
CLASSFILE = '../intermediate/classifications_ipc.txt'
CLASSDICTFILE = '../models/classifications.dict'
CLASSTFIDFMODELFILE = '../models/classifications.tfidf_model'
CLASSTFIDFFILE = '../models/classifications_tfidf.mm'

In [23]:
cl_dict = corpora.Dictionary.load(CLASSDICTFILE)

In [24]:
cl_tfidf_model = models.TfidfModel.load(CLASSTFIDFMODELFILE)

In [25]:
cl_index = SparseMatrixSimilarity(corpora.MmCorpus(CLASSTFIDFFILE))

In [26]:
cl_index.num_best = 10

In [27]:
cl_query = 'F02B2704 F01N310 F01N322 F01N334'

In [28]:
cl_sims = cl_index[querydoc2tfidf(cl_query, cl_dict, cl_tfidf_model)]

In [29]:
queryresults(cl_sims, '../intermediate/titleabstract.txt')

Document 	 Score 	 Text
185 		0.9787 	 Engine exhaust systems with secondary air injection systems A variety of embodiments of exhaust systems for engines including small off-road engines, and related methods of operation, are disclosed. I 

186 		0.6446 	 Engine exhaust systems with secondary air injection systems A variety of embodiments of exhaust systems for engines including small off-road engines, and related methods of operation, are disclosed. I 

24748 		0.2620 	 Exhaust gas purification apparatus for internal combustion engine An exhaust gas purification catalyst is recovered from the sulfur poisoning more appropriately. For this purpose, an exhaust gas purif 

24749 		0.1752 	 Synergistic SCR/DOC configurations for lowering diesel emissions A motor-vehicle engine system comprises a first DOC configured to receive exhaust from an engine and an SCR device coupled downstream o 

13956 		0.1491 	 System and method for determining selective catalytic reduction dosing system perfo

In [72]:
lsi_index.num_best = 50

In [73]:
lsi_sets =[{d for d, s in s_lsi} for s_lsi in lsi_index]

In [74]:
cl_index.num_best = 50

In [75]:
cl_sets =[{d for d, s in s_cl} for s_cl in cl_index]

In [76]:
len(lsi_sets)

28808

In [77]:
sum((len(sl & sc) for sl, sc in zip(lsi_sets, cl_sets))) - len(lsi_sets)

82482

In [78]:
tfidf_index.num_best = 100

In [79]:
tfidf_sets =[{d for d, s in s_tfidf} for s_tfidf in tfidf_index]

In [80]:
sum((len(st & sc) for st, sc in zip(tfidf_sets, cl_sets))) - len(lsi_sets)

160036

In [30]:
cl_cnt_index = SparseMatrixSimilarity(corpora.MmCorpus('../models/classifications_corpus.mm'))

In [31]:
query_vec = cl_dict.doc2bow(cl_query.split())

In [32]:
query_vec

[(539, 1), (540, 1), (541, 1), (542, 1)]

In [33]:
cl_cnt_index.num_best = 10

In [34]:
cl_cnt_sims =cl_cnt_index[query_vec]

In [35]:
cl_cnt_sims

[(185, 1.0),
 (186, 0.75),
 (24749, 0.28867512941360474),
 (24759, 0.25),
 (13956, 0.25),
 (24760, 0.22360679507255554),
 (24754, 0.22360679507255554),
 (13954, 0.22360679507255554),
 (189, 0.22360679507255554),
 (8000, 0.20412415266036987)]