In [1]:
import linecache
from typing import List, Tuple
import spacy
import gensim
from gensim import corpora, models
from gensim.similarities.docsim import SparseMatrixSimilarity

In [2]:
def querytransform(query: str, nlp) -> str:
    doc = nlp(query)
    return ' '.join([token.lemma_ for token in doc if not token.is_punct])

In [3]:
def querydoc2vec(query: str, 
                 dictname: gensim.corpora.dictionary.Dictionary, 
                 model: gensim.models.tfidfmodel.TfidfModel) -> List[Tuple[int, float]]:
    return model[dictname.doc2bow(query.split())]

In [4]:
def queryresults(sims: List[Tuple[int, float]], source: str) -> None:
    print('Document', '\t', 'Score', '\t', 'Text')
    for doc, score in sims:
        print(doc, '\t\t{:.4f}'.format(score), '\t', linecache.getline(source, doc + 1)[0:200], '\n')

In [5]:
# File String Constants
TERMDICTFILE = '../models/titleabstract.dict'
TERMTFIDFMODELFILE = '../models/titleabstract.tfidf_model'
TERMTFIDFFILE = '../models/titleabstract_tfidf.mm'

In [6]:
nlp = spacy.load('en')

In [7]:
term_dict = corpora.Dictionary.load(TERMDICTFILE)

In [8]:
term_tfidf_model = models.TfidfModel.load(TERMTFIDFMODELFILE)

In [9]:
term_tfidf = corpora.MmCorpus(TERMTFIDFFILE)

In [10]:
index = SparseMatrixSimilarity(term_tfidf)

In [11]:
index.num_best = 10

In [12]:
query = 'Engine exhaust systems with secondary air injection systems A variety of embodiments of exhaust systems for engines including small off-road engines, and related methods of operation, are disclosed. In at least some embodiments, the exhaust system includes a first conduit that receives exhaust emissions from a first engine cylinder, and a second conduit that communicates air to a first port on the first conduit. The air mixes with the exhaust emissions within the first conduit so as to produce a chemical reaction, and a level of at least one undesirable component of the exhaust emissions is reduced. Further, the exhaust system does not include any catalytic converter. In some embodiments, the exhaust system further comprises a crankcase ventilation system.'

In [13]:
query_transform = querytransform(query, nlp)

In [14]:
query_transform

'engine exhaust system with secondary air injection system a variety of embodiment of exhaust system for engine include small off road engine and relate method of operation be disclose in at least some embodiment the exhaust system include a first conduit that receive exhaust emission from a first engine cylinder and a second conduit that communicate air to a first port on the first conduit the air mix with the exhaust emission within the first conduit so as to produce a chemical reaction and a level of at least one undesirable component of the exhaust emission be reduce further the exhaust system do not include any catalytic converter in some embodiment the exhaust system further comprise a crankcase ventilation system'

In [15]:
query_tfidf = querydoc2vec(query, term_dict, term_tfidf_model)

In [16]:
sims = index[query_tfidf]

In [17]:
queryresults(sims, '../intermediate/titleabstract.txt')

Document 	 Score 	 Text
185 		0.8124 	 Engine exhaust systems with secondary air injection systems A variety of embodiments of exhaust systems for engines including small off-road engines, and related methods of operation, are disclosed. I 

186 		0.8124 	 Engine exhaust systems with secondary air injection systems A variety of embodiments of exhaust systems for engines including small off-road engines, and related methods of operation, are disclosed. I 

8049 		0.5084 	 Supercharger-equipped internal combustion engine In a multicylinder internal combustion engine including a turbocharger, the turbocharger employs a twin-entry turbo where a turbine includes two exhaus 

7245 		0.5044 	 Outboard motor An outboard motor includes an engine, an exhaust passage member, and an idle exhaust passage member. The exhaust passage member defines a main exhaust passage through which exhaust gase 

24751 		0.5027 	 Dosing and mixing arrangement for use in exhaust aftertreatment A dosing and mixing a

#### Similarities from Classification Codes ####

In [18]:
# File String Constants
CLASSFILE = '../intermediate/classifications_ipc.txt'
CLASSDICTFILE = '../models/classifications.dict'
CLASSTFIDFMODELFILE = '../models/classifications.tfidf_model'
CLASSTFIDFFILE = '../models/classifications_tfidf.mm'

In [19]:
cl_dict = corpora.Dictionary.load(CLASSDICTFILE)

In [20]:
cl_tfidf_model = models.TfidfModel.load(CLASSTFIDFMODELFILE)

In [21]:
cl_tfidf = corpora.MmCorpus(CLASSTFIDFFILE)

In [22]:
cl_index = SparseMatrixSimilarity(cl_tfidf)

In [23]:
cl_index.num_best = 10

In [24]:
query = 'F02B2704 F01N310 F01N322 F01N334'

In [25]:
cl_query_tfidf = querydoc2vec(query, cl_dict, cl_tfidf_model)

In [26]:
cl_sims = cl_index[cl_query_tfidf]

In [27]:
cl_sims

[(185, 0.97865521907806396),
 (186, 0.64456218481063843),
 (24748, 0.26202374696731567),
 (24749, 0.17515073716640472),
 (13956, 0.14913399517536163),
 (24754, 0.12316439300775528),
 (24759, 0.12106891721487045),
 (191, 0.12043361365795135),
 (24760, 0.12016613036394119),
 (13954, 0.11993207782506943)]

In [28]:
cl_query_tfidf

[(539, 0.37444710185222174),
 (540, 0.5070368954931458),
 (541, 0.5294567799737209),
 (542, 0.5677838256440592)]

In [29]:
queryresults(cl_sims, '../intermediate/titleabstract.txt')

Document 	 Score 	 Text
185 		0.9787 	 Engine exhaust systems with secondary air injection systems A variety of embodiments of exhaust systems for engines including small off-road engines, and related methods of operation, are disclosed. I 

186 		0.6446 	 Engine exhaust systems with secondary air injection systems A variety of embodiments of exhaust systems for engines including small off-road engines, and related methods of operation, are disclosed. I 

24748 		0.2620 	 Exhaust gas purification apparatus for internal combustion engine An exhaust gas purification catalyst is recovered from the sulfur poisoning more appropriately. For this purpose, an exhaust gas purif 

24749 		0.1752 	 Synergistic SCR/DOC configurations for lowering diesel emissions A motor-vehicle engine system comprises a first DOC configured to receive exhaust from an engine and an SCR device coupled downstream o 

13956 		0.1491 	 System and method for determining selective catalytic reduction dosing system perfo

In [30]:
cl_tfidf[51]

[(133, 0.46349062363164567),
 (134, 0.3799624370358977),
 (135, 0.4131812416020006),
 (136, 0.2758819544602675),
 (137, 0.6276745948648087)]