In [68]:
import os
import pandas as pd

import transformers
import torch
import numpy as np
from simpletransformers.language_representation import RepresentationModel
from tqdm import tqdm

import umap.umap_ as umap
import matplotlib.pyplot as plt
import sklearn.cluster as cluster

In [6]:
os.chdir('..')

In [11]:
from searcher.es_search import SearchResults_ES

In [69]:
#https://simpletransformers.ai/docs/lm-model/

#https://www.sbert.net/examples/applications/computing-embeddings/README.html

In [70]:
class EmbGenerator:
    def __init__(self, model_name, qry_obj, max_length=512, proj_method='umap', cluster_method='kmeans'):
        self.model = RepresentationModel(model_type="bert", model_name=model_name, use_cuda=False)
        self.max_length = max_length
        self.es = SearchResults_ES(qry_obj['database'], qry_obj, cleaned=False, rand=False)
        self.proj_method = proj_method
        self.cluster_method = cluster_method
        self.doc_embs = {'x' : [],
                         'y' : [],
                         'emb' : [],
                         'title' : [],
                         'dates' : [],
                         'id' : [],
                        'clusters' : []}
    
    
    def create_doc_obj(self, emb, doc_info):
        self.doc_embs['emb'].append(emb)
        self.doc_embs['title'].append(doc_info.article_title)
        self.doc_embs['id'].append(doc_info.doc_id)
        self.doc_embs['dates'].append(doc_info.date)
        return
    
    def split_sentences(self, doc):
        split_doc = doc.split()
        chunked_list = list()
        chunk_size = self.max_length
        for i in range(0, len(split_doc), chunk_size):
            chunked_list.append(" ".join(split_doc[i:i+chunk_size]))
        return(chunked_list)
    
    def generate_embs(self):
        for document in tqdm(self.es):
            dsplits = self.split_sentences(document.text)
            sentenceVectors = self.model.encode_sentences(dsplits, combine_strategy="mean")
            word_embedding_avg = np.mean(sentenceVectors, axis=0)
            self.create_doc_obj(word_embedding_avg, document)
            
    def reduce_dims(self, plot=False):
        if self.proj_method == 'umap':
            import umap.umap_ as umap
            fit = umap.UMAP(
                n_neighbors=15,
                min_dist=.1,
                n_components=2,
                metric='cosine')
            
            u = fit.fit_transform(self.doc_embs['emb'])
            self.doc_embs['x'] = u[:,0]
            self.doc_embs['y'] = u[:,1]
        
    def cluster_embs(self, num_clusters=10, reduced=False):
        if reduced:
            X = self.doc_embs['x']
        else:
            X = self.doc_embs['emb']
        
        if self.cluster_method == 'kmeans':
            self.doc_embs['clusters'] = cluster.KMeans(n_clusters=num_clusters, random_state=0).fit_predict(X)
        if type(self.cluster_method) == list:
            self.doc_embs['clusters'] == self.
                
            
        
        
        
  
test_qry_obj = {'qry': '', 'maximum_hits': '100', 'database': 'Pubmed'}
model_name = 'bert-base-uncased'
eg = EmbGenerator(model_name, test_qry_obj, max_length=100)

SyntaxError: invalid syntax (325477005.py, line 7)

In [66]:
eg.generate_embs()
eg.reduce_dims()
eg.cluster_embs

0it [00:00, ?it/s]

100
{'qry': '', 'maximum_hits': '100', 'database': 'Pubmed', 'f_start': -1, 'f_end': -1, 'min_occurrence': -1, 'max_occurrence': -1}
search query {'size': 1000, 'query': {'match_all': {}}}


88it [00:10,  8.16it/s]


In [67]:
eg.doc_embs

{'x': array([ 7.5327992,  8.204346 ,  7.991233 ,  7.810244 ,  9.252452 ,
         7.9352374,  7.91305  ,  7.2986875,  7.3508034, 10.98354  ,
        13.5979   , 12.232503 , 13.248885 , 12.201458 , 13.087315 ,
        12.771973 , 12.782145 , 11.612753 , 11.493089 , 12.800542 ,
        12.602805 ,  8.862906 , 11.661693 , 12.457413 , 11.595808 ,
        11.966427 ,  7.8126717,  7.661964 , 10.081383 ,  7.95568  ,
        10.233198 ,  9.902217 ,  9.25526  ,  8.196714 ,  9.159072 ,
         8.695905 ,  9.500008 , 13.485262 , 13.172325 , 13.518105 ,
        12.185249 , 10.193492 , 10.984757 , 10.605923 , 10.879045 ,
         7.3485203,  7.157179 ,  7.2073555,  7.327973 , 10.53998  ,
        11.901596 , 10.382877 ,  9.645207 , 13.015179 ,  8.631719 ,
         9.706102 ,  9.945259 ,  8.72353  ,  7.783472 ,  8.268013 ,
         8.79289  ,  9.029556 ,  9.297526 , 10.465733 ,  9.474714 ,
         9.281529 ,  8.236241 ,  9.795969 ,  8.207336 ,  8.272517 ,
        13.423176 , 12.782407 , 12.076548 ,

In [54]:
for d in eg.es:
    print(d)

100
{'qry': '', 'maximum_hits': '100', 'database': 'Pubmed', 'f_start': -1, 'f_end': -1, 'min_occurrence': -1, 'max_occurrence': -1}
search query {'size': 1000, 'query': {'match_all': {}}}
esDoc(doc_id='8898073', journal_title='journal', article_title='Molecular modelling of human gastric alcohol dehydrogenase (class IV) and substrate docking: differences towards the classical liver enzyme (class I).', authors=[], date='1996-10-21', text='A three-dimensional model of the human class IV alcohol dehydrogenase has been calculated based upon the X-ray structure of the class I enzyme. As judged from the model, the substrate-binding site is wider than in class I, compatible with the differences in substrate specificities and the large difference in Km value for ethanol. Substrate docking performed for the class I structure and the class IV model show all-trans-retinol and 11-cis-retinol to bind better to the class IV enzyme. The calculations also indicate that 16-hydroxyhexadecanoic acid bin