In [1]:
import torch
from torch import nn
import time
import torchtext
import numpy as np

import random

from collections import defaultdict, Counter

import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina' 
plt.style.use('seaborn')

import pandas as pd
import spacy
import en_core_web_sm
svoc = spacy.load("en_core_web_sm")

In [2]:
datao = pd.read_pickle("~/OneDrive/kph/processed2.pkl")
datatrain = datao[datao['ext perc']>=3]
datatest = datao[datao['ext perc']<3]
# separate train and validate 
VAL_RATIO = 0.2
dtrain = datatrain.loc[:,['SRC','TRG']]
dtraink = datatrain.loc[:,['SRC','TRG','keywords','ext perc']]
seed=250
idx = np.arange(datatrain.shape[0])
np.random.seed(seed)
np.random.shuffle(idx)
val_size = int(len(idx) * VAL_RATIO)
df_train = dtrain.iloc[idx[val_size:], :]
df_val = dtrain.iloc[idx[:val_size], :]
df_val_k = dtraink.iloc[idx[:val_size], :]
df_test = datatest.loc[:,['SRC','TRG']]

In [278]:
datai = pd.read_json('/home/pding/OneDrive/kph/fullmsy.json', orient='index', convert_dates=False, convert_axes=False)

In [279]:
datai = datai[datai.abstract.notnull()]
datai = datai[datai.title.notnull()]

In [254]:
datai = datai.replace('\n',' ', regex=True)
datai = datai.replace('\t',' ', regex=True)

In [256]:
dataiu = datai.loc[datai.keywords.str.len() ==0]

In [257]:
dataik = datai.loc[datai.keywords.str.len() >0]

In [258]:
dataiu['SRC'] = dataiu.title + ' '+ dataiu.abstract

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataiu['SRC'] = dataiu.title + ' '+ dataiu.abstract


In [9]:
tokenizertrg = lambda x: x.split()
def tokenizersrc(text): # create a tokenizer function
    return [tok.text for tok in svoc.tokenizer(text)]
def read_data(df_train, datafields):
    examples = []
    words = []
    labels = []
    for pmid in df_train.index:
        words = tokenizersrc(df_train.loc[pmid,'SRC'])
        labels = tokenizertrg(df_train.loc[pmid,'TRG'])
        examples.append(torchtext.data.Example.fromlist([words, labels], datafields))
    return torchtext.data.Dataset(examples, datafields)

In [10]:
from torchcrf import CRF

class RNNCRFTagger(nn.Module):
    
    def __init__(self, text_field, label_field, emb_dim, rnn_size, update_pretrained=False):
        super().__init__()
        
        voc_size = len(text_field.vocab)
        self.n_labels = len(label_field.vocab)       
        
        self.embedding = nn.Embedding(voc_size, emb_dim)
        if text_field.vocab.vectors is not None:
            self.embedding.weight = torch.nn.Parameter(text_field.vocab.vectors, 
                                                       requires_grad=update_pretrained)

        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=rnn_size, 
                          bidirectional=True, num_layers=1)

        self.top_layer = nn.Linear(2*rnn_size, self.n_labels)
 
        self.pad_word_id = text_field.vocab.stoi[text_field.pad_token]
        self.pad_label_id = label_field.vocab.stoi[label_field.pad_token]
    
        self.crf = CRF(self.n_labels)
#        self.crf = ConditionalRandomField(self.n_labels, label_encoding="BIO",
#                                          idx2tag=text_field.vocab.itos
#                                         )
    def compute_outputs(self, sentences):
        embedded = self.embedding(sentences)
        rnn_out, _ = self.rnn(embedded)
        out = self.top_layer(rnn_out)
        return out
                
    def forward(self, sentences, labels):
        # Compute the outputs of the lower layers, which will be used as emission
        # scores for the CRF.
        scores = self.compute_outputs(sentences)
        mask0 = sentences != self.pad_word_id
        mask = mask0.byte()
        # We return the loss value. The CRF returns the log likelihood, but we return 
        # the *negative* log likelihood as the loss value.            
        # PyTorch's optimizers *minimize* the loss, while we want to *maximize* the
        # log likelihood.
        return -self.crf(scores, labels, mask=mask)
            
    def predict(self, sentences):
        # Compute the emission scores, as above.
        scores = self.compute_outputs(sentences)
        mask0 = sentences != self.pad_word_id
        mask = mask0.byte()
        # Apply the Viterbi algorithm to get the predictions. This implementation returns
        # the result as a list of lists (not a tensor), corresponding to a matrix
        # of shape (n_sentences, max_len).
        return self.crf.decode(scores, mask=mask)


In [11]:
TEXT = torchtext.data.Field(init_token='<bos>', eos_token='<eos>', sequential=True, lower=False)
LABEL = torchtext.data.Field(init_token='<bos>', eos_token='<eos>', sequential=True, unk_token=None)
fields = [('text', TEXT), ('label', LABEL)]
device = 'cuda'
use_pretrained = False
train_examples = read_data(df_train, fields)
valid_examples = read_data(df_val, fields)
# Load the pre-trained embeddings that come with the torchtext library.
if use_pretrained:
    print('We are using pre-trained word embeddings.')
    TEXT.build_vocab(train_examples, vectors=embvec)
else:  
    print('We are training word embeddings from scratch.')
    TEXT.build_vocab(train_examples, max_size=80000)
LABEL.build_vocab(train_examples)
# Create one of the models defined above.
#self.model = RNNTagger(self.TEXT, self.LABEL, emb_dim=300, rnn_size=128, update_pretrained=False)
model0 = RNNCRFTagger(TEXT, LABEL, emb_dim=200, rnn_size=85, update_pretrained=False)

model0.to(device)

We are training word embeddings from scratch.


RNNCRFTagger(
  (embedding): Embedding(72288, 200)
  (rnn): LSTM(200, 85, bidirectional=True)
  (top_layer): Linear(in_features=170, out_features=6, bias=True)
  (crf): CRF(num_tags=6)
)

In [12]:
model0.load_state_dict(torch.load('/home/pding/OneDrive/kph/kph/lstm1_85crf_549_wv.pt'))

<All keys matched successfully>

In [245]:
def kphext2(sentences,tags):
    kph = []
    for i in range(len(sentences)):
        s0 = svoc.tokenizer(sentences[i])
        s1 = [tok.text for tok in s0]
        t1 = tags[i]
        k1 = []
        for j in range(len(s1)):
            start = j
            if t1[j] == 'B':
                sti = 0
                stop = j+1
                while sti == 0:
                    try: 
                        kt = str(t1[stop])
                        if kt == 'I':
                            stop = stop+1
                        else:
                            k2 = str(s0[start:stop])
                            k1.append(k2)
                            sti =1
                    except(IndexError):
                        k2 = str(s0[start:stop])
                        k1.append(k2)
                        sti =1
                k2 = str(s1[j])
        kph.append(k1)
    return kph

In [102]:
device

'cuda'

In [48]:
def evaltest2(df_val, model):
    # This method applies the trained model to a list of sentences.
    examples = []
    for sen in df_val.SRC:
        words = tokenizersrc(sen)
        labels = ['O']*len(words) # placeholder
        examples.append(torchtext.data.Example.fromlist([words, labels], fields))
    dataset = torchtext.data.Dataset(examples, fields)

    iterator = torchtext.data.Iterator(
        dataset,
        device=device,
        batch_size=300,
        repeat=False,
        train=False,
        sort=False)

    # Apply the trained model to all batches.
    out = []
    model.eval()
    for batch in iterator:
        # Call the model's predict method. This returns a list of NumPy matrix
        # containing the integer-encoded tags for each sentence.
        predicted = model.predict(batch.text)

        # Convert the integer-encoded tags to tag strings.
        #for tokens, pred_sen in zip(sentences, predicted):
        for pred_sen in predicted:
            out.append([LABEL.vocab.itos[pred_id] for  pred_id in  pred_sen[1:-1]])
    return out

In [49]:
augout = evaltest2(dataiu,model0)

In [246]:
klist = kphext2(dataiu.SRC,augout)

In [260]:
for i in range(len(dataiu.index)):
    dataiu.iloc[i,2].extend(list(set(klist[i])))

In [262]:
output = pd.concat([dataik,dataiu], join="inner")

In [281]:
output.to_json('/home/pding/OneDrive/kph/MSaug.json', orient='index')

In [283]:
output.to_json('/home/pding/OneDrive/kph/MSaug2.json', orient='records')

In [282]:
output.to_pickle("/home/pding/OneDrive/kph/msout.pkl")

In [176]:
def safe_value(field_val):
    return field_val if not pd.isna(field_val) else "Other"

In [204]:
def safe_year(field_val):
    return field_val if not pd.isna(field_val) else 1900

In [263]:
output['journal'] = output['journal'].apply(safe_value)
output['conclusions'] = output['conclusions'].apply(safe_value)
output['pubdate'] = output['pubdate'].apply(safe_year)

In [264]:
output['PMID'] = output.index

In [266]:
output.isna().sum()

title          0
abstract       0
keywords       0
authors        0
journal        0
pubdate        0
conclusions    0
PMID           0
dtype: int64

In [166]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers

In [201]:
test_server = [{'host':'127.0.0.1','port':9200}]

In [202]:
es = Elasticsearch(test_server,http_compress=True)

In [192]:
use_these_keys = ['PMID', 'title', 'abstract', 'keywords','authors','pubdate']
def filterKeys(document):
    return {key: document[key] for key in use_these_keys }

In [272]:
def doc_generator(df):
    df_iter = df.iterrows()
    for index, document in df_iter:
        try:
            yield {
                "_index": 'ms',
                "_source": filterKeys(document),
            }
        except StopIteration:
            return

In [273]:
helpers.bulk(es, doc_generator(output))

(72527, [])

In [274]:
output

Unnamed: 0,title,abstract,keywords,authors,journal,pubdate,conclusions,PMID
32544864,Evolution of tongue somatosensory evoked poten...,The aim of the present study was to investigat...,"[Evoked potentials, Multiple sclerosis, Tongue...","[{'lastname': 'Krbot Skorić', 'firstname': 'Ma...",Multiple sclerosis and related disorders,2020.0,This study demonstrates a significant deterior...,32544864
32544855,Baseline cerebral metabolism predicts fatigue ...,"Cerebral metabolic rate of oxygen (CMRO2), a m...","[Cerebral blood flow, Cognition, Energy metabo...","[{'lastname': 'West', 'firstname': 'Kl', 'init...",NeuroImage. Clinical,2020.0,"Altogether, these results suggest that increas...",32544855
32544469,Changes of immune parameters of T lymphocytes ...,Multiple sclerosis (MS) is an autoimmune disea...,"[Bone marrow mesenchymal stem cells, Experimen...","[{'lastname': 'Xin', 'firstname': 'Ying', 'ini...",Immunology letters,2020.0,Other,32544469
32543727,Epstein Barr virus-immortalized B lymphocytes ...,Multiple sclerosis (MS) is the most common aut...,"[Epstein Barr virus, experimental autoimmune e...","[{'lastname': 'Polepole', 'firstname': 'Pascal...",Journal of medical virology,2020.0,Other,32543727
32543241,An evaluation of dimethyl fumarate for the tre...,In recent years there has been a dramatic rise...,"[Dimethyl fumarate, disease modifying therapie...","[{'lastname': 'Valencia-Sanchez', 'firstname':...",Expert opinion on pharmacotherapy,2020.0,Other,32543241
...,...,...,...,...,...,...,...,...
4952377,Sclerotic lesions of bone in myeloma.,Osteolytic defects and osteoporosis are common...,"[myelofibrosis-myelosclerosis syndrome, myelom...","[{'lastname': 'Langley', 'firstname': 'G R', '...",Canadian Medical Association journal,1966.0,Other,4952377
5881652,The effect of diet on the fatty acid compositi...,1. Three groups of female rats (8-12 weeks old...,"[multiple sclerosis, myelin, fatty acid, oleic...","[{'lastname': 'Rathbone', 'firstname': 'L', 'i...",The Biochemical journal,1965.0,Other,5881652
5844205,Alpha-amino nitrogen in cerebrospinal fluid an...,The concentration of alpha-amino nitrogen (a m...,"[multiple sclerosis, total amino-acids, childr...","[{'lastname': 'Williams', 'firstname': 'E M', ...",Journal of clinical pathology,1965.0,Other,5844205
5835446,Lipid studies in the blood and brain in multip...,"The lipid patterns of plasma, red blood cells,...","[multiple sclerosis, thin-layer, oleic acids, ...","[{'lastname': 'Cumings', 'firstname': 'J N', '...",Journal of clinical pathology,1965.0,Other,5835446
