In [None]:
import pandas as pd
import numpy as np
import torch
from torch.autograd import Variable
from torch import nn,Tensor
from collections import defaultdict
import re
from transformers import RobertaForMaskedLM,RobertaTokenizer,BertSquadForQuestionAnswering,BertTokenizer,BartForConditionalGeneration
from transformers import BartTokenizer
device = 'cuda' if torch.cuda.is_available else cpu



In [29]:
# After a search at clinicaltrials.org here were the results for the drugs for intervention of HIV infections;
import string as str
candidate_drugs = ['efavirenz','tenofovir','abacavir','SP01A','Atorvastatin','Sorividine','Raltegravir']
def extract_candidates():
    for i in range(len(candidate_drugs)):
        for drugs in candidate_drugs:
            drugs = drugs.strip('')
            drugs = drugs.lower()
            return drugs

In [30]:
extract_candidates()

'efavirenz'

In [None]:
# The project is in such a way that the predictions scores are calculated on queries filtered by target tokens to score relevance
checkpoint = 'RobertaLM'
model = RobertaForMaskedLM.from_pretrained(checkpoint)
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)

In [2]:
# Roberta Scores model 
class RobertaScore(nn.Module):
    """Args :
    input-  the drugs under study 
    model - This is the pretrained model which we intend to do a search ,in this case the Roberta Model
    score,seq_score - These are predictions scores and the efficacy scores. I used softmax since it returns values between 0 and 1
    This function works in this way : A query is sent to the pretrained roberta model,using the target, prediction scores are calculated and returned as
    probability scores.The scores are then used to score our drugs which we can then use in the drug discovery 
    
    Returns :
    scores,seq scores
        """
        def __init__(self,model=model,dataset=None):
            super().__init__()
            self.outmodel = RobertaForMaskedLM.from_pretrained(model)
        def forward(self,word_mask,word_id,target_mask,target_id):
            m= nn.Softmax(dim=2)
            output_embeds = self.outmodel(word_id,token_type_ids=None,attention_mask=word_mask)
            soft_embeds = m(output_embeds[0])
            seq_score = torch.sum(soft_embeds.detach()[:,:,target_id],dim=2)
            score = torch.sum(seq_score,dim=1)
            score = score/(np.count_nonzero(word_id)*np.count_nonzero(target_id))
            return score,seq_score

NameError: name 'nn' is not defined

Extractive Summarization
Building on this idea of target query filtering on the Roberta Model ,For clarity on what we are aiming at I will use 
Text Summarization to highlight sections of the text

In [24]:
class RobertaPassageScore(nn.Module):
    # Here we will score a source phrase using the target phrase for a prediction score
    def __init__(self,model=model,dataset=None):
        super(RobertaPassageScore,self).__init__()
        self.outmodel = RobertaForMaskedLM.from_pretrained(model)
    def forward(self,word_id,word_mask,target_id,target_mask):
        output_embeds = self.outmodel(word_id,token_type_ids=None,attention_mask=word_mask)
        m = nn.Softmax(dim=2)
        soft_embeds = m(output_embeds[0])
        seq_score = torch.sum(output_embeds.detach()[:,:,target_id],dim=2)
        score = torch.sum(seq_score,dim=1)
        score = score/(np.count_nonzero(target_id)*np.count_nonzero(word_id))
        return score,seq_score
max_seq_len =128
def convert_text_to_score_input(text):
    tokenized_text = tokenizer.tokenize(text)
    # We will have to check if the lenght of the tokenized text is greater than the sequence lenght
    # We will then use the text upto the maximum sequence lenght using indexing
    if len(tokenized_text)>max_seq_len :
        tokenized_text = tokenized_text[:max_seq_len]
        # As with all pretrained LMs we have to convert all the tokenized text to ids ,then make them tensors
        token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
        token_ids_tensor = torch.tensor(token_ids)
        padding = [0]*max_seq_len-len(token_ids)
        idxes = torch.arange(0,max_seq_len,out=torch.LongTensor(max_seq_len)).unsqueeze(dim=0)
        mask = Variable((idxes<len(tokenized_text)).float())
        return token_ids_tensor.unsqueeze(0),mask
def convert_text_to_score_target(text):
    tokenized_text = tokenizer.tokenize(text)
    return tokenizer.convert_tokens_to_ids(tokenized_text)
model = RobertaPassageScore()
text = 'efavirenz is used in the treatment of'
targettext = 'efficacy'
targets = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(targettext))
targetinput,targetmask = convert_text_to_score_input(targettext)
scoreinput,mask = convert_text_to_score_input(text.split('.')[0])
score,seqscore = model(scoreinput,mask,targets,mask)

text_scores = []
for sent in text.split('.'):
    score_input = convert_text_to_score_input(sent)
    score,seqscores = model(score_input,mask,targets,mask)
    textscores.append(score)
    print(f"Highscores for {text.split('.')[3:4],textscores[3:4]}")
detections = 'efavirenz is used in the treatment of '
startends = []
for detection in detections:
    start,end = text.find(detection),text.find(detection)+len(detection)
    startends.append(start,end)
startends
ents = []
for idx in range(len(startends)):
    ent['start'] =startends['idx'][0]
    ent['end'] = startend['idx'][1]
    ent['label'] = 'efficacy'
    ents.append(ent)
# Display the Texts or passage highlighting from the Numerous articles in Roberta 
ex =[{'text':text,
     "ents":ents,
     "title":'Study Of Efavirenz'}]
spacy.displacy.render(ex,style="ent",manual=True)


NameError: name 'nn' is not defined

# FORWARD CHAINING ANALYSIS +PREDICTIVE MODEL:
USING PREDICTION SCORES TO ESTIMATE WHICH DRUGS WILL PASS THE CLINICAL TRIALS AND CREATING A PREDICTIVE MODEL


In [None]:
target = 'clinical trial efficacy'
target = convert_target_to_score_input(target)

class RobertaScore(nn.Module):
    def __init__(self,model=model,dataset = None):
        super(RobertaScore,self).__init__()
        self.outmodel = RobertaForMaskedLM.from_pretrained(model)
    def forward(self,word_mask,word_id,target_mask,target_id):
        m = nn.Softmax(dim=2)
        output_embeds = self.outmodel(word_id,token_type_ids=None,attention_mask = word_mask)
        soft_embeds = m(output_embeds[0])
        seq_score = torch.sum(soft_embeds.detach()[:,:,target_id],dim=2)
        score = torch.sum(seq_score,dim=1)
        score = score/(np.count_nonzero(target_id)*np.count_nonzero(word_id))
        return score,seq_score
model=RobertaScore()
# Creating a predictive model
candidate_drugs = ['efavirenz','tenofovir','abacavir','SP01A','Atorvastatin','Sorividine','Raltegravir']
drugresults ={}
for drug in candidate_drugs:
    for targ in targets:
        text,target = drug,targ
        score_input,mask = convert_text_to_score_input(text)
        target = convert_text_to_score_target(target)
        score,seqscore = model(score_input,mask,target,mask)
        controlresults[drug]=score


# THE END : Thank You