# Preparation Phase

In [1]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import xml.etree.ElementTree as ET
import os

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_summarizer():
    model_name = 'google/pegasus-xsum'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    summarizer_model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
    return tokenizer, summarizer_model

def read_xml_files():
    documents = []
    for filename in os.listdir('data'):
        if filename.endswith(".xml"):
            tree = ET.parse('data/' + filename)
            root = tree.getroot()
            for sec in root:
                if len(documents) > 1000:
                    break
                text = sec.find(".//AbstractText")
                if text != None:
                    if text.text != None:
                        if len(text.text) > 1500:
                            documents.append(text.text)
            print("finished doc" + filename)
            continue
        else:
            continue

    return documents

def read_corpus(documents, tokens_only=False):
    i = 0
    for doc in documents:
        tokens = simple_preprocess(doc)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield TaggedDocument(tokens, [i])
        i = i+1

def most_similar(text):
    processed_query = simple_preprocess(text)
    v1 = model.infer_vector(processed_query)
    similar_doc = model.docvecs.most_similar([v1])
    print("similar_doc", similar_doc)
    return documents[similar_doc[0][0]]

def summarize(text):
    batch = tokenizer([text], truncation=True, padding='longest', return_tensors="pt").to(device)
    translated = summarizer_model.generate(**batch)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]

def train_embedding(train_corpus):
    model = Doc2Vec(vector_size=124, window=20, min_count=2, epochs=200, workers=1)
    model.build_vocab(train_corpus)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return model

In [3]:
tokenizer, summarizer_model = get_summarizer()

In [16]:
documents = read_xml_files()
train_corpus = list(read_corpus(documents))
test_corpus = list(read_corpus(documents, tokens_only=True))

finished docpubmed22n1109.xml
finished docpubmed22n1110.xml
finished docpubmed22n1111.xml
finished docpubmed22n1112.xml
finished docpubmed22n1113.xml
finished docpubmed22n1114.xml


In [17]:
model = train_embedding(train_corpus)

In [18]:
test_query_full = "Limbic-predominant age-related TDP-43 encephalopathy (LATE) is characterized by the accumulation of TAR-DNA-binding protein 43 (TDP-43) aggregates in older adults. LATE coexists with Lewy body disease (LBD) as well as other neuropathological changes including Alzheimer's disease (AD). We aimed to identify the pathological, clinical, and genetic characteristics of LATE in LBD (LATE-LBD) by comparing it with LATE in AD (LATE-AD), LATE with mixed pathology of LBD and AD (LATE-LBD+AD), and LATE alone (Pure LATE). We analyzed four cohorts of autopsy-confirmed LBD (n=313), AD (n=282), LBD+AD (n=355), and aging (n=111). We assessed the association of LATE with patient profiles including LBD subtype and AD neuropathologic change (ADNC). We studied the morphological and distributional differences between LATE-LBD and LATE-AD. By frequency analysis, we staged LATE-LBD and examined the association with cognitive impairment and genetic risk factors. Demographic analysis showed LATE associated with age in all four cohorts and the frequency of LATE was the highest in LBD+AD followed by AD, LBD, and Aging. LBD subtype and ADNC associated with LATE in LBD or AD but not in LBD+AD. Pathological analysis revealed that the hippocampal distribution of LATE was different between LATE-LBD and LATE-AD: neuronal cytoplasmic inclusions were more frequent in cornu ammonis 3 (CA3) in LATE-LBD compared to LATE-AD and abundant fine neurites composed of C-terminal truncated TDP-43 were found mainly in CA2 to subiculum in LATE-LBD, which were not as numerous in LATE-AD. Some of these fine neurites colocalized with phosphorylated α-synuclein. LATE-LBD staging showed LATE neuropathological changes spread in the dentate gyrus and brainstem earlier than in LATE-AD. The presence and prevalence of LATE in LBD associated with cognitive impairment independent of either LBD subtype or ADNC; LATE-LBD stage also associated with the genetic risk variants of TMEM106B rs1990622 and GRN rs5848. These data highlight clinicopathological and genetic features of LATE-LBD."

# print the summary
query_summarized = summarize(test_query_full)
print(query_summarized)

Limbic-dominant age-related encephalopathy coexists with Lewy disease as well as other neuropathological changes including Alzheimer's disease.


In [19]:
print("Input text:", query_summarized)
print("Most similar text:\n", most_similar(query_summarized))

Input text: Limbic-dominant age-related encephalopathy coexists with Lewy disease as well as other neuropathological changes including Alzheimer's disease.
similar_doc [(392, 0.5208674073219299), (503, 0.5182560682296753), (28, 0.47217419743537903), (568, 0.44974690675735474), (47, 0.4488297998905182), (79, 0.44599032402038574), (6, 0.44519805908203125), (42, 0.4409126043319702), (948, 0.4400466978549957), (200, 0.43958765268325806)]
Most similar text:
 Two-spotted spider mite, Tetranychus urticae Koch (Acari: Tetranychidae), is a cosmopolitan pest species that can feed on more than 1000 host plant species. Historically, organophosphate (OP) and carbamate insecticides have been used to control this extremely polyphagous pest. However, its ability to develop acaricide resistance rapidly has led to failure in control. Mutations in acetylcholinesterase gene (ace), the target-site of OP and carbamate insecticides, have been reported to be one of the major mechanisms underlying this develop

  similar_doc = model.docvecs.most_similar([v1])


# Use Case

In [20]:
query = "breast cancer treatment"

In [21]:
print("Input text:", query)
most_similar_result = most_similar(query)
print("Most similar text:\n", most_similar_result)
print("Summary of most similar text:\n", summarize(most_similar_result))

Input text: breast cancer treatment
similar_doc [(255, 0.6316928267478943), (106, 0.6054051518440247), (923, 0.5856133103370667), (978, 0.5837216377258301), (572, 0.5718222856521606), (38, 0.559840738773346), (466, 0.5579856634140015), (24, 0.5553025603294373), (399, 0.5522270798683167), (723, 0.5468785166740417)]
Most similar text:
 The aim of this study was to evaluate the efficacy of the Ovsynch protocol in the treatment of post-service subestrus in individual dairy cows compared to a single administration of PGF2α. The study was performed on 517 Polish Friesian Holstein cows with post-service anestrus over four years in 3 dairy herds under a herd health program. Cows (n=240) diagnosed ultrasonographically as non-pregnant and with a mature corpus were treated with a single PGF2α administration and inseminated at detected estrus. Cows without corpus (n=277) were treated with the Ovsynch protocol. The estrus detection rate after PGF2α administration, percentages of cows pregnant after

  similar_doc = model.docvecs.most_similar([v1])


Summary of most similar text:
 The Ovsynch protocol has been used for the treatment of post-service anestrus in dairy cows for many years.
