In [1]:
# Mount Google Drive at /content/drive; the project code, data and saved models
# referenced by later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Extend the import search path with the project checkout on Drive.
# Presumably this is what makes the `dataset` and `query_model` packages
# imported in later cells resolvable -- TODO confirm.
import sys
sys.path.append('/content/drive/My Drive/APT/newest/TAR-project-master')

In [3]:
pip install num2words



In [4]:
pip install langdetect



In [5]:
pip install -U sentence-transformers

Requirement already up-to-date: sentence-transformers in /usr/local/lib/python3.6/dist-packages (0.2.6.1)


In [6]:
import pandas as pd
import numpy as np
import nltk

# Fetch the NLTK resources needed later: 'stopwords' backs the
# stopwords.words('english') calls in the TF-IDF and BM25 sections; 'punkt' is
# presumably required by the project's tokenisation/preprocessing -- TODO confirm.
nltk.download('stopwords')
nltk.download('punkt')

# Project-local stemming helper.
# NOTE(review): `word_stem` is not referenced in the cells visible here.
from dataset.preprocessing.preprocessing import word_stem

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# CSV of English COVID abstracts on the mounted Drive; the preview below shows
# the columns paper_id, section, text and preprocessed_text.
abstracts_path = '/content/drive/My Drive/APT/temp_data/english_covid_abstracts_data.csv'
abstracts = pd.read_csv(filepath_or_buffer=abstracts_path)
# Preview the first five rows.
abstracts.head(5)

Unnamed: 0,paper_id,section,text,preprocessed_text
0,86a998617c077f4fe2ab26214995a3548fbc0fc5,Abstract,The recent emergence of the Middle East respir...,the recent emerg middl east respiratori syndro...
1,306ef95a3a91e13a93bcc37fb2c509b67c0b5640,Abstract,Thousands of people in the United States have ...,thousand peopl unit state requir test sars-cov...
2,5e0c586f047ff909c8ed3fe171c8975a90608d08,Abstract,Background: Porcine epidemic diarrhea virus (P...,background porcin epidem diarrhea viru pedv em...
3,1579fbff7af9b156c6f49fee0526e48f852ea460,Abstract,"Currently, live-attenuated IBV vaccines are us...",current live-attenu ibv vaccin use control dis...
4,f6b29be971089bfe0916c64ab9fbddcec38a7436,Abstract,Aims: To determine analytical capabilities of ...,aim determin analyt capabl commonli use faecal...


## Queries

In [0]:
# Free-text questions that are run against each query engine in the sections below.
queries = [
    "incubation period of COVID-19",
    "Does smoking increase risks when having covid19?",
    "mortality rate of covid19?",
    "pregnancy complications covid19?",
]

def run_queries(query_engine, queries, k=5):
  """Run every query through ``query_engine`` and stack the per-query results.

  Args:
    query_engine: object exposing ``run_query(query, k)``; each call is
      expected to return a pandas DataFrame (or Series) of results.
    queries: iterable of query strings; may be empty and need not be indexable.
    k: value forwarded unchanged as the second argument of ``run_query``.

  Returns:
    The per-query results concatenated in query order with their original
    index labels kept (callers reset the index themselves). An empty
    DataFrame is returned when ``queries`` is empty.
  """
  # Collect first and concatenate once: concatenating inside the loop re-copies
  # the accumulated frame on every iteration.
  frames = [query_engine.run_query(q, k) for q in queries]
  if not frames:
    # pd.concat raises on an empty list; an empty frame is the natural result.
    return pd.DataFrame()
  return pd.concat(frames)

## TF-IDF

In [0]:
from nltk.corpus import stopwords
from query_model.query_engines import BOWQueryEngine
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [0]:
# Build the bag-of-words engine with TF-IDF weighting and fit it on the abstracts.
stop_words = set(stopwords.words('english'))
# scikit-learn documents `stop_words` as 'english', a list, or None, and newer
# releases validate the type, so pass a list. Sorting keeps the order deterministic.
cv = CountVectorizer(stop_words=sorted(stop_words))
transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
query_engine = BOWQueryEngine(cv, transformer)
query_engine.fit(abstracts)

In [0]:
# To load an already-trained model, uncomment the line below:
# query_engine = BOWQueryEngine.load('/content/drive/My Drive/APT/trained_models/tf_idf.dat')

In [37]:
# Persist the current engine; the recorded run wrote it to
# /content/drive/My Drive/APT/trained_models/tf_idf.dat.
query_engine.save('/content/drive/My Drive/APT/trained_models/', 'tf_idf')

Writing object to /content/drive/My Drive/APT/trained_models/tf_idf.dat


In [0]:
# Run all queries against the current engine, then render the `query`, `text`
# and `sim` columns in a small font.
res = run_queries(query_engine, queries)
(
    res
    .reset_index(drop=True)[['query', 'text', 'sim']]
    .style
    .set_properties(**{'font-size': '9pt'})
)

## BM25

In [0]:
from nltk.corpus import stopwords
from query_model.query_engines import BOWQueryEngine
from query_model.transformers.bm25 import BM25Transformer

In [0]:
# CountVectorizer is imported here too, so this section does not depend on the
# TF-IDF section's import cell having been executed first.
from sklearn.feature_extraction.text import CountVectorizer

# Build the bag-of-words engine with BM25 weighting and fit it on the abstracts.
stop_words = set(stopwords.words('english'))
# scikit-learn documents `stop_words` as 'english', a list, or None, and newer
# releases validate the type, so pass a list. Sorting keeps the order deterministic.
cv = CountVectorizer(stop_words=sorted(stop_words))
transformer = BM25Transformer()
query_engine = BOWQueryEngine(cv, transformer)
query_engine.fit(abstracts)

In [0]:
# To load an already-trained model, uncomment the line below:
# query_engine = BOWQueryEngine.load('/content/drive/My Drive/APT/trained_models/bm25.dat')

In [23]:
# Persist the current engine; the recorded run wrote it to
# /content/drive/My Drive/APT/trained_models/bm25.dat.
query_engine.save('/content/drive/My Drive/APT/trained_models/', 'bm25')

Writing object to /content/drive/My Drive/APT/trained_models/bm25.dat


In [0]:
# Run all queries against the current engine, then render the `query`, `text`
# and `sim` columns in a small font.
res = run_queries(query_engine, queries)
(
    res
    .reset_index(drop=True)[['query', 'text', 'sim']]
    .style
    .set_properties(**{'font-size': '9pt'})
)

## Word2Vec

In [0]:
from query_model.query_engines import W2VQueryEngine

In [0]:
# TODO: training for this engine is still marked as work in progress.
# Word2Vec hyper-parameters, handed to the engine as a plain dict.
# NOTE(review): presumably forwarded to gensim's Word2Vec, where `size` is the
# pre-4.0 name of `vector_size` -- confirm against the installed version.
params = dict(
    min_count=5,
    size=300,
    workers=3,
    window=3,
    sg=1,
)
query_engine = W2VQueryEngine(params)
query_engine.fit(abstracts)

In [0]:
# To load an already-trained model, uncomment the line below:
#query_engine = W2VQueryEngine.load('/content/drive/My Drive/APT/trained_models/w2v.dat')

In [17]:
# Persist the current engine; the recorded run wrote it to
# /content/drive/My Drive/APT/trained_models/w2v.dat.
query_engine.save('/content/drive/My Drive/APT/trained_models/', 'w2v')

Writing object to /content/drive/My Drive/APT/trained_models/w2v.dat


In [0]:
# Run all queries against the current engine, then render the `query`, `text`
# and `sim` columns in a small font.
res = run_queries(query_engine, queries)
(
    res
    .reset_index(drop=True)[['query', 'text', 'sim']]
    .style
    .set_properties(**{'font-size': '9pt'})
)

## Doc2Vec

In [0]:
from query_model.query_engines import D2VQueryEngine

In [0]:
# Doc2Vec hyper-parameters, handed to the engine as a plain dict.
params = dict(
    vector_size=300,
    min_count=5,
    negative=5,
    hs=0,
    sample=1e-5,
    epochs=400,
    window=15,
)
query_engine = D2VQueryEngine(params)
query_engine.fit(abstracts)

In [0]:
# To load an already-trained model, uncomment the line below:
#query_engine = D2VQueryEngine.load('/content/drive/My Drive/APT/trained_models/d2v.dat')

In [43]:
# Persist the current engine; the recorded run wrote it to
# /content/drive/My Drive/APT/trained_models/d2v.dat.
query_engine.save('/content/drive/My Drive/APT/trained_models/', 'd2v')

Writing object to /content/drive/My Drive/APT/trained_models/d2v.dat


In [24]:
# Run all queries against the current engine, then render the `query`, `text`
# and `sim` columns in a small font.
res = run_queries(query_engine, queries)
(
    res
    .reset_index(drop=True)[['query', 'text', 'sim']]
    .style
    .set_properties(**{'font-size': '9pt'})
)

100%|██████████| 1/1 [00:00<00:00, 212.97it/s]
100%|██████████| 1/1 [00:00<00:00, 221.78it/s]
100%|██████████| 1/1 [00:00<00:00, 204.20it/s]
100%|██████████| 1/1 [00:00<00:00, 204.89it/s]


Unnamed: 0,query,text,sim
0,incub period covid-19 .,"The recent emergence of the Middle East respiratory syndrome (MERS)-CoV, a close relative of the Severe Acute respiratory syndrome (SARS)-CoV, both of which caused a lethal respiratory infection in humans, reinforces the need for further understanding of coronavirus pathogenesis and the host immune response. These viruses have evolved diverse strategies to evade and block host immune responses, facilitating infection and transmission. Pathogenesis following infection with these viruses is characterized by a marked delay in the induction of Type I interferon (IFN I) and, subsequently, by a poor adaptive immune response. Therapies that expedite IFN I induction as well as interventions that antagonize immunoevasive virus proteins are thus promising candidates for immune modulation.",0.994406
1,incub period covid-19 .,"Background: Porcine epidemic diarrhea virus (PEDV) is emerging as a pathogenic coronavirus that causes a huge economic burden to the swine industry. Interaction of the viral spike (S) surface glycoprotein with the host cell receptor is recognized as the first step of infection and is the main determinant of virus tropism. The mechanisms by which neutralizing antibodies inhibit PEDV have not been defined. Isolating PEDV neutralizing antibodies are crucial to identifying the receptor-binding domains of the viral spike and elucidating the mechanism of protection against PEDV infection. Methods: B cell hybridoma technique was used to generate hybridoma cells that secrete specific antibodies. E.coli prokaryotic expression system and Bac-to-Bac expression system were used to identify the target protein of each monoclonal antibody. qPCR was performed to analyze PEDV binding to Vero E6 cells with neutralizing antibody. Results: We identified 10 monoclonal antibodies using hybridoma technology. Remarkably, 4 mAbs (designed 2G8, 2B11, 3D9, 1E3) neutralized virus infection potently, of which 2B11 and 1E3 targeted the conformational epitope of the PEDV S protein. qPCR results showed that both 2B11 and 2G8 blocked virus entry into Vero cells. Conclusion: The data suggested that PEDV neutralizing antibody inhibited virus infection by binding to infectious virions, which could work as a tool to find the receptor-binding domains.",0.980386
2,incub period covid-19 .,"Thousands of people in the United States have required testing for SARS-CoV-2. Evaluation for a special pathogen is resource intensive. We report an innovative approach to home assessment that, in collaboration with public health, enables safe evaluation and specimen collection outside the healthcare setting, avoiding unnecessary exposures and resource utilization.",0.980239
3,incub period covid-19 .,"Aims: To determine analytical capabilities of a commonly used faecal immunochemical test (FIT) to detect haemoglobin (Hb) in the context of NICE guidance DG30, and the likely use of FIT to reprioritise patients delayed by the COVID-19 pandemic. Methods: Data obtained from independent verification studies and clinical testing of the HM-JACKarc FIT method in routine primary care practice were analysed to derive analytical performance characteristics. Results: Detection capabilities for the FIT method were 0.5 µg/g (limit of blank), 1.1 (limit of detection) and 15.0 µg/g (limit of quantification). 31 of 33 (94%) non-homogenised specimens analysed in triplicate were consistently categorised relative to 10 µg/g compared to all 33 (100%) homogenised specimens. Imprecision in non-homogenised specimens was higher (median 27.8%, (range 20.5% -48.6%)) than in homogenised specimens (10.",0.978179
4,incub period covid-19 .,"The rapid outbreak of the new Coronavirus pandemic and the spread of the virus worldwide, especially in the Northern Hemisphere, have prompted various investigations about the impact of environmental factors on the rate of development of this epidemic. Different studies have called the attention to various parameters that may have influenced the spread of the virus, and in particular, the impact of climatic parameters has been emphasized. The main purpose of this study is to investigate the correlation between the average daily temperature and the rate of coronavirus epidemic growth in the infected regions. The main hypothesis object of our research is that between regions exhibiting a significant difference in the mean daily temperature, a significant difference is also observed in the average cumulative daily rate of confirmed cases, and that this does not happen if there is no significant difference in mean daily temperature. To test this research hypothesis, we carried on the case study of three regions in each of five countries and analyzed the correlation through F-test, and Independent-Samples T-Test. In all five selected countries, we found that when there is a significant difference in the daily mean temperature between two regions of a country, a significant difference exists also in the average cumulative daily rate of confirmed cases. Conversely, if there are no significant differences in the mean daily temperature of two regions in the same country, no significant difference is observed in the average cumulative daily rate of confirmed cases for these regions.",0.921884
5,doe smoke increas risk covid19 .,"and subtropical countries and is a significant public health concern and socioeconomic burden. There is an urgent need to develop antivirals that can effectively reduce dengue virus (DENV) replication and decrease viral load. Niclosamide, an antiparasitic drug approved for human use, has been recently identified as an effective antiviral agent against a number of pH-dependent viruses, including flaviviruses. Here, we reveal that neutralization of low-pH intracellular compartments by niclosamide affects multiple steps of the DENV infectious cycle. Specifically, niclosamide-induced endosomal neutralization not only prevents viral RNA replication but also affects the maturation of DENV particles, rendering them non-infectious. We found that niclosamide-induced endosomal neutralization prevented E glycoprotein conformational changes on the virion surface of flaviviruses, resulting in the release of non-infectious immature virus particles with uncleaved pr peptide from host cells. Collectively, our findings support the potential application of niclosamide as an antiviral agent against flavivirus infection and highlight a previously uncharacterized mechanism of action of the drug. Dengue virus (DENV) represents a major mosquito-borne pathogen responsible for significant public health and socioeconomic burden in large regions of tropical and subtropical countries 1,2 . There are four distinct serotypes, DENV-1 to DENV-4, that are transmitted mainly by mosquitoes of Aedes species, which continuously spread to new geographical areas around the world 3 . The World Health Organization estimates a prevalence of 50-100 million cases of DENV infection per year; however, a recent global estimate study suggested that 390 million DENV infections occur annually, of which 96 million cases have clear symptoms 2 . 
DENV infection causes a wide range of clinical symptoms, from acute febrile illness (dengue fever) to life-threatening haemorrhagic fever/dengue shock syndrome 1 . To date, clinically approved therapeutic options for treating DENV-infected patients are still lacking. DENV is an enveloped, single-stranded, positive-sense RNA virus that belongs to Flavivirus genus in the family Flaviviridae 4 . The genus Flavivirus comprises many important emerging arboviruses including Japanese encephalitis virus, West Nile virus and Zika virus (ZIKV). Recently, ZIKV infection has emerged as a global public health concern due to its association with newborn microcephaly 5,6 and neurological sequelae such as Guillain-Barré syndrome, meningoencephalitis, and myelitis in infected adults 6-10 . The flavivirus genome is approximately 11 kb in length and encodes a polyprotein that is processed into three structural (capsid [C], premembrane [prM], and envelope [E]) and seven non-structural proteins (NS1, NS2A, NS2B, NS3, NS4A, NS4B, www.nature.com/scientificreports www.nature.com/scientificreports/ and NS5) by cellular and viral proteases 11,12 . Flavivirus infection is initiated by attachment of the virus to a cellular receptor on the plasma membrane followed by receptor-mediated endocytosis and transportation of viral particles to endosomes 13, 14 . Viral membrane fusion with the endosomal membrane is triggered upon exposure of the virus to the low-pH environment of endosomes, through which the viral genome is released into the cytoplasm 15-19 . Following RNA replication and protein translation, immature virions containing prM proteins are assembled within the endoplasmic reticulum (ER) and mature through passaging the acidic environment of the trans-Golgi network (TGN), wherein E proteins undergo conformational changes and the pr peptides are cleaved by furin endoproteases, after which progeny virions are released from the host cell 20-24 . 
It is well established that neutralization of the acidic TGN environment prevents furin cleavage, resulting in immature particles containing uncleaved prM proteins [25] [26] [27] . These immature particles are non-infectious since the uncleaved prM peptides block the low-pH-induced conformational changes of the viral E proteins essential for binding to the cell surface as well as membrane fusion of the virus during entry 23, 26, 28, 29 . Thus, several studies have shown that lysosomotropic agents, such as chloroquine, exert modest antiviral effects against pH-dependent viruses, including flaviviruses, by interfering with endosomal fusion and furin-dependent maturation in vivo and in vitro 23,30-32 . Recently, niclosamide, a U.S. Food and Drug Administration (FDA)-approved antiparasitic drug used in humans 33-35 , has been identified as an effective antiviral agent against a number of pH-dependent viruses, such as human rhinoviruses and influenza virus 36 , severe acute respiratory syndrome-coronavirus 37 , Chikungunya virus 38 , and flaviviruses [39] [40] [41] . These studies suggested that the broad antiviral activity of niclosamide is associated with neutralization of endo-lysosomal pH that interferes with pH-dependent membrane fusion which is a critical step for virus entry 36 . In a recent study, Kao et al. 41 , determined the inhibitory role of endosomal deacidification in DENV viral genome replication and uncoating but not in later steps of the viral life cycle. Therefore, the possible effect of niclosamide-induced neutralization of endosomal compartments on later stages of the DENV infectious cycle remains to be elucidated. In this study, we investigated and confirmed that the neutralization of low-pH intracellular compartments by niclosamide affects multiple steps of the DENV infectious cycle. 
Our data indicate that niclosamide-induced endosomal neutralization prevents viral genome release and replication as well as maturation of DENV particles rendering it non-infectious. Specifically, we found that niclosamide-induced endosomal neutralization blocks conformational changes of E glycoproteins on the virion surface of both DENV and ZIKV, resulting in the release of immature virus particles with uncleaved pr peptide and preventing them from infecting new host cells. Collectively, our findings support the potential application of niclosamide as an antiviral agent against flavivirus infection and highlight a previously uncharacterized mechanism of action of the drug. Scientific RepoRts | (2019) 9:8682 | https://doi.",0.054235
6,doe smoke increas risk covid19 .,"W e are living the most important pandemic in recent world history, caused by a novel coronavirus (SARS-CoV-2), with a significant impact on the economy, public health, and mental health of the entire society. São Paulo is the epicenter of the epidemic in Brazil. Brazilian university hospitals -centers for professional training and qualification, as well as knowledge production -have a major role in combating this epidemic. Coronaviruses belong to a large family of viruses and, for 60 years, have been a known cause of respiratory infection in humans and animals. In December 2019, a novel coronavirus was identified as responsible for the flu syndrome and severe pulmonary complications, the COVID-19. Its origin, still uncertain, is probably related to a mutation in the coronavirus that infects bats, breaking the genetic barrier to adapt to a new species. The original site of transmission was a seafood and live animal market in the city of Wuhan, China. The first cases were linked to individuals who frequented this market. Later, the virus infected family members and, in geometrical progression, nearby provinces, expanding to several countries in all continents. 1,2 The virus is highly contagious through droplets and contact. Estimates indicate that an infected person can transmit the virus to two to four people. 1 The angiotensin-converting enzyme 2 (ACE2), found in the lower respiratory tract of humans, has been identified as a cellular receptor for SARS-CoV-2 and plays an important role in the pathogenesis and spread of the virus. The S-glycoprotein on the surface of the coronavirus can bind to the ACE2 receptor on the surface of cells, especially lung cells, rich in ACE2 receptors. The ribonucleic acid (RNA) of the viral genome is released inside the cell, starting the encoding of structural and accessory proteins, with subsequent release of new viruses. 
This process results in the release of cytokines with intense inflammatory response, leading to respiratory failure, shock, and thromboembolic phenomena related to disseminated intravascular coagulation. 3, 4 Children probably develop milder and oligosymptomatic clinical presentations because the maturity and binding ability of ACE2 might be lower in this population than in adults. This is concerning from an epidemiological perspective, since children can be important reservoirs, becoming sources of infection. 4 The incubation period is, on average, five days, ranging from two to 14 days. Most adults or children infected with SARS-CoV-2 present flu syndrome (90%) with mild symptoms, but some individuals, especially older adults and those with comorbidities, such as cardiovascular or lung diseases, diabetes, and hypertension, might progress to severe conditions: respiratory failure, multiple organ failure, and death. The fatality rate is 2 to 5%. 2 Although children can acquire the infection, they usually have a good prognosis and rarely present complications. 4 No country is prepared to face the COVID-19 epidemic, which imposes substantial negative impacts on the economy, medical care, and mental health of the society as a whole. Brazil, particularly São Paulo, which declared a state of emergency, has implemented adequate preventive measures in accordance with the epidemiological scenario. 5 Rigorous social distancing of the population is crucial, in addition to educational campaigns for hygiene and the proper use of masks. Social distancing measures should be constantly evaluated because if they are lifted before the appropriate moment, that is, before the end of community transmission, we will have a new wave and growth in cases of infection. The suspension of academic COVID-19 in university hospitals 5. Brazil -Diário Oficial de São Paulo [homepage on the Internet]. 
Decreto Nº 64.879 que reconhece o estado de Calamidade Pública no estado por conta da pandemia do Coronavírus (COVID-19) 21 de Março de 2020 [cited 2020 Abr 05].",0.049704
7,doe smoke increas risk covid19 .,"Background: Porcine epidemic diarrhea virus (PEDV) is emerging as a pathogenic coronavirus that causes a huge economic burden to the swine industry. Interaction of the viral spike (S) surface glycoprotein with the host cell receptor is recognized as the first step of infection and is the main determinant of virus tropism. The mechanisms by which neutralizing antibodies inhibit PEDV have not been defined. Isolating PEDV neutralizing antibodies are crucial to identifying the receptor-binding domains of the viral spike and elucidating the mechanism of protection against PEDV infection. Methods: B cell hybridoma technique was used to generate hybridoma cells that secrete specific antibodies. E.coli prokaryotic expression system and Bac-to-Bac expression system were used to identify the target protein of each monoclonal antibody. qPCR was performed to analyze PEDV binding to Vero E6 cells with neutralizing antibody. Results: We identified 10 monoclonal antibodies using hybridoma technology. Remarkably, 4 mAbs (designed 2G8, 2B11, 3D9, 1E3) neutralized virus infection potently, of which 2B11 and 1E3 targeted the conformational epitope of the PEDV S protein. qPCR results showed that both 2B11 and 2G8 blocked virus entry into Vero cells. Conclusion: The data suggested that PEDV neutralizing antibody inhibited virus infection by binding to infectious virions, which could work as a tool to find the receptor-binding domains.",0.027401
8,doe smoke increas risk covid19 .,"The recent emergence of the Middle East respiratory syndrome (MERS)-CoV, a close relative of the Severe Acute respiratory syndrome (SARS)-CoV, both of which caused a lethal respiratory infection in humans, reinforces the need for further understanding of coronavirus pathogenesis and the host immune response. These viruses have evolved diverse strategies to evade and block host immune responses, facilitating infection and transmission. Pathogenesis following infection with these viruses is characterized by a marked delay in the induction of Type I interferon (IFN I) and, subsequently, by a poor adaptive immune response. Therapies that expedite IFN I induction as well as interventions that antagonize immunoevasive virus proteins are thus promising candidates for immune modulation.",0.021294
9,mortal rate covid19 .,"The recent emergence of the Middle East respiratory syndrome (MERS)-CoV, a close relative of the Severe Acute respiratory syndrome (SARS)-CoV, both of which caused a lethal respiratory infection in humans, reinforces the need for further understanding of coronavirus pathogenesis and the host immune response. These viruses have evolved diverse strategies to evade and block host immune responses, facilitating infection and transmission. Pathogenesis following infection with these viruses is characterized by a marked delay in the induction of Type I interferon (IFN I) and, subsequently, by a poor adaptive immune response. Therapies that expedite IFN I induction as well as interventions that antagonize immunoevasive virus proteins are thus promising candidates for immune modulation.",0.988963


## BERT

In [0]:
from sentence_transformers import SentenceTransformer
from query_model.query_engines import BERTQueryEngine

In [46]:
import torch
# Show which accelerator the runtime provides. The call is guarded so that a
# CPU-only runtime gets a message instead of an exception that would halt
# "Run all".
torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'No CUDA device available (running on CPU)'

'Tesla P100-PCIE-16GB'

In [47]:
# Build the BERT-based engine and fit it on the abstracts.
# NOTE(review): the recorded run shows a ~405 MB download at this step
# (presumably pretrained weights) -- expect a delay on the first run.
query_engine = BERTQueryEngine()
query_engine.fit(abstracts)

100%|██████████| 405M/405M [00:45<00:00, 8.91MB/s]


In [0]:
# To load an already-trained model, uncomment the line below:
#query_engine = BERTQueryEngine.load('/content/drive/My Drive/APT/trained_models/BERT.dat')

In [0]:
# Persist the current engine; the recorded run wrote it to
# /content/drive/My Drive/APT/trained_models/BERT.dat.
query_engine.save('/content/drive/My Drive/APT/trained_models/', 'BERT')

Writing object to /content/drive/My Drive/APT/trained_models/BERT.dat


In [0]:
# Run all queries against the current engine, then render the `query`, `text`
# and `sim` columns in a small font.
res = run_queries(query_engine, queries)
(
    res
    .reset_index(drop=True)[['query', 'text', 'sim']]
    .style
    .set_properties(**{'font-size': '9pt'})
)