In [None]:
import nltk
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re #import the regular expression library
import string
import os
import json
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import load_files
from nltk import pos_tag
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
from nltk.corpus import stopwords #import the stopwords from the ntlk.corpus library
nltk.download('stopwords')
from nltk.tokenize import word_tokenize #import the word_tokenize method, which is used to turn sentences into words"
nltk.download('punkt')
from collections import Counter
from nltk.stem import WordNetLemmatizer, PorterStemmer
nltk.download('wordnet')
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
# Download the spacy bio parser
from IPython.utils import io
with io.capture_output() as captured:
    !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz
import en_core_sci_lg  # model downloaded in previous step

In [None]:
# Names of the CORD-19 subsets to load. Each subset's parsed PDFs are read from
# <CORD19_ROOT>/<subset>/<subset>/pdf_json.
directories = ["biorxiv_medrxiv"]
CORD19_ROOT = '/kaggle/input/CORD-19-research-challenge'


def read_paper_json(file_path):
    """Read one paper JSON file and return ``[title, abstract, full_text]``.

    title     -- the value stored under ``metadata.title``.
    abstract  -- the value stored under ``abstract`` exactly as found in the
                 file, or an empty string when the key is missing.
    full_text -- the ``text`` of every ``body_text`` entry, each followed by
                 a blank line.
    """
    # Context manager so the handle is closed even if JSON parsing fails.
    with open(file_path, "rb") as paper_file:
        paper = json.load(paper_file)
    title = paper['metadata']['title']
    # A missing abstract must reset the value; otherwise the previous paper's
    # abstract would leak into this row.
    abstract = paper.get('abstract', "")
    full_text = "".join(section['text'] + '\n\n' for section in paper['body_text'])
    return [title, abstract, full_text]


FullPaper = []
for directory in directories:
    pdf_json_dir = f"{CORD19_ROOT}/{directory}/{directory}/pdf_json"
    if not os.path.isdir(pdf_json_dir):
        # Dataset not mounted: leave the list empty instead of raising.
        continue
    # Sorted so the row order is reproducible between runs.
    for filename in sorted(os.listdir(pdf_json_dir)):
        file_path = f"{pdf_json_dir}/{filename}"
        if os.path.isfile(file_path):
            FullPaper.append(read_paper_json(file_path))

FullPaperDataframe = pd.DataFrame(FullPaper, columns=['title', 'abstract', 'full_text'])

> Our aim is to address these points:
* Capabilities to discover a therapeutic (not vaccine) for the disease, and clinical effectiveness studies to discover therapeutics, to include antiviral agents.

In [None]:
# Only the first paper is processed here; raise or remove this limit to clean
# the whole corpus.
# NOTE(review): the tf-idf cell uses max_df=0.8, which looks like it would prune
# every term when there is only one document - confirm before running it on head(1).
full_text = FullPaperDataframe['full_text'].head(1)

wnl = WordNetLemmatizer()
porter = PorterStemmer()
# scispaCy pipeline loaded without its tagger and NER components. The cleaning
# below does not call it; it is still loaded so the name `parser` stays defined.
parser = en_core_sci_lg.load(disable=["tagger", "ner"])
parser.max_length = 7000000

all_stopwords = stopwords.words('english')
# Every entry needs its own comma: adjacent string literals are silently
# concatenated into a single word.
our_stopwords = ['the', 'we', 'it', 'they', 'copyright', 'https', 'et', 'al', 'preprint',
                 'this', 'these', 'also', 'however', 'although', 'among', 'in', 'medrxiv',
                 'biorxiv', 'license', 'without', 'fig', 'figure']
all_stopwords.extend(our_stopwords)
stopword_set = set(all_stopwords)  # set membership is O(1) per token

# Punctuation to strip. '-', '/' and '_' are not listed, so hyphenated terms
# such as "covid-19" are left intact. Raw string: the backslash only escapes ']'
# inside the regex character class.
pattern = r"""!"#$%&'()*+,.:;<=>?@[\]^`{|}~"""
punctuation_re = re.compile(rf"[{pattern}]")
single_letter_re = re.compile(r'\b[a-zA-Z]\b')
# Whitespace-delimited runs of digits. Lookarounds do not consume the
# surrounding spaces, so neighbouring words are not glued together and
# consecutive numbers are all matched.
standalone_number_re = re.compile(r'(?<!\S)\d+(?!\S)')


def clean_paper_text(raw_text):
    """Return `raw_text` lower-cased with stopwords, punctuation, lone letters
    and lone numbers removed, and whitespace collapsed to single spaces."""
    # Lower-case before the stopword test so capitalised stopwords are dropped too.
    tokens = [token.lower() for token in word_tokenize(raw_text)]
    cleaned = ' '.join(token for token in tokens if token not in stopword_set)
    cleaned = punctuation_re.sub('', cleaned)
    cleaned = single_letter_re.sub('', cleaned)
    cleaned = standalone_number_re.sub('', cleaned)
    return ' '.join(cleaned.split())


def normalize_word(word):
    """Reduce `word` with the same rule the search cell applies to query words:
    use the WordNet lemma when it ends in 'e', otherwise the Porter stem."""
    if word == "coronaviruses":
        word = "coronavirus"
    lemma = wnl.lemmatize(word)
    return lemma if lemma.endswith('e') else porter.stem(word)


# One cleaned string per paper.
clean_text = [clean_paper_text(val) for val in tqdm(full_text)]

# One stemmed/lemmatized string per paper; this is the corpus the tf-idf cell fits on.
stemmed_text = [
    ' '.join(normalize_word(word) for word in val.split())
    for val in clean_text
]

In [None]:
# this cell is no longer needed and is kept for reference

# Concatenate the stemmed bodies of all papers into one string.
text = ' '.join(stemmed_text)
# split() with no argument splits on any run of whitespace, so repeated,
# leading or trailing spaces do not produce empty-string "words" in the count.
s = text.split()
vocab_count = Counter(s)  # occurrences of each distinct word
most_occur = vocab_count.most_common(200)  # the 200 most frequent words
#print(most_occur) #print the most common 200 words

In [None]:
# Create a tf-idf matrix for the stemmed paper bodies.
#   sublinear_tf=True -> use 1 + log(tf) instead of the raw term frequency
#   min_df=0.05       -> ignore terms that appear in fewer than 5% of the documents
#   max_df=0.8        -> ignore terms that appear in more than 80% of the documents
v = TfidfVectorizer(sublinear_tf=True, min_df=0.05, max_df=0.8)
tfidf = v.fit_transform(stemmed_text)  # sparse matrix: one row per document, one column per term

# Show a summary instead of the whole vocabulary and the fully densified matrix,
# which floods the output and costs documents x terms memory.
print(tfidf.shape)
vocab_by_column = sorted(v.vocabulary_.items(), key=lambda item: item[1])
print(vocab_by_column[:20])   # first 20 (term, column index) pairs
print(tfidf[:5].toarray())    # densify only the first few rows

In [None]:
# Search engine: rank the papers against a free-text query using the tf-idf
# matrix built above and cosine similarity.
def normalize_query(raw_query):
    """Lower-case `raw_query`, split it on whitespace and stem/lemmatize each word.

    A word becomes its WordNet lemma when that lemma ends in 'e', otherwise its
    Porter stem. Returns the processed words joined by single spaces.
    """
    normalized = []
    for word in raw_query.lower().split():
        # The stem and lemmatize functions do not recognize "coronaviruses" as
        # the plural of "coronavirus", so map it by hand.
        if word == "coronaviruses":
            word = "coronavirus"
        lemma = wnl.lemmatize(word)
        normalized.append(lemma if lemma.endswith('e') else porter.stem(word))
    return ' '.join(normalized)


query = input("Enter your query: ")
# input() returns a string, so split the whole string; indexing it with [0]
# would keep only its first character.
words = query.lower().split()
print(words)
finalQuery = normalize_query(query)
query = [finalQuery]  # the vectorizer expects an iterable of documents
print(query)
query_tfidf = v.transform(query)
# One similarity score per paper, in the row order of `tfidf`.
cosineSimilarities = cosine_similarity(query_tfidf, tfidf).flatten()
print(cosineSimilarities)

In [None]:
# Result: the 50 most common words across all 1943 docs
#[('preprint', 42993), ('q', 29585), ('license', 20372), ('1', 18167), ('holder', 18100), ('medrxiv', 17564), ('author/funder', 17404), ('cases', 17021), ('peer-reviewed', 16697), ('https', 16223), ('data', 16221), ('covid-19', 15705), ('2', 14649), ('patients', 14258), ('doi', 14057), ('number', 13892), ('available', 13229), ('using', 11934), ('model', 11910), ('figure', 11428), ('display', 11002), ('granted', 10804), ('40', 10803), ('perpetuity', 10751), ('made', 10742), ('also', 10542), ('international', 10212), ('time', 9852), ('fig', 9747), ('3', 9574), ('used', 9344), ('cells', 9104), ('infection', 8833), ('study', 8712), ('sars-cov-2', 8633), ('r', 8195), ('5', 8153), ('this', 8143), ('virus', 8097), ('two', 7845), ('disease', 7820), ('may', 7774), ('s', 7772), ('rate', 7707), ('all', 7528), ('without', 7464), ('biorxiv', 7352), ('one', 7332), ('infected', 7203), ('10', 7201)]

In [None]:
#tf- idf
# # finding the tf-idf matrix for all the abstracts
# # vec = TfidfVectorizer()

# # ve = vec.fit_transform(clean_abs)

# # # displaying the tf-idf of the word in the abstact  
# # pd.DataFrame(ve.toarray(), columns=sorted(vec.vocabulary_.keys()))

# # #applying tf-idf cosine similarity and printing the result(testing)
# # query = ["prophylaxis"]
# # query_tfidf = vec.transform(query)
# # cosineSimilarities = cosine_similarity(query_tfidf, ve).flatten()
# # print(cosineSimilarities)

In [None]:
# taged_abs=[]
# nouns=[]
# # tagging the clean data
# for val in clean_abs:
#     taged_abs.append(pos_tag(val.split()))
# # extracting the nouns
# for i in range(len(taged_abs)):
#     doc=[]
#     for j in range(len(taged_abs[i])):
#         if(taged_abs[i][j][1]=='NN' or taged_abs[i][j][1]=='NNS'):
#             doc.append(taged_abs[i][j][0])
#     nouns.append(doc)
        
# print(nouns)



Above, we decided to calculate the tf-idf so that we can represent every word present in the abstract quantitatively. By doing so, we can use the results to model the topics of the abstract that we just quantified. Furthermore, we will use Non-negative Matrix Factorization (NMF) to come up with the topics that carry the most weight in the abstract. To accomplish this, we are going to filter all the nouns that are available in the abstract and use them to represent the different topics that are available.