In [1]:
import nltk
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re #import the regular expression library
import string
import os
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import load_files
from nltk import pos_tag
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
from nltk.corpus import stopwords #import the stopwords from the ntlk.corpus library
nltk.download('stopwords')
from nltk.tokenize import word_tokenize #import the word_tokenize method, which is used to turn sentences into words"
nltk.download('punkt')
from collections import Counter
from nltk.stem import PorterStemmer 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
FullPaper=[]
directories = ["biorxiv_medrxiv"]
for directory in directories: #for each of the three folders carrying the json format of different research papers
    for file in tqdm(os.listdir(f"{directory}/{directory}/pdf_json")): #for every json file
        file_path=f"{directory}/{directory}/pdf_json/{file}" #set the file path to the file_path variable 
        paper = json.load(open(file_path,"rb"))
        title = paper['metadata']['title'] 
        try:
            abstract = paper['abstract']
        except:
            abstarct=""                
        full_text=""     
        
        for text in paper['body_text']:
            full_text +=text['text'] +'\n\n' 
        FullPaper.append([title,abstract,full_text])
        
FullPaperDataframe=pd.DataFrame(FullPaper,columns=['title','abstract','full_text'])

100%|██████████| 1934/1934 [00:04<00:00, 399.18it/s]


> Our aim is to address these points:
* Capabilities to discover a therapeutic (not vaccine) for the disease, and clinical effectiveness studies to discover therapeutics, to include antiviral agents.

In [3]:
full_text = FullPaperDataframe['full_text'].head(10)
clean_text = [] #a list which will hold all the bodies of the papers after being stripped out of stopwords
ps=PorterStemmer() 
# punctuation_regex = string.punctuation
# punctuation_regex = punctuation_regex.replace("-", "") # keep the hyphens
# pattern = r"[{}]".format(punctuation_regex) # generate the regex pattern
pattern = """!"#$%&'()*+,.:;<=>?@[\]^_`{|}~""" #the pattern which will account for punctuation, and will later be used to remove them
#cleaning the data and removing stop words
for val in  tqdm(full_text):
        body_tokens = word_tokenize(val)
        paper_body_without_stopwords = [token for token in body_tokens if not token in stopwords.words('english')] #remove the stop words in the body and return a list
        clean_string = ' '.join(paper_body_without_stopwords).lower() #convert the list into string
        clean_string = re.sub(rf"[{pattern}]", '', clean_string)#remove punctuation except for hyphens
        clean_string = re.sub(r"\bthe\b", r"", clean_string) #remove the
        clean_string = re.sub(r'\bwe\b', '',clean_string ) #remove the pronoun we
        clean_string = re.sub(r'\bit\b', '',clean_string )#remove the pronoun it
        clean_string = re.sub(r'\bthey\b', '',clean_string )#remove the pronoun they
        clean_string = re.sub(r'\bcopyright\b', '',clean_string )#remove the word copyright
        clean_string = re.sub(r'\bhttps\b', '',clean_string )#remove the word https
        clean_string = re.sub(r'\bet\b', '',clean_string )#remove the word et
        clean_string = re.sub(r'\bal\b', '',clean_string )#remove the word al
        clean_string = re.sub(r'\bpreprint\b', '',clean_string )#remove the word preprint
        clean_string = re.sub(r'\bthis\b', '',clean_string )#remove the word this
        clean_string = re.sub(r'\bthese\b', '',clean_string )#remove the word these
        clean_string = re.sub(r'\balso\b', '',clean_string )#remove the word also
        clean_string = re.sub(r'\bin\b', '',clean_string )#remove the preposition in
        clean_string = re.sub(r'\bwithout\b', '',clean_string )#remove the word without
        clean_string = re.sub(r'\bfig\b', '',clean_string )#remove the word fig
        clean_string = re.sub(r'\bfigure\b', '', clean_string)#remove the word figure
        clean_string = re.sub(r'\b[a-zA-Z]\b', '', clean_string)#remove all the single letters        
        clean_string = re.sub(r'^\d+\s|\s\d+\s|\s\d+$', '', clean_string) #remove any digit that is not part of a word
        clean_text.append(clean_string) #add the string to the list 
stemmed_text=[] # a list which will contain the stemmed bodies of all docs      
for val in clean_text: #for the body (which was stripped out of stop words) of each paper
    stemmed_val="" #create an empty string
    words = val.split(' ') #split body of paper on spaces
    for word in words: #for each word in the body
        if(word!=""): #if the word is not an empty string
            stemmed_val+=ps.stem(word)+" " #stem theword and concat it to the string stemned_val
    stemmed_text.append(stemmed_val) #add the string stemmed_val to the list which will contain the stemmed bodies of all docs  

100%|██████████| 10/10 [00:17<00:00,  1.78s/it]


In [4]:
text = ' '.join(stemmed_text) #convert all the stemmed bodies into one string containing all the stemmed bodies of all papers 
s=text.split(' ')#split the combination of papers body on space
vocab_count = Counter(s)# count the number of vocab (unique words)
most_occur = vocab_count.most_common(200) #get the most common 200 words
print(most_occur) #print the most common 50 words

[('use', 167), ('read', 148), ('sequenc', 147), ('sampl', 128), ('data', 126), ('ibv', 83), ('case', 76), ('test', 74), ('licens', 74), ('differ', 73), ('model', 71), ('peer-review', 68), ('medrxiv', 68), ('holder', 67), ('covid-19', 67), ('viru', 65), ('analysi', 64), ('avail', 63), ('one', 60), ('author/fund', 60), ('averag', 60), ('classif', 60), ('studi', 59), ('result', 58), ('classifi', 57), ('rate', 55), ('two', 54), ('doi', 54), ('number', 52), ('speci', 50), ('confirm', 49), ('daili', 49), ('temperatur', 49), ('set', 48), ('show', 44), ('made', 44), ('new', 44), ('patient', 43), ('genom', 42), ('display', 42), ('allow', 41), ('all', 41), ('grant', 41), ('perpetu', 40), ('detect', 40), ('viral', 39), ('level', 39), ('includ', 39), ('net', 39), ('report', 38), ('pk', 38), ('isol', 37), ('signific', 37), ('compar', 36), ('thu', 36), ('order', 36), ('train', 36), ('base', 34), ('tree', 34), ('contain', 33), ('symptom', 33), ('cc-by-nc-ndintern', 33), ('tabl', 33), ('for', 33), ('r

In [5]:
# result of first common 50 words in all the 1943 docs
#[('preprint', 42993), ('q', 29585), ('license', 20372), ('1', 18167), ('holder', 18100), ('medrxiv', 17564), ('author/funder', 17404), ('cases', 17021), ('peer-reviewed', 16697), ('https', 16223), ('data', 16221), ('covid-19', 15705), ('2', 14649), ('patients', 14258), ('doi', 14057), ('number', 13892), ('available', 13229), ('using', 11934), ('model', 11910), ('figure', 11428), ('display', 11002), ('granted', 10804), ('40', 10803), ('perpetuity', 10751), ('made', 10742), ('also', 10542), ('international', 10212), ('time', 9852), ('fig', 9747), ('3', 9574), ('used', 9344), ('cells', 9104), ('infection', 8833), ('study', 8712), ('sars-cov-2', 8633), ('r', 8195), ('5', 8153), ('this', 8143), ('virus', 8097), ('two', 7845), ('disease', 7820), ('may', 7774), ('s', 7772), ('rate', 7707), ('all', 7528), ('without', 7464), ('biorxiv', 7352), ('one', 7332), ('infected', 7203), ('10', 7201)]

In [6]:
# a testing cell to test lines of code before adding them to our code
dd = re.sub(r'^\d+\s|\s\d+\s|\s\d+$', '', 'abc 12') 
print (dd)
print(ps.stem('include'))

abc
includ


In [7]:
#tf- idf
# # finding the tf-idf matrix for all the abstracts
# # vec = TfidfVectorizer()

# # ve = vec.fit_transform(clean_abs)

# # # displaying the tf-idf of the word in the abstact  
# # pd.DataFrame(ve.toarray(), columns=sorted(vec.vocabulary_.keys()))

# # #applying tf-idf cosine similarity and printing the result(testing)
# # query = ["prophylaxis"]
# # query_tfidf = vec.transform(query)
# # cosineSimilarities = cosine_similarity(query_tfidf, ve).flatten()
# # print(cosineSimilarities)

In [8]:
# taged_abs=[]
# nouns=[]
# # tagging the clean data
# for val in clean_abs:
#     taged_abs.append(pos_tag(val.split()))
# # extracting the nouns
# for i in range(len(taged_abs)):
#     doc=[]
#     for j in range(len(taged_abs[i])):
#         if(taged_abs[i][j][1]=='NN' or taged_abs[i][j][1]=='NNS'):
#             doc.append(taged_abs[i][j][0])
#     nouns.append(doc)
        
# print(nouns)



Above we dcided to calculate the tf-idf so that we can represent every word that is present in the abstract quatitavily. By doing so we can further use the results in order model the topic according to the abstract that we just quatified.Furthermore we will use (Non-negative Matrix Factorization) NMF in order to come up with topic's that carry most weight in the abstract. To accomplish this we are going to filter all the nouns that are avaliable in the abstract and use them to represent the different topis that are avaliable.