In [26]:
import os
import json
import nltk
import pandas as pd 
from tqdm import tqdm
from nltk.corpus import stopwords #import the stopwords from the ntlk.corpus library
nltk.download('stopwords')
from nltk.tokenize import word_tokenize #import the word_tokenize method, which is used to turn sentences into words"
import re #import the regular expressions library
nltk.download('punkt')
from collections import Counter
from nltk.stem import PorterStemmer 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#directories = ["biorxiv_medrxiv","noncomm_use_subset","comm_use_subset"]
directories = ["biorxiv_medrxiv"]

In [3]:
papers = [] #create a list which will hold all research papers. Each research paper will have the following: title, abstract, and body
for directory in directories: #for each of the three folders carrying the json format of different research papers
    for file in tqdm(os.listdir(f"{directory}/{directory}/pdf_json")): #for every json file
        file_path=f"{directory}/{directory}/pdf_json/{file}" #set the file path to the file_path variable
        json_file=json.load(open(file_path,"rb")) #read the json file
        paper_title = json_file ['metadata']['title'] #get the paper title from that research paper
        try:#check if the paper has an abstract
            paper_abstract = json_file["abstract"][0].values() 
        except:#if the paper does not have an abstract 
            paper_abstract="" #the paper's abtract is set to an empty string
        paper_body="" #set the paper's body to an empty string initially
        for text in json_file["body_text"]:#for every text in the paper
            paper_body += text['text']+'\n\n'   #concatenate it to the variable which will hold the body of the paper 
        papers.append([paper_title,paper_abstract,paper_body]) #add the paper to the papers list

100%|██████████| 1934/1934 [00:28<00:00, 67.59it/s] 


In [4]:
df = pd.DataFrame(papers, columns = ['paper_title','paper_abstract','paper_body']) #created a new data frame which holds the three major cols for each paper
df.head()#check the first 5 papers in the dataframe

Unnamed: 0,paper_title,paper_abstract,paper_body
0,The RNA pseudoknots in foot-and-mouth disease ...,(word count: 194 22 Text word count: 5168 23 2...,"VP3, and VP0 (which is further processed to VP..."
1,Analysis Title: Regaining perspective on SARS-...,"(During the past three months, a new coronavir...","In December 2019, a novel coronavirus, SARS-Co..."
2,Healthcare-resource-adjusted vulnerabilities t...,,The 2019-nCoV epidemic has spread across China...
3,Relationship between Average Daily Temperature...,(The rapid outbreak of the new Coronavirus pan...,The outbreak of infectious diseases has always...
4,CHEER: hierarCHical taxonomic classification f...,(The fast accumulation of viral metagenomic da...,"Metagenomic sequencing, which allows us to dir..."


In [27]:
#Step 1: tokenize
#Step 2: remove stopwords
#Step 3: stem ""

#Therefore, to start with step 1, fetch the paper_body col and save it in a list
clean_text_bodies = [] #a list that will hold the bodies of all papers without stop words
papers_body = df['paper_body'].head()
for paperBody in tqdm(papers_body):
        body_tokens = word_tokenize(paperBody) #tokenize the words in the body of the paper
        paper_body_without_stopwords = [token for token in body_tokens if not token in stopwords.words('english')] #remove the stop words in the body and return a list
        clean_string = ' '.join(paper_body_without_stopwords) #convert the list into string
        clean_text_bodies.append(clean_string) #add the string to the array
#Now that all the bodies have been tokenized and stop words removed. A stemming process will be applied 
stemmed_text=[]   # a list that will hold all the stemmed text    
ps= PorterStemmer() 
for val in clean_text_bodies: #for every paper body which has been cleaned from stop words, stem it
    stemned_val="" # a string that will hold the new body of the paper after stemming
    for word in val: #loop on every word in the clean paper body
        stemned_val+=ps.stem(word)+" " #stem the word
    stemmed_text.append(stemned_val) #added it to the list which will hold all the stemmed values
text = ' '.join(stemmed_text)    
s=text.split(' ')
r = Counter(s)
most_occur = r.most_common(50) 
print(most_occur)

100%|██████████| 5/5 [00:13<00:00,  2.72s/it]


[(',', 666), ('.', 652), (')', 170), ('(', 166), ('The', 114), ('reads', 89), ('[', 74), (']', 74), ('preprint', 68), (':', 66), ('classification', 57), ('In', 55), ('data', 55), ('average', 54), ('species', 49), ('model', 48), ('cases', 47), ('daily', 47), ('different', 43), ('using', 43), ('rate', 42), ('temperature', 41), ('confirmed', 39), ('two', 37), ('number', 37), ('analysis', 36), ('1', 35), ('Fig', 34), ('one', 33), ('new', 33), ('license', 32), ('layer', 32), ('viral', 30), ('virus', 29), ('peer-reviewed', 29), ('available', 29), ('CHEER', 29), ('copyright', 28), ('holder', 28), ('level', 28), ('https', 27), ('epidemic', 27), ('phylogenetic', 27), ('medRxiv', 27), ('classifier', 27), ('author/funder', 26), ('3', 26), ('RNA', 26), ('tree', 26), ('training', 26)]


In [None]:
# incubation_df = df[df['full_text'].str.contains('incubation')]
# # print (incubation_df.head())
# texts = incubation_df['full_text'].values 
# for t in texts:
#    # print (t)
#     for sentence in t.split(". "):
#         if "antiviral" in sentence:
# #             arr_matches = re.findall(r" \d{1,2} day",sentence) #look for regex in sentence
#             arr_matches = re.findall(r"\w*-19\b",sentence)
#             if (len(arr_matches) ==1):
#                 print(arr_matches[0])
#                 print(sentence) 
#                 print()
#                 print()

In [None]:
#         for key in j:
#             print(key)
# print (j['metadata'])

In [None]:
# for k in j['metadata']:
#     print(k)