In [1]:
from IPython.display import display, HTML

# Inject a stylesheet that sets the width of the notebook, menubar and
# toolbar containers so wide tables and figures have more room.
wide_layout_css = """
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 95%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""
display(HTML(data=wide_layout_css))

In [None]:
import os
import sys
import time
import json; print('JSON version:', json.__version__)
import pandas as pd; print('Pandas version:', pd.__version__)
import numpy as np; print('Numpy version:', np.__version__)
from tqdm import tqdm; print('tqdm version:', pd.__version__)
import nltk; print('NLTK version:', nltk.__version__)
from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
# nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet') 
import string
import re; print('Regex version:', re.__version__)
import wordcloud; print('Wordcloud version:', wordcloud.__version__)
from wordcloud import WordCloud
from PIL import Image
import gensim; print('Gensim version:', gensim.__version__)
from gensim.models import Word2Vec
from collections import Counter
import matplotlib; print('Matplotlib version:', matplotlib.__version__)
import matplotlib.pyplot as plt
%matplotlib inline
print("Python:", sys.version)

file_dir = os.getcwd(); file_dir

In [None]:
# Print the dataset's metadata readme.
# os.path.join builds a separator-correct path on any OS; the former
# '\metadata.readme' literal held an invalid '\m' escape and only resolved on Windows.
with open(os.path.join(file_dir, 'metadata.readme'), 'r') as fm:
    data_meta = fm.read()
print(data_meta)

In [None]:
# Load metadata.csv from the working directory and preview its first rows.
metadata_path = "%s/metadata.csv" % file_dir
meta = pd.read_csv(metadata_path, low_memory=False)
meta.head()

In [None]:
# Walk file_dir recursively and keep the full path of every file whose name
# ends in ".json", in sorted order.
counter = 0
file_list = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(file_dir)
    for filename in filenames
    if filename.endswith(".json")
)
total_files = len(file_list); total_files

In [None]:
# Abstracts only. The later all_docs cell collects paper id, title and body
# text as well.

start = time.time()
docs = []

for file in tqdm(file_list):
    # Context manager closes each handle; a bare open() left one open per paper.
    with open(file, "rb") as fh:
        paper = json.load(fh)

    abstract = ""
    try:
        if paper['abstract']:
            # Concatenate the 'text' of every abstract entry, blank-line separated.
            for entry in paper['abstract']:
                abstract += entry['text'] + '\n\n'
    except KeyError:
        # A paper with no 'abstract' key contributes an empty string.
        pass

    # Each item is a one-element list; the cell that writes all_abstracts.txt
    # formats these items as-is.
    docs.append([abstract])

end = time.time()
print("Processed in %s seconds" % (end-start))

In [None]:
# Write one "%s"-formatted line per entry of docs to all_abstracts.txt.
with open('all_abstracts.txt', 'w', encoding="utf-8") as f:
    f.writelines("%s\n" % words for words in docs)

In [None]:
start = time.time()

with open('all_abstracts.txt', encoding="utf-8") as f, open('all_abstracts_tokens.txt', 'w', encoding="utf-8") as out_f:
    text = f.read().lower()
    # Remove word-character runs of length 1-3 together with any non-word run before them.
    short_words = re.compile(r'\W*\b\w{1,3}\b')
    text = short_words.sub('', text)
    tokens = word_tokenize(text)
    # Strip ASCII punctuation from each token, then keep purely alphabetic tokens.
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    # NLTK's English stop words plus corpus-specific boilerplate terms.
    # NOTE(review): 'righta' may be a typo for 'rights' -- confirm before changing.
    stop_words = stopwords.words('english')
    new_stop_words = ['preprint', 'copyright', 'holder', 'peerreviewed', 'authorfunder', 'license', 'medrxiv', 'biorxiv',
                     'righta', 'reuse', 'reserved', 'also', 'used', 'found', 'using', 'however']
    stop_words.extend(new_stop_words)
    # Set membership is constant-time; testing against the list rescanned it for every token.
    stop_word_set = set(stop_words)
    words = [w for w in words if w not in stop_word_set]
    new_text = ' '.join(words)
    out_f.write(new_text)

# Frequency plot of the 40 most common remaining words.
plt.figure(figsize=(16, 7))
fd = nltk.FreqDist(words)
fd.plot(40, title="40 Most Frequent Words", cumulative=False)

end = time.time()
print("Processed in %s seconds" % (end-start))

In [None]:
# Count adjacent word pairs in `words` and chart the 20 most frequent.
plt.figure(figsize=(14, 7))
bigrams = nltk.bigrams(words)
freq_bigrams = nltk.FreqDist(bigrams)
freq_bigrams.plot(20)

In [None]:
# Lemmatize every token of all_abstracts_tokens.txt and store the
# space-joined result in all_abstracts_lemmas.txt.
with open('all_abstracts_tokens.txt', encoding="utf-8") as f, open('all_abstracts_lemmas.txt', 'w', encoding="utf-8") as out_f:
    lemma = WordNetLemmatizer()
    text = f.read()
    tokens = word_tokenize(text)
    lemmed = list(map(lemma.lemmatize, tokens))
    new_lem_text = ' '.join(lemmed)
    out_f.write(new_lem_text)

In [None]:
start = time.time()

# Read through a context manager so the handle is closed right after loading;
# the bare open(...).read() never closed it.
with open('all_abstracts_lemmas.txt', 'rt', encoding="utf-8") as lemma_file:
    lemma_text = lemma_file.read()

# Word cloud over the lemmatized abstracts; random_state fixes the layout.
wc = WordCloud(max_font_size=200,
               width=2500,
               height=2000,
               max_words=4000,
               random_state=44,
               collocations=False,
               ).generate(lemma_text)

plt.figure(figsize=(32, 14))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("CORD abstracts", fontsize=20)

end = time.time()
print("Processed in %s seconds" % (end-start))

In [None]:
start = time.time()

# `word` captures one word; it is repeated n times on each side of the keyword,
# so every match yields 2*n captured context words.
n = 4
word = r'\W*([\w]+)'
# Consume the whole keyword token ('comorbid' plus any word-character suffix).
# With the earlier alternation the first alternative 'comorbid' always matched,
# so suffixes such as 'ities' were captured as the first following "word".
como_keyword = r'comorbid\w*'
text_search_como = re.findall(r'{}\W*{}{}'.format(word*n, como_keyword, word*n), lemma_text)

end = time.time()
print("Processed in %s seconds" % (end-start))

In [None]:
# Save the keyword context matches to CSV. The matches live in
# text_search_como; the name text_search was never defined and raised NameError.
df_text_search_como = pd.DataFrame(text_search_como)
df_text_search_como.to_csv('text_search_como.csv')

In [None]:
# Flatten the per-match tuples into one list, keeping words longer than three characters.
flatten_text_search = []
for sublist in text_search_como:
    flatten_text_search.extend(element for element in sublist if len(element) > 3)

In [None]:
# Number of context words kept after the length filter.
len(flatten_text_search)

In [None]:
# Tabulate every context word with its count, most frequent first, and print
# the table with pandas' row/column display limits lifted.
word_counts = Counter(flatten_text_search)
sorted_counts = pd.DataFrame(word_counts.most_common())
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(sorted_counts)

In [None]:
plt.rcParams.update({'figure.figsize':(24, 10), 'figure.dpi':300})

# most_common() already yields (word, count) pairs from most to least frequent,
# so the bars need no additional sorting.
counts = dict(Counter(flatten_text_search).most_common(150))
labels, values = zip(*counts.items())
labels = np.array(labels)
values = np.array(values)

indexes = np.arange(len(labels))
width = 0.35

plt.bar(indexes, values)

# Set tick positions, labels and rotation in one call so the rotation is
# applied to exactly the labels being installed.
plt.xticks(indexes + width * 0.05, labels, rotation=90)

In [None]:
# Same traversal as the abstracts-only cell, but also keeps paper id, title and body text.

start = time.time()
all_docs = []

for file in tqdm(file_list):
    # Context manager closes each handle; a bare open() left one open per paper.
    with open(file, "rb") as fh:
        paper = json.load(fh)
    paper_id = paper['paper_id']
    title = paper['metadata']['title']

    abstract = ""
    try:
        if paper['abstract']:
            # Concatenate the 'text' of every abstract entry, blank-line separated.
            for entry in paper['abstract']:
                abstract += entry['text'] + '\n\n'
    except KeyError:
        # A paper with no 'abstract' key contributes an empty abstract.
        pass

    # Body text is built the same way from the 'body_text' entries.
    all_bodytext = ""
    for section in paper['body_text']:
        all_bodytext += section['text'] + '\n\n'

    all_docs.append([paper_id, title, abstract, all_bodytext])

end = time.time()
print("Processed in %s seconds" % (end-start))

In [None]:
# Write one "%s"-formatted line per entry of all_docs to all_docs.txt.
with open('all_docs.txt', 'w', encoding="utf-8") as f:
    f.writelines("%s\n" % words for words in all_docs)

In [None]:
start = time.time()

with open('all_docs.txt', encoding="utf-8") as f, open('all_docs_tokens.txt', 'w', encoding="utf-8") as out_f:
    text = f.read().lower()
    # Remove word-character runs of length 1-3 together with any non-word run before them.
    short_words = re.compile(r'\W*\b\w{1,3}\b')
    text = short_words.sub('', text)
    tokens = word_tokenize(text)
    # Strip ASCII punctuation from each token, then keep purely alphabetic tokens.
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    # NLTK's English stop words plus corpus-specific boilerplate terms.
    # NOTE(review): 'righta' may be a typo for 'rights' -- confirm before changing.
    stop_words = stopwords.words('english')
    new_stop_words = ['preprint', 'copyright', 'holder', 'peerreviewed', 'authorfunder', 'license', 'medrxiv', 'biorxiv',
                     'righta', 'reuse', 'reserved', 'also', 'used', 'found', 'using', 'however']
    stop_words.extend(new_stop_words)
    # Set membership is constant-time; testing against the list rescanned it for every token.
    stop_word_set = set(stop_words)
    words = [w for w in words if w not in stop_word_set]
    new_text = ' '.join(words)
    out_f.write(new_text)

# Frequency plot of the 40 most common remaining words. The original call had
# a stray closing parenthesis that made the whole cell a SyntaxError.
plt.figure(figsize=(16, 7))
fd = nltk.FreqDist(words)
fd.plot(40, title="40 Most Frequent Words", cumulative=False)

end = time.time()
print("Processed in %s seconds" % (end-start))

In [None]:
# Count adjacent word pairs in `words` and chart the 40 most frequent.
plt.figure(figsize=(16, 7))
bigrams = nltk.bigrams(words)
freq_bigrams = nltk.FreqDist(bigrams)
freq_bigrams.plot(40)

In [None]:
# Lemmatize every token of all_docs_tokens.txt and store the space-joined
# result in all_docs_lemmas.txt.
with open('all_docs_tokens.txt', encoding="utf-8") as f, open('all_docs_lemmas.txt', 'w', encoding="utf-8") as out_f:
    lemma = WordNetLemmatizer()
    text = f.read()
    tokens = word_tokenize(text)
    lemmed = list(map(lemma.lemmatize, tokens))
    new_lem_text = ' '.join(lemmed)
    out_f.write(new_lem_text)

In [None]:
start = time.time()

# Read through a context manager so the handle is closed right after loading;
# the bare open(...).read() never closed it.
with open('all_docs_lemmas.txt', 'rt', encoding="utf-8") as lemma_file:
    lemma_text_all = lemma_file.read()

# Word cloud over the lemmatized full documents; random_state fixes the layout.
wc = WordCloud(max_font_size=200,
               width=2500,
               height=2000,
               max_words=4000,
               random_state=44,
               collocations=False,
               ).generate(lemma_text_all)

plt.figure(figsize=(32, 14))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("CORD documents", fontsize=20)

end = time.time()
print("Processed in %s seconds" % (end-start))

In [None]:
# `word` captures one word; it is repeated n times on each side of the keyword,
# so every match yields 2*n captured context words.
n = 4
word = r'\W*([\w]+)'
# Consume the whole keyword token ('comorbid' plus any word-character suffix).
# With the earlier alternation the first alternative 'comorbid' always matched,
# so suffixes such as 'ities' were captured as the first following "word".
como_keyword = r'comorbid\w*'
text_search_all = re.findall(r'{}\W*{}{}'.format(word*n, como_keyword, word*n), lemma_text_all)

In [None]:
# Flatten the match tuples, keeping words longer than three characters, then
# print every word with its count (most frequent first) with display limits lifted.
flatten_text_search_all = []
for sublist in text_search_all:
    flatten_text_search_all.extend(element for element in sublist if len(element) > 3)
word_counts_all = Counter(flatten_text_search_all)
sorted_counts = pd.DataFrame(word_counts_all.most_common())
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(sorted_counts)

In [None]:
plt.rcParams.update({'figure.figsize':(24, 10), 'figure.dpi':300})

# most_common() already yields (word, count) pairs from most to least frequent,
# so the bars need no additional sorting.
counts = dict(Counter(flatten_text_search_all).most_common(150))
labels, values = zip(*counts.items())
labels = np.array(labels)
values = np.array(values)

indexes = np.arange(len(labels))
width = 0.35

plt.bar(indexes, values)

# Set tick positions, labels and rotation in one call so the rotation is
# applied to exactly the labels being installed.
plt.xticks(indexes + width * 0.05, labels, rotation=90)

In [None]:
# Turn the collected per-paper rows into a DataFrame with named columns and
# print its first rows.
doc_columns = ['paper_id', 'title', 'abstract', 'all_bodytext']
df_all_docs = pd.DataFrame(all_docs, columns=doc_columns)
print(df_all_docs.head())

In [None]:
# Number of rows in df_all_docs.
len(df_all_docs)

In [None]:
# Keep papers whose abstract or body text contains 'comorbid' or 'Comorbid'.
# Those two prefixes already cover 'comorbidity'/'comorbidities' in either
# capitalization, so one pattern per column selects the same rows as the
# previous twelve separate scans.
como_regex = '[cC]omorbid'
has_como = (df_all_docs['abstract'].str.contains(como_regex)
            | df_all_docs['all_bodytext'].str.contains(como_regex))
df_risk = df_all_docs[has_como]

df_risk.head()

In [None]:
# Preview of the filtered papers (repeats the preview that ends the previous cell).
df_risk.head()

In [None]:
# Pull the abstract and body-text columns of the filtered papers out as arrays.
abstract_searched, bodytext_searched = (
    df_risk[column].values for column in ('abstract', 'all_bodytext')
)

In [None]:
start = time.time()

# Collect matching sentences in a list and build the frame once:
# DataFrame.append copied the whole frame on every call and no longer exists in pandas 2.0.
# NOTE(review): the substring test is case-sensitive, so a sentence containing
# only "Comorbid..." is skipped although the df_risk filter accepts that spelling -- confirm intended.
matched_sentences = [
    sentence
    for s in tqdm(abstract_searched)
    for sentence in s.split('. ')
    if "comorbid" in sentence
]
df_risk_sentences = pd.DataFrame(matched_sentences)

with pd.option_context('display.max_rows', None):  # show all risk sentences
    print(df_risk_sentences)

end = time.time()
print("Processed in %s seconds" % (end-start))

In [None]:
# Number of abstract sentences that contain "comorbid".
len(df_risk_sentences)

In [None]:
# Find the filtered papers whose abstract contains the quoted phrase and
# print them with the row display limit lifted.
abstract_hits = df_risk['abstract'].str.contains('It has been noted that elderly patients')
search_term = df_risk[abstract_hits]
with pd.option_context('display.max_rows', None):
    print(search_term)

In [None]:
paper_id = "179df1e769292dd113cef1b54b0b43213e6b5c97.json"

# Use the first file under file_dir whose name equals paper_id. Joining all hits
# into one string produced an unusable path when the file existed in several
# folders, and an empty path when it was missing.
file_select = next(
    (os.path.join(dirname, paper_id)
     for dirname, _, filenames in os.walk(file_dir)
     if paper_id in filenames),
    None,
)
if file_select is None:
    raise FileNotFoundError("%s not found under %s" % (paper_id, file_dir))

with open(file_select) as json_file:
    json_data = json.load(json_file)

# Print each abstract entry of the selected paper.
for excerpt in json_data['abstract']:
    print(excerpt)

In [None]:
# Collect matching body-text sentences in a list and build the frame once:
# DataFrame.append copied the whole frame on every call and no longer exists in pandas 2.0.
# NOTE(review): the substring test is case-sensitive ("Comorbid..." is skipped) -- confirm intended.
matched_sentences = [
    sentence
    for s in tqdm(bodytext_searched)
    for sentence in s.split('. ')
    if "comorbid" in sentence
]
df_risk_sentences_bodytext = pd.DataFrame(matched_sentences)

with pd.option_context('display.max_rows', None):  # show all risk sentences
    print(df_risk_sentences_bodytext)

In [None]:
# Number of body-text sentences currently held in df_risk_sentences_bodytext.
len(df_risk_sentences_bodytext)

In [None]:
start = time.time()

# Collect body-text sentences mentioning "risk factor" in a list and build the
# frame once: DataFrame.append copied the whole frame on every call and no
# longer exists in pandas 2.0.
matched_sentences = [
    sentence
    for s in tqdm(bodytext_searched)
    for sentence in s.split('. ')
    if "risk factor" in sentence
]
df_risk_sentences_bodytext = pd.DataFrame(matched_sentences)

# Write the CSV a single time; it used to be rewritten in full after every
# matching sentence. As before, no file is written when nothing matched.
if matched_sentences:
    df_risk_sentences_bodytext.to_csv("df_risk_sentences_alldocs.csv", encoding='utf-8', index=False)

with pd.option_context('display.max_rows', None):
    print(df_risk_sentences_bodytext)

end = time.time()
print("Processed in %s seconds" % (end-start))

In [None]:
# Find the filtered papers whose body text contains the quoted phrase and
# print them with the row display limit lifted.
bodytext_hits = df_risk['all_bodytext'].str.contains('The association between comorbidities and ALI')
search_term_bodytext = df_risk[bodytext_hits]
with pd.option_context('display.max_rows', None):
    print(search_term_bodytext)

In [None]:
paper_id = "061ffcdd4d674c4d7ce24e4aa7c5037c68596864.json"

# Use the first file under file_dir whose name equals paper_id. Joining all hits
# into one string produced an unusable path when the file existed in several
# folders, and an empty path when it was missing.
file_select = next(
    (os.path.join(dirname, paper_id)
     for dirname, _, filenames in os.walk(file_dir)
     if paper_id in filenames),
    None,
)
if file_select is None:
    raise FileNotFoundError("%s not found under %s" % (paper_id, file_dir))

with open(file_select) as json_file:
    json_data = json.load(json_file)

# Print each body-text entry of the selected paper.
for excerpt in json_data['body_text']:
    print(excerpt)

In [None]:
# Read the document dump from the same relative path the notebook writes it to
# ('all_docs.txt'); the former 'kaggle/working/' prefix pointed at a location
# no cell writes. The context manager also closes the handle.
with open('all_docs.txt', 'r', encoding='utf-8') as sample:
    s = sample.read()

# Replace newline characters with spaces
f = s.replace("\n", " ")

# One list of lower-cased word tokens per sentence of the text
data = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in tqdm(sent_tokenize(f))
]

In [None]:
# gensim 4.0 renamed Word2Vec's `size` argument to `vector_size`; pick the
# keyword from the installed major version so the cell runs on either API.
dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
# Skip-gram model (sg=1) with 100-dimensional vectors, window 5, min_count 1.
model_skipgram = gensim.models.Word2Vec(data, min_count=1, window=5, sg=1, **{dim_kwarg: 100})

In [None]:
# For each query term, list the 300 entries returned by the skip-gram model's most_similar.
query_terms = ['comorbidity']
similar_words_skipgram = {
    term: list(model_skipgram.wv.most_similar([term], topn=300))
    for term in query_terms
}
similar_words_skipgram

In [None]:
# Report the model's similarity score for the two words, rounded to two decimals.
sg_similarity = model_skipgram.wv.similarity('comorbidity', 'asthma')
print("Cosine similarity between 'comorbidity' and 'asthma' - SG : ", round(sg_similarity, 2))