This is a Minimal Working Example of text mining on the dataset offered by the <a href="https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge">COVID-19 Open Research Dataset Challenge (CORD-19)</a>, with the goal of finding literature related to COVID-19 diagnostic tests.

Below is a demo of a program that searches the literature data for PCR-related keywords and lists the publications with the most keyword matches.

Imports

In [4]:
import os
from collections import OrderedDict
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

Functions

In [32]:
def find_kw(kw, text):
	"""
	Return how many times the keyword 'kw' occurs in 'text'.

	The count is delegated to text.count(kw), so the match is case-sensitive:
	pass 'kw' in lower case when 'text' has been lowered.
	"""
	return text.count(kw)

def prepare_stopwords():
	"""
	Prepare the set of English stop words.

	Rebinds the module-level name 'stopwords' (initially the corpus reader
	imported from nltk.corpus) to a plain set of English stop words, which is
	what count_words() tests membership against.

	The function is idempotent: if 'stopwords' is already a set (e.g. the cell
	was run a second time), it is left untouched instead of failing with
	AttributeError because a set has no 'words' method.
	"""
	global stopwords

	# Already converted by a previous call: nothing to do
	if isinstance(stopwords, (set, frozenset)):
		return

	stopwords = set(stopwords.words('english'))

def count_words(text):
	"""
	Count the words of a piece of text (a string).

	The text is lowered, tokenized with word_tokenize(), and tokens found in
	the module-level 'stopwords' collection are discarded.

	Returns an OrderedDict mapping each remaining token to its number of
	occurrences. Keys are inserted in sorted order, so the result is
	deterministic from run to run.
	"""

	# Lower the text and tokenize it
	tokens = word_tokenize(text.lower())

	# Single pass over the sorted tokens: gives sorted key order and avoids
	# re-scanning the whole token list once per distinct word
	counts = OrderedDict()
	for token in sorted(tokens):
		if token not in stopwords:
			counts[token] = counts.get(token, 0) + 1

	return counts

def search_kw(keywords, min_freq=0, data=None):
	"""
	Search for keywords in the abstracts of a metadata table.

	Every abstract is converted to word counts with count_words(); the counts
	of all keywords are summed per 'cord_uid'.

	Parameters
	----------
	keywords : iterable of str
		Tokens to look for. They are looked up as-is among the keys returned
		by count_words(), which lowers the text, so they should be lower case.
	min_freq : int, optional
		Minimum total number of keyword occurrences for a paper to be kept.
	data : pandas.DataFrame, optional
		Table providing 'cord_uid' and 'abstract' columns. When omitted, the
		notebook-level 'metadata' frame is used.

	Returns
	-------
	pandas.DataFrame
		Columns 'cord_uid' and 'frequency', sorted by 'frequency' in
		descending order with a fresh 0..n-1 index. Papers without any
		keyword occurrence are not listed.
	"""

	# Fall back to the notebook-level frame so existing calls keep working
	if data is None:
		data = metadata

	# Materialize once: keywords is iterated again for every abstract
	keywords = list(keywords)

	totals = {}

	# Iterate over all abstracts
	for cord_uid, abstract in zip(data.cord_uid, data.abstract):
		# Get the words in the abstract
		words = count_words(abstract)
		# Total number of keyword occurrences in this abstract
		n = sum(words.get(kw, 0) for kw in keywords)
		# Only papers with at least one occurrence are recorded
		if n:
			totals[cord_uid] = totals.get(cord_uid, 0) + n

	# Totals to DataFrame
	occurences = pd.DataFrame({'cord_uid': list(totals.keys()), 'frequency': list(totals.values())})

	# Filter using min_freq
	occurences = occurences[occurences.frequency >= min_freq]

	# Sort by frequency in descending order; a new frame is returned rather than
	# sorting the filtered selection in place
	return occurences.sort_values(by='frequency', ascending=False, ignore_index=True)

In [8]:
# Folder locations, resolved against the current working directory
cwd = os.getcwd()
folders = dict(corddata=os.path.join(cwd, 'corddata'))

In [9]:
# Prepare the stopwords list: this rebinds the global name `stopwords`
# (imported from nltk.corpus) to a plain set of English stop words,
# which count_words() then uses to filter tokens.
prepare_stopwords()

In [30]:
# Load the CORD-19 metadata table
# Keep every 10th row only, so the demo runs on a small fraction of the data
metadata_csv = os.path.join(folders['corddata'], 'metadata.csv')
metadata = pd.read_csv(metadata_csv).loc[::10]

In [31]:
# Number of rows currently held in `metadata` (the every-10th-row sample read above)
print('Full metadata file:', len(metadata), sep='\n')

Full metadata file:
19572


In [33]:
# Filter null abstracts
# A new frame is assigned instead of mutating in place, so the cell can be re-run safely
metadata = metadata.dropna(subset=['abstract'])
print('After removing null abstracts:')
print(metadata.shape[0])

# Filter null texts: keep rows that reference at least one full-text file (PDF or PMC JSON)
has_full_text = ~(pd.isna(metadata.pdf_json_files) & pd.isna(metadata.pmc_json_files))
# drop=True discards the old index instead of inserting it as an 'index' column;
# without it, re-running this cell adds 'level_0' and then fails with ValueError
metadata = metadata.loc[has_full_text].reset_index(drop=True)
print('After removing null full texts:')
print(metadata.shape[0])

After removing null abstracts:
13943
After removing null full texts:
6967


Keyword Selection

In [34]:
# Keywords to search for (lower-case tokens related to PCR testing)
kw = [
    'rt',
    'pcr',
    'polymerase',
    'chain',
]

Keyword Search

In [35]:
# Rank the papers by total keyword count, keeping those with at least two occurrences
occurences = search_kw(keywords=kw, min_freq=2)
occurences.head(5)

Unnamed: 0,cord_uid,frequency
0,qyb8er14,14
1,1fv5k1h6,12
2,iexd5qn5,11
3,5xtc2odp,9
4,44jyy79k,9


Show the papers with the highest frequency of the keywords

In [36]:
# Attach the paper details to each ranked cord_uid, keeping the ranking order
results = occurences.merge(metadata, how='left', on='cord_uid')
results = results.loc[:, ['frequency', 'title', 'publish_time', 'url', 'abstract']]

# Show everything except the (long) abstract
results.loc[:, ['frequency', 'title', 'publish_time', 'url']].head()

Unnamed: 0,frequency,title,publish_time,url
0,14,Two Multiplex Real-Time PCR Assays to Detect a...,2016-07-08,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
1,12,Development of polymerase chain reaction-based...,2017-01-21,https://www.ncbi.nlm.nih.gov/pubmed/28574020/;...
2,11,"Alternative divalent cations (Zn(2+), Co(2+), ...",2015-05-03,https://doi.org/10.1186/s12858-015-0041-x; htt...
3,9,Characterizing the Qatar advanced-phase SARS-C...,2020-07-19,http://medrxiv.org/cgi/content/short/2020.07.1...
4,9,Hybridization Chain Reactions Targeting the Se...,2020-05-01,https://doi.org/10.3390/ijms21093216; https://...


In [42]:
# Print the untruncated URLs of the five top-ranked papers
for link in results['url'].head(5):
    print(link)

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4938629/
https://www.ncbi.nlm.nih.gov/pubmed/28574020/; https://doi.org/10.4103/ijmr.ijmr_1447_15
https://doi.org/10.1186/s12858-015-0041-x; https://www.ncbi.nlm.nih.gov/pubmed/25934642/
http://medrxiv.org/cgi/content/short/2020.07.16.20155317v1?rss=1
https://doi.org/10.3390/ijms21093216; https://www.ncbi.nlm.nih.gov/pubmed/32370065/
