In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# pd.options.display.max_colwidth = 500
from tqdm import tqdm

from multiprocessing import cpu_count, Pool #for multiprocessing data
cores = cpu_count()
import re

import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

from sklearn.feature_extraction.text import TfidfVectorizer 

## Import Data into DataFrame object

In [2]:
# Load the paper metadata CSV into a DataFrame.
data = pd.read_csv('all_sources_metadata_2020-03-13.csv')

In [3]:
# Only rows that actually have an abstract are kept.
data = data.dropna(subset=['abstract'])
data.head(2) #check out what it looks like

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
2,210a892deb1c61577f6fba58505fd65356ce6636,CZI,Incubation Period and Other Epidemiological Ch...,10.3390/jcm9020538,,,cc-by,The geographic spread of 2019 novel coronaviru...,2020,"Linton, M. Natalie; Kobayashi, Tetsuro; Yang, ...",Journal of Clinical Medicine,3006065000.0,#1043,True
3,e3b40cc8e0e137c416b4a2273a4dca94ae8178cc,CZI,Characteristics of and Public Health Responses...,10.3390/jcm9020575,,32093211.0,cc-by,"In December 2019, cases of unidentified pneumo...",2020,"Deng, Sheng-Qun; Peng, Hong-Juan",J Clin Med,177663100.0,#1999,True


### To build the TF-IDF matrix, we'll use only the abstract and the title of each paper.
We'll also keep its index in the original dataframe.

In [4]:
# Keep only the two text columns and carry the original row label along as a
# regular column. It is named 'indice' to avoid conflicts with .index.
abstracts_df = (
    data[['abstract', 'title']]
    .reset_index(level=0)
    .rename(columns={'index': 'indice', 'abstract': 'content'})
)
abstracts_df.head(2)

Unnamed: 0,indice,content,title
0,2,The geographic spread of 2019 novel coronaviru...,Incubation Period and Other Epidemiological Ch...
1,3,"In December 2019, cases of unidentified pneumo...",Characteristics of and Public Health Responses...


Here we define the functions we'll use to clean the data frame. We also add a simple multiprocessing helper to speed things up a bit.

In [5]:
def remove_sw(l):
    """Return the words of `l`, in their original order, that are not in `stopwords`."""
    kept = []
    for word in l:
        if word in stopwords:
            continue
        kept.append(word)
    return kept

def clean(df):
    """Tokenize the 'content' column and drop rows that end up with no tokens.

    Each text is lower-cased, apostrophes are removed, every character other
    than a-z and space is replaced by a space, and the result is split on
    whitespace.

    Two columns are added to `df`:
      * 'content_clean'     - list of tokens
      * 'content_clean_nsw' - the same tokens filtered through `remove_sw`

    `df` is modified in place (columns added, token-less rows dropped) and is
    also returned. Values that are not strings (e.g. NaN) are treated as empty
    text, so their rows are dropped instead of raising.
    """
    # Raw string: writing this pattern as a normal string with '\ ' relies on
    # an invalid escape sequence, which newer Python versions warn about.
    non_letters = re.compile(r'[^a-z ]')

    def tokenize(text):
        # One pass per text instead of four separate .apply() sweeps.
        if not isinstance(text, str):
            return []
        lowered = text.lower().replace("'", '')
        return non_letters.sub(' ', lowered).split()

    df['content_clean'] = df['content'].apply(tokenize)
    df['content_clean_nsw'] = df['content_clean'].apply(remove_sw)
    # Texts with no letters at all yield an empty token list; remove those rows.
    df.drop(df[df['content_clean'].str.len() == 0].index, inplace=True)
    return df

def clean_title(df):
    """Tokenize the 'title' column and drop rows that end up with no tokens.

    Each title is lower-cased and apostrophes are removed; t.co short links and
    stand-alone laughter tokens ("haha", "lol", ...) are stripped; every
    character other than a-z and space is replaced by a space; the result is
    split on whitespace.

    Two columns are added to `df`:
      * 'title_clean'     - list of tokens
      * 'title_clean_nsw' - the same tokens filtered through `remove_sw`

    `df` is modified in place (columns added, token-less rows dropped) and is
    also returned. Titles that are not strings (e.g. NaN) are treated as empty
    text, so their rows are dropped instead of raising AttributeError.
    """
    # Raw strings avoid invalid escape sequences; the dot in "t.co" is escaped
    # so it only matches a literal dot.
    short_link = re.compile(r'https://t\.co/.{10}')
    # Anchored on word boundaries so that only whole tokens are removed.
    # Unanchored, the same pattern eats the tail of words such as
    # "propranolol" (-> "propran"). The text is already lower-cased here, so
    # no upper-case alternative is needed.
    laughter = re.compile(r'\b(?:a*ha+h[ha]*|o?l+o+l+[ol]*)\b')
    non_letters = re.compile(r'[^a-z ]')

    def tokenize(title):
        if not isinstance(title, str):
            return []
        text = title.lower().replace("'", '')
        # Links and laughter are removed before punctuation is blanked out,
        # because the link pattern needs the ':' and '/' characters.
        text = short_link.sub('', text)
        text = laughter.sub('', text)
        return non_letters.sub(' ', text).split()

    df['title_clean'] = df['title'].apply(tokenize)
    df['title_clean_nsw'] = df['title_clean'].apply(remove_sw)
    # Titles with no letters at all yield an empty token list; remove those rows.
    df.drop(df[df['title_clean'].str.len() == 0].index, inplace=True)
    return df

def parallel_df_process(df, func, n_cores=cores):
    """Apply `func` to row-wise chunks of `df` in worker processes and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process; it is split by row position into contiguous chunks.
    func : callable
        Receives one DataFrame chunk and returns a DataFrame. It is handed to
        `Pool.map`, so it has to be importable/picklable by the workers.
    n_cores : int
        Upper bound on the number of worker processes and chunks
        (defaults to the module-level `cores`).

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in chunk order.

    Raises
    ------
    ValueError
        If `n_cores` is smaller than 1.
    """
    if n_cores < 1:
        raise ValueError("Number of processes must be at least 1")
    # Never create more chunks than rows: surplus chunks would be empty frames
    # that still cost a worker round-trip.
    n_chunks = max(1, min(n_cores, len(df)))
    # Split row positions instead of the frame itself (np.array_split on a
    # DataFrame is deprecated in recent pandas releases).
    row_groups = np.array_split(np.arange(len(df)), n_chunks)
    df_split = [df.iloc[rows] for rows in row_groups]
    # The context manager shuts the pool down even when `func` raises, so a
    # failed run does not leave worker processes behind.
    with Pool(n_chunks) as pool:
        df_joined = pd.concat(pool.map(func, df_split))
    return df_joined

In [6]:
# Run both cleaning passes chunk-wise in worker processes: abstract text first, then titles.
abstracts_df = parallel_df_process(abstracts_df,clean)
abstracts_df = parallel_df_process(abstracts_df,clean_title)
abstracts_df.head(2)

Unnamed: 0,indice,content,title,content_clean,content_clean_nsw,title_clean,title_clean_nsw
0,2,The geographic spread of 2019 novel coronaviru...,Incubation Period and Other Epidemiological Ch...,"[the, geographic, spread, of, novel, coronavir...","[geographic, spread, novel, coronavirus, covid...","[incubation, period, and, other, epidemiologic...","[incubation, period, epidemiological, characte..."
1,3,"In December 2019, cases of unidentified pneumo...",Characteristics of and Public Health Responses...,"[in, december, cases, of, unidentified, pneumo...","[december, cases, unidentified, pneumonia, his...","[characteristics, of, and, public, health, res...","[characteristics, public, health, responses, c..."


Let's take a few samples to see what our cleaned data looks like. 

In [7]:
# Pull the stop-word-free token lists out of the frame and print one of them as a spot check.
abstracts = abstracts_df['content_clean_nsw'].to_list()
print(abstracts[1231])  # NOTE(review): 1231 looks like an arbitrary sample position; confirm

['background', 'sparse', 'data', 'whether', 'non', 'pharmaceutical', 'interventions', 'reduce', 'spread', 'influenza', 'implemented', 'study', 'feasibility', 'efficacy', 'face', 'masks', 'hand', 'hygiene', 'reduce', 'influenza', 'transmission', 'among', 'hong', 'kong', 'household', 'members', 'methodology', 'principal', 'findings', 'conducted', 'cluster', 'randomized', 'controlled', 'trial', 'households', 'composed', 'least', 'members', 'index', 'subject', 'presented', 'influenza', 'like', 'illness', 'hours', 'duration', 'influenza', 'confirmed', 'index', 'case', 'quickvue', 'influenza', 'b', 'rapid', 'test', 'household', 'index', 'subject', 'randomized', 'control', 'surgical', 'face', 'masks', 'hand', 'hygiene', 'households', 'visited', 'within', 'hours', 'days', 'later', 'nose', 'throat', 'swabs', 'collected', 'index', 'subjects', 'household', 'contacts', 'home', 'visit', 'tested', 'viral', 'culture', 'primary', 'outcome', 'measure', 'laboratory', 'culture', 'confirmed', 'influenza',

### Now that the data is relatively clean, we'll use sklearn's implementation of TF-IDF because it's super fast.

(Important to note: TfidfVectorizer takes entire strings as input, so we'll have to merge each list of words into a string.)

In [8]:
# Build the strings from the dataframe column rather than from `abstracts`
# itself: the cell is then safe to re-run, whereas rebinding `abstracts` from
# its own previous value would, on a second run, join the characters of the
# already-joined strings.
abstracts = [' '.join(tokens) for tokens in abstracts_df['content_clean_nsw']]

In [9]:
# scikit-learn >= 1.2 validates `stop_words` and only accepts 'english', a list
# or None, so the stop-word set is passed as a (sorted, hence deterministic)
# list. Older releases accept a list as well.
tfidf_vectorizer = TfidfVectorizer(use_idf=True, stop_words=sorted(stopwords))  # instantiate our TFIDF vectorizer object
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(abstracts)  # fit our data

In [10]:
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`. Support both, and keep `words_list` a plain list
# of Python strings so downstream keyword tuples look the same either way.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    words_list = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    words_list = tfidf_vectorizer.get_feature_names()

### Finally, we can determine the most important words in an abstract

In [11]:
# The original (uncleaned) abstract texts, as a plain list.
docs_raw = abstracts_df['content'].to_list()

In [12]:
#credit to Kavita Ganesan from https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/#.XnRgdi2B2u4 for these functions
def sort_coo(coo_matrix):
    """Return (column, value) pairs of `coo_matrix`, highest value first.

    Pairs are built from the matrix's `.col` and `.data` attributes. Ties on
    the value are broken by column, larger column first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first `topn` entries of `sorted_items` to {feature name: score}.

    `sorted_items` is a sequence of (feature index, score) pairs. Each index is
    looked up in `feature_names` and each score is rounded to three decimals.
    The dict keeps the order of `sorted_items`.
    """
    return {
        feature_names[feature_idx]: round(weight, 3)
        for feature_idx, weight in sorted_items[:topn]
    }

In [13]:
doc = docs_raw[0]

# Score the document against the fitted vocabulary, then rank its terms from
# the highest to the lowest tf-idf score.
tf_idf_vector = tfidf_vectorizer.transform([doc])
sorted_items = sort_coo(tf_idf_vector.tocoo())

# Keep the ten best-scoring terms.
keywords = extract_topn_from_vector(words_list, sorted_items, 10)

# Show the document followed by its keywords and their scores.
print("\n=====Doc=====")
print(doc)

print("\n===Keywords===")
for term, score in keywords.items():
    print(term, score)


=====Doc=====
The geographic spread of 2019 novel coronavirus (COVID-19) infections from the epicenter of Wuhan, China, has provided an opportunity to study the natural history of the recently emerged virus. Using publicly available event-date data from the ongoing epidemic, the present study investigated the incubation period and other time intervals that govern the epidemiological dynamics of COVID-19 infections. Our results show that the incubation period falls within the range of 2&ndash;14 days with 95% confidence and has a mean of around 5 days when approximated using the best-fit lognormal distribution. The mean time from illness onset to hospital admission (for treatment and/or isolation) was estimated at 3&ndash;4 days without truncation and at 5&ndash;9 days when right truncated. Based on the 95th percentile estimate of the incubation period, we recommend that the length of quarantine should be at least 14 days. The median time delay of 13 days from illness onset to death (1

In [14]:
#converting the above code to a function, we have
def get_keywords(doc, topn=10):
    """Return the `topn` highest-scoring tf-idf terms of `doc` as (term, score) tuples.

    The document is transformed with the module-level `tfidf_vectorizer`, its
    non-zero entries are ranked with `sort_coo`, and the top entries are named
    via `words_list`. Scores are rounded by `extract_topn_from_vector`.

    NOTE(review): `doc` is handed to the vectorizer as-is, while the vectorizer
    in this notebook is fitted on cleaned, stop-word-free text. Confirm that
    raw text is the intended input here.
    """
    doc_vector = tfidf_vectorizer.transform([doc])
    ranked_entries = sort_coo(doc_vector.tocoo())
    top_terms = extract_topn_from_vector(words_list, ranked_entries, topn)
    return list(top_terms.items())

### Now we can use this model to tag the abstracts in the original dataframe.

In [15]:
def add_keywords_df(df):
    """Add a 'keywords' column holding `get_keywords` output for each row's 'abstract'.

    The column is written onto `df` itself, which is also the return value.
    """
    df['keywords'] = df['abstract'].apply(get_keywords)
    return df

In [16]:
# Compute the keywords chunk-wise in worker processes and rebind `data` to the combined result.
data = parallel_df_process(data, add_keywords_df)

In [17]:
# Preview the first rows, now including the 'keywords' column.
data.head()

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,keywords
2,210a892deb1c61577f6fba58505fd65356ce6636,CZI,Incubation Period and Other Epidemiological Ch...,10.3390/jcm9020538,,,cc-by,The geographic spread of 2019 novel coronaviru...,2020,"Linton, M. Natalie; Kobayashi, Tetsuro; Yang, ...",Journal of Clinical Medicine,3006065000.0,#1043,True,"[(ndash, 0.397), (days, 0.392), (incubation, 0..."
3,e3b40cc8e0e137c416b4a2273a4dca94ae8178cc,CZI,Characteristics of and Public Health Responses...,10.3390/jcm9020575,,32093211.0,cc-by,"In December 2019, cases of unidentified pneumo...",2020,"Deng, Sheng-Qun; Peng, Hong-Juan",J Clin Med,177663100.0,#1999,True,"[(fatality, 0.266), (cases, 0.263), (covid, 0...."
5,0df0d5270a9399cf4e23c0cdd877a80616a9725e,CZI,An updated estimation of the risk of transmiss...,10.1016/j.idm.2020.02.001,,,cc-by-nc-nd,The basic reproduction number of an infectious...,2020,"Tang, Biao; Bragazzi, Nicola Luigi; Li, Qian; ...",Infectious Disease Modelling,3006029000.0,#729,True,"[(reproduction, 0.317), (peak, 0.224), (number..."
6,f24242580be243d5fc3f432915d86af6854bb8b7,CZI,Real-time forecasts of the 2019-nCoV epidemic ...,10.1016/j.idm.2020.02.002,,,cc-by-nc-nd,The initial cluster of severe pneumonia cases ...,2020,"Roosa, K.; Lee, Y.; Luo, R.; Kirpich, A.; Roth...",Infectious Disease Modelling,3006029000.0,#865,True,"[(hubei, 0.375), (february, 0.253), (cumulativ..."
8,e1b336d8be1a4c0ccc5a1bf41e48b3b004d3ece1,CZI,COVID-19 outbreak on the Diamond Princess crui...,10.1093/jtm/taaa030,,,cc-by-nc,Cruise ships carry a large number of people in...,2020,"Rocklöv, J.; Sjödin, H.; Wilder-Smith, A.",Journal of Travel Medicine,3006304000.0,#2926,True,"[(passengers, 0.435), (crew, 0.326), (february..."


In [18]:
# Write the tagged metadata next to the input file, without the row index.
# NOTE(review): 'keywords' holds Python lists of tuples, which CSV can only
# store as text; confirm downstream readers parse that column back.
data.to_csv('all_sources_metadata_2020-03-13_with_keywords.csv', index = False)
# `data` is not used by the remaining cells, so drop the reference.
del data

### We'll also include the TFIDF vectors in the abstracts dataframe so we can use it in the next notebook where we'll do some topic modeling.

In [19]:
def add_vectors_to_df(df):
    """Add a 'vector' column with the dense tf-idf row of each 'content' text.

    Every text is transformed on its own with the module-level
    `tfidf_vectorizer`. The column is written onto `df` itself, which is also
    the return value.
    """
    def dense_tfidf_row(text):
        # transform() gets a one-document list, so the dense result has a
        # single row; [0] picks that row.
        return tfidf_vectorizer.transform([text]).toarray()[0]

    df['vector'] = df['content'].apply(dense_tfidf_row)
    return df

In [20]:
# Attach the per-document tf-idf vectors chunk-wise in worker processes.
abstracts_df = parallel_df_process(abstracts_df, add_vectors_to_df)
abstracts_df.head()

Unnamed: 0,indice,content,title,content_clean,content_clean_nsw,title_clean,title_clean_nsw,vector
0,2,The geographic spread of 2019 novel coronaviru...,Incubation Period and Other Epidemiological Ch...,"[the, geographic, spread, of, novel, coronavir...","[geographic, spread, novel, coronavirus, covid...","[incubation, period, and, other, epidemiologic...","[incubation, period, epidemiological, characte...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,3,"In December 2019, cases of unidentified pneumo...",Characteristics of and Public Health Responses...,"[in, december, cases, of, unidentified, pneumo...","[december, cases, unidentified, pneumonia, his...","[characteristics, of, and, public, health, res...","[characteristics, public, health, responses, c...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,5,The basic reproduction number of an infectious...,An updated estimation of the risk of transmiss...,"[the, basic, reproduction, number, of, an, inf...","[basic, reproduction, number, infectious, agen...","[an, updated, estimation, of, the, risk, of, t...","[updated, estimation, risk, transmission, nove...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,6,The initial cluster of severe pneumonia cases ...,Real-time forecasts of the 2019-nCoV epidemic ...,"[the, initial, cluster, of, severe, pneumonia,...","[initial, cluster, severe, pneumonia, cases, t...","[real, time, forecasts, of, the, ncov, epidemi...","[real, time, forecasts, ncov, epidemic, china,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,8,Cruise ships carry a large number of people in...,COVID-19 outbreak on the Diamond Princess crui...,"[cruise, ships, carry, a, large, number, of, p...","[cruise, ships, carry, large, number, people, ...","[covid, outbreak, on, the, diamond, princess, ...","[covid, outbreak, diamond, princess, cruise, s...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [21]:
# Persist the frame (including the 'vector' column) for the follow-up notebook.
# NOTE(review): every row carries a dense array from .toarray(), so the pickle
# may be very large; confirm the size is acceptable.
abstracts_df.to_pickle('abstracts_dataframe')

In [22]:
# Preview the first rows of the frame that was just pickled.
abstracts_df.head()

Unnamed: 0,indice,content,title,content_clean,content_clean_nsw,title_clean,title_clean_nsw,vector
0,2,The geographic spread of 2019 novel coronaviru...,Incubation Period and Other Epidemiological Ch...,"[the, geographic, spread, of, novel, coronavir...","[geographic, spread, novel, coronavirus, covid...","[incubation, period, and, other, epidemiologic...","[incubation, period, epidemiological, characte...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,3,"In December 2019, cases of unidentified pneumo...",Characteristics of and Public Health Responses...,"[in, december, cases, of, unidentified, pneumo...","[december, cases, unidentified, pneumonia, his...","[characteristics, of, and, public, health, res...","[characteristics, public, health, responses, c...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,5,The basic reproduction number of an infectious...,An updated estimation of the risk of transmiss...,"[the, basic, reproduction, number, of, an, inf...","[basic, reproduction, number, infectious, agen...","[an, updated, estimation, of, the, risk, of, t...","[updated, estimation, risk, transmission, nove...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,6,The initial cluster of severe pneumonia cases ...,Real-time forecasts of the 2019-nCoV epidemic ...,"[the, initial, cluster, of, severe, pneumonia,...","[initial, cluster, severe, pneumonia, cases, t...","[real, time, forecasts, of, the, ncov, epidemi...","[real, time, forecasts, ncov, epidemic, china,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,8,Cruise ships carry a large number of people in...,COVID-19 outbreak on the Diamond Princess crui...,"[cruise, ships, carry, a, large, number, of, p...","[cruise, ships, carry, large, number, people, ...","[covid, outbreak, on, the, diamond, princess, ...","[covid, outbreak, diamond, princess, cruise, s...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
