***Import Packages***

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json
import os
import matplotlib.pyplot as plt
plt.style.use('ggplot')

***Load Dataset***

In [2]:
biorxiv_dir = '/kaggle/input/CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json'
filenames = os.listdir(biorxiv_dir)
print("Number of articles retrieved from biorxiv:", len(filenames))

Number of articles retrieved from biorxiv: 1342


***Helper Functions***

In [3]:
def load_files(dirname):
    filenames = os.listdir(dirname)
    raw_files = []

    for filename in tqdm(filenames):
        filename = dirname + filename
        file = json.load(open(filename, 'rb'))
        raw_files.append(file)
    
    return raw_files

def generate_clean_df(all_files):
    cleaned_files = []
    
    for file in tqdm(all_files):
        features = [
            file['paper_id'],
            file['metadata']['title'],
            format_authors(file['metadata']['authors']),
            format_authors(file['metadata']['authors'], 
                           with_affiliation=True),
            format_body(file['abstract']),
            format_body(file['body_text']),
            format_bib(file['bib_entries']),
            file['metadata']['authors'],
            file['bib_entries']
        ]

        cleaned_files.append(features)

    col_names = ['paper_id', 'title', 'authors',
                 'affiliations', 'abstract', 'text', 
                 'bibliography','raw_authors','raw_bibliography']

    clean_df = pd.DataFrame(cleaned_files, columns=col_names)
    clean_df.head()
    
    return clean_df

***Helper Functions***

In [4]:
def format_name(author):
    middle_name = " ".join(author['middle'])
    
    if author['middle']:
        return " ".join([author['first'], middle_name, author['last']])
    else:
        return " ".join([author['first'], author['last']])


def format_affiliation(affiliation):
    text = []
    location = affiliation.get('location')
    if location:
        text.extend(list(affiliation['location'].values()))
    
    institution = affiliation.get('institution')
    if institution:
        text = [institution] + text
    return ", ".join(text)

def format_authors(authors, with_affiliation=False):
    name_ls = []
    
    for author in authors:
        name = format_name(author)
        if with_affiliation:
            affiliation = format_affiliation(author['affiliation'])
            if affiliation:
                name_ls.append(f"{name} ({affiliation})")
            else:
                name_ls.append(name)
        else:
            name_ls.append(name)
    
    return ", ".join(name_ls)

def format_body(body_text):
    texts = [(di['section'], di['text']) for di in body_text]
    texts_di = {di['section']: "" for di in body_text}
    
    for section, text in texts:
        texts_di[section] += text

    body = ""

    for section, text in texts_di.items():
        body += section
        body += "\n\n"
        body += text
        body += "\n\n"
    
    return body

def format_bib(bibs):
    if type(bibs) == dict:
        bibs = list(bibs.values())
    bibs = deepcopy(bibs)
    formatted = []
    
    for bib in bibs:
        bib['authors'] = format_authors(
            bib['authors'], 
            with_affiliation=False
        )
        formatted_ls = [str(bib[k]) for k in ['title', 'authors', 'venue', 'year']]
        formatted.append(", ".join(formatted_ls))

    return "; ".join(formatted)

In [5]:
all_files = []

for filename in filenames:
    filename = biorxiv_dir +"/"+ filename
    file =json.load(open(filename, 'rb'))
    all_files.append(file)

    



In [6]:
def format_name(author):
    middle_name = " ".join(author['middle'])
    
    if author['middle']:
        return " ".join([author['first'], middle_name, author['last']])
    else:
        return " ".join([author['first'], author['last']])


def format_affiliation(affiliation):
    text = []
    location = affiliation.get('location')
    if location:
        text.extend(list(affiliation['location'].values()))
    
    institution = affiliation.get('institution')
    if institution:
        text = [institution] + text
    return ", ".join(text)

def format_authors(authors, with_affiliation=False):
    name_ls = []
    
    for author in authors:
        name = format_name(author)
        if with_affiliation:
            affiliation = format_affiliation(author['affiliation'])
            if affiliation:
                name_ls.append(f"{name} ({affiliation})")
            else:
                name_ls.append(name)
        else:
            name_ls.append(name)
    
    return ", ".join(name_ls)

def format_body(body_text):
    texts = [(di['section'], di['text']) for di in body_text]
    texts_di = {di['section']: "" for di in body_text}
    
    for section, text in texts:
        texts_di[section] += text

    body = ""

    for section, text in texts_di.items():
        body += section
        body += "\n\n"
        body += text
        body += "\n\n"
    
    return body


In [7]:
from tqdm.notebook import tqdm

cleaned_files = []
    
for file in tqdm(all_files):
    features = [
        file['paper_id'],
        file['metadata']['title'],
        format_authors(file['metadata']['authors']),
        format_authors(file['metadata']['authors'], 
                       with_affiliation=True),
        format_body(file['abstract']),
        format_body(file['body_text']),

    ]

    cleaned_files.append(features)

col_names = ['paper_id', 'title', 'authors',
             'affiliations', 'abstract', 'text',]

df = pd.DataFrame(cleaned_files, columns=col_names)
df.head()

HBox(children=(FloatProgress(value=0.0, max=1342.0), HTML(value='')))




Unnamed: 0,paper_id,title,authors,affiliations,abstract,text
0,4602afcb8d95ebd9da583124384fd74299d20f5b,SPINT2 inhibits proteases involved in activati...,"Marco R Straus, Jonathan T Kinder, Michal Sega...","Marco R Straus, Jonathan T Kinder (University ...",Abstract\n\nViruses possessing class I fusion ...,Introduction 9\n\nInfluenza-like illnesses (IL...
1,90b5ecf991032f3918ad43b252e17d1171b4ea63,The role of absolute humidity on transmission ...,"Wei Luo, Maimuna S Majumder, Diambo Liu, Canel...","Wei Luo (Boston Children's Hospital, 02215, Bo...",,"Introduction\n\nSince December 2019, an increa..."
2,c8a9e49416051620631005def6ea86c91cf86bc9,FEP-based screening prompts drug repositioning...,"Zhe Li, 1# , Xin Li, Yi-You Huang, Yaoxing Wu,...","Zhe Li (Sun Yat-Sen University, 510006, Guangz...","Abstract\n\nThe new coronavirus COVID-19, also...",Introduction\n\nThe novel coronavirus 2019-nCo...
3,d3c2e2839498c613ee95739dce7052109750362c,Long-Term Persistence of IgG Antibodies in SAR...,"Xiaoqin Guo, Zhongmin Guo, Chaohui Duan, Zelia...","Xiaoqin Guo (Sun Yat-sen University, 510080, G...",Abstract\n\n23 BACKGROUND 24 The ongoing world...,\n\nCC-BY-ND 4.0 International license It is m...
4,bbd9d63dc2c733c763770f62205ef9adeceb0127,Effects of temperature variation and humidity ...,"Yueling Ma, Yadong Zhao, Jiangtao Liu, Xiaotao...","Yueling Ma (Lanzhou University, 730000, Lanzho...",Abstract\n\nObject Meteorological parameters a...,"Introduction\n\nIn December 2019, a novel coro..."


In [8]:
df['abstract_word_count'] = df['abstract'].apply(lambda x: len(x.strip().split()))
df['body_word_count'] = df['text'].apply(lambda x: len(x.strip().split()))
df.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,abstract_word_count,body_word_count
0,4602afcb8d95ebd9da583124384fd74299d20f5b,SPINT2 inhibits proteases involved in activati...,"Marco R Straus, Jonathan T Kinder, Michal Sega...","Marco R Straus, Jonathan T Kinder (University ...",Abstract\n\nViruses possessing class I fusion ...,Introduction 9\n\nInfluenza-like illnesses (IL...,76,6116
1,90b5ecf991032f3918ad43b252e17d1171b4ea63,The role of absolute humidity on transmission ...,"Wei Luo, Maimuna S Majumder, Diambo Liu, Canel...","Wei Luo (Boston Children's Hospital, 02215, Bo...",,"Introduction\n\nSince December 2019, an increa...",0,1511
2,c8a9e49416051620631005def6ea86c91cf86bc9,FEP-based screening prompts drug repositioning...,"Zhe Li, 1# , Xin Li, Yi-You Huang, Yaoxing Wu,...","Zhe Li (Sun Yat-Sen University, 510006, Guangz...","Abstract\n\nThe new coronavirus COVID-19, also...",Introduction\n\nThe novel coronavirus 2019-nCo...,172,4749
3,d3c2e2839498c613ee95739dce7052109750362c,Long-Term Persistence of IgG Antibodies in SAR...,"Xiaoqin Guo, Zhongmin Guo, Chaohui Duan, Zelia...","Xiaoqin Guo (Sun Yat-sen University, 510080, G...",Abstract\n\n23 BACKGROUND 24 The ongoing world...,\n\nCC-BY-ND 4.0 International license It is m...,302,1004
4,bbd9d63dc2c733c763770f62205ef9adeceb0127,Effects of temperature variation and humidity ...,"Yueling Ma, Yadong Zhao, Jiangtao Liu, Xiaotao...","Yueling Ma (Lanzhou University, 730000, Lanzho...",Abstract\n\nObject Meteorological parameters a...,"Introduction\n\nIn December 2019, a novel coro...",145,2437


***Drop dublicates data from text and abstract data***

In [9]:
df.drop_duplicates(['abstract', 'text'], inplace=True)

In [10]:
df.head(5)

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,abstract_word_count,body_word_count
0,4602afcb8d95ebd9da583124384fd74299d20f5b,SPINT2 inhibits proteases involved in activati...,"Marco R Straus, Jonathan T Kinder, Michal Sega...","Marco R Straus, Jonathan T Kinder (University ...",Abstract\n\nViruses possessing class I fusion ...,Introduction 9\n\nInfluenza-like illnesses (IL...,76,6116
1,90b5ecf991032f3918ad43b252e17d1171b4ea63,The role of absolute humidity on transmission ...,"Wei Luo, Maimuna S Majumder, Diambo Liu, Canel...","Wei Luo (Boston Children's Hospital, 02215, Bo...",,"Introduction\n\nSince December 2019, an increa...",0,1511
2,c8a9e49416051620631005def6ea86c91cf86bc9,FEP-based screening prompts drug repositioning...,"Zhe Li, 1# , Xin Li, Yi-You Huang, Yaoxing Wu,...","Zhe Li (Sun Yat-Sen University, 510006, Guangz...","Abstract\n\nThe new coronavirus COVID-19, also...",Introduction\n\nThe novel coronavirus 2019-nCo...,172,4749
3,d3c2e2839498c613ee95739dce7052109750362c,Long-Term Persistence of IgG Antibodies in SAR...,"Xiaoqin Guo, Zhongmin Guo, Chaohui Duan, Zelia...","Xiaoqin Guo (Sun Yat-sen University, 510080, G...",Abstract\n\n23 BACKGROUND 24 The ongoing world...,\n\nCC-BY-ND 4.0 International license It is m...,302,1004
4,bbd9d63dc2c733c763770f62205ef9adeceb0127,Effects of temperature variation and humidity ...,"Yueling Ma, Yadong Zhao, Jiangtao Liu, Xiaotao...","Yueling Ma (Lanzhou University, 730000, Lanzho...",Abstract\n\nObject Meteorological parameters a...,"Introduction\n\nIn December 2019, a novel coro...",145,2437


In [11]:
df.drop_duplicates(['title','abstract'], inplace=True)
df.shape

(1322, 8)

***Preprocessing***


In [None]:
#df = df.head(10000)

In [12]:
import re

df['text'] = df['text'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
df['abstract'] = df['abstract'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
df['text'] = df['text'].apply(lambda x: re.sub('\n\n',' ',x))
df['abstract'] = df['abstract'].apply(lambda x: re.sub('\n\n',' ',x))
df['text'] = df['text'].apply(lambda x: re.sub('\d+', '',x))
df['abstract'] = df['abstract'].apply(lambda x: re.sub('\d+', '',x))


In [13]:

df['text'] = df['text'].apply(lambda x: x.lower())
df['abstract'] = df['abstract'].apply(lambda x: x.lower())
df.head(5)

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,abstract_word_count,body_word_count
0,4602afcb8d95ebd9da583124384fd74299d20f5b,SPINT2 inhibits proteases involved in activati...,"Marco R Straus, Jonathan T Kinder, Michal Sega...","Marco R Straus, Jonathan T Kinder (University ...",abstract viruses possessing class i fusion pro...,introduction influenzalike illnesses ilis rep...,76,6116
1,90b5ecf991032f3918ad43b252e17d1171b4ea63,The role of absolute humidity on transmission ...,"Wei Luo, Maimuna S Majumder, Diambo Liu, Canel...","Wei Luo (Boston Children's Hospital, 02215, Bo...",,introduction since december an increasing num...,0,1511
2,c8a9e49416051620631005def6ea86c91cf86bc9,FEP-based screening prompts drug repositioning...,"Zhe Li, 1# , Xin Li, Yi-You Huang, Yaoxing Wu,...","Zhe Li (Sun Yat-Sen University, 510006, Guangz...",abstract the new coronavirus covid also known ...,introduction the novel coronavirus ncov also k...,172,4749
3,d3c2e2839498c613ee95739dce7052109750362c,Long-Term Persistence of IgG Antibodies in SAR...,"Xiaoqin Guo, Zhongmin Guo, Chaohui Duan, Zelia...","Xiaoqin Guo (Sun Yat-sen University, 510080, G...",abstract background the ongoing worldwide ou...,ccbynd international license it is made avai...,302,1004
4,bbd9d63dc2c733c763770f62205ef9adeceb0127,Effects of temperature variation and humidity ...,"Yueling Ma, Yadong Zhao, Jiangtao Liu, Xiaotao...","Yueling Ma (Lanzhou University, 730000, Lanzho...",abstract object meteorological parameters are ...,introduction in december a novel coronavirus ...,145,2437


In [14]:
text = df.drop(["paper_id", "abstract", "abstract_word_count", "body_word_count", "authors", "title", "affiliations"], axis=1)

In [15]:
text.head(5)

Unnamed: 0,text
0,introduction influenzalike illnesses ilis rep...
1,introduction since december an increasing num...
2,introduction the novel coronavirus ncov also k...
3,ccbynd international license it is made avai...
4,introduction in december a novel coronavirus ...


In [16]:
docs = []
for x in range(0,len(text)):
    docs.append(str(text.iloc[x]['text']))

In [17]:
print(docs[5])

 arrays of regularly spaced nucleosomes dominate chromatin and are often phased by alignment to reference sites like active promoters how the distances between nucleosomes spacing and between phasing sites and nucleosomes are determined remains unclear and specifically how atp dependent chromatin remodelers impact these features here we used genomewide reconstitution to probe how saccharomyces cerevisiae atp dependent remodelers generate phased arrays of regularly spaced nucleosomes we find that remodelers bear a functional element named the ruler that determines spacing and phasing in a remodelerspecific way we use structurebased mutagenesis to identify and tune the ruler element residing in the nhp and arp modules of the ino remodeler complex generally we propose that a remodeler ruler regulates nucleosome sliding direction bias in response to epigenetic information this finally conceptualizes how remodelermediated nucleosome dynamics determine stable steadystate nucleosome positioni

***Remove Stopwords***

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer


with open('../input/stopwords/englishStopwords.txt', 'r') as f:
    myLists = [line.strip() for line in f]

               
vectorizer = TfidfVectorizer(stop_words=myLists)
X = vectorizer.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


In [19]:
from sklearn.model_selection import train_test_split

# test set size of 20% of the data and the random seed 42 <3
X_train, X_test = train_test_split(X.toarray(), test_size=0.2, random_state=42)

print("X_train size:", len(X_train))
print("X_test size:", len(X_test), "\n")

X_train size: 1057
X_test size: 265 



In [20]:
from sklearn.cluster import KMeans

k = 10
kmeans = KMeans(n_clusters=k, n_jobs=4, verbose=10)
y_pred = kmeans.fit_predict(X_train)

In [21]:
y_train = y_pred

In [22]:
y_test = kmeans.predict(X_test)

In [23]:
outerlist = []
while len(outerlist) < k:
    outerlist.append([])

In [24]:
for x in docs:
    Y = vectorizer.transform([x])
    prediction = kmeans.predict(Y)
    outerlist[int(prediction)].append(x)

order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()


index = 0


for iter in outerlist:
    print("DOCUMENTS GROUP %d" % index)
    print(iter[:1], sep=', ')
    print(" ")
    
    print("-----------------")


    print("GROUP DESCRIPTIVE KEYWORDS" )
    for ind in order_centroids[index, :10]:
        print(' %s' % terms[ind]),
    index = index + 1
    print("-----------------")



DOCUMENTS GROUP 0
['introduction the coronavirus disease  covid caused by severe acute respiratory syndrome coronavirus  sarscov is emerging as a major infectious disease epidemic globally initially detected in a cluster of patients with unexplained pneumonia in wuhan hubei province of china in early december  sarscov rapidly spread not only within china but also around the globe within just three months as of mar    confirmed cases and  deaths have been reported in  countries   although the natural reservoir of sarscov is still unknown early confirmed cases are strongly associated with exposures to wild animals in the huanan seafood wholesale market   and sustained humantohuman transmission is observed particularly among close contacts   due to the surging number and rapid spread of covid world health organization who has increased the risk assessment of covid to very high at the global level on feb  as one of the most developed and commercialized cities in china shenzhen is the large

In [None]:
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
    
print(model_name, ":\n")
print("Accuracy Score: ", '{:,.3f}'.format(float(accuracy_score(y_test, y_pred)) * 100), "%")
print("F1 score: ", '{:,.3f}'.format(float(f1_score(test, pred, average='micro')) * 100), "%")