In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

In [2]:
from covid.models.paperclassifier.paperclassifier import PaperClassifier

# Load the paper data

In [3]:
# Read the paper metadata CSV into a DataFrame
data_dir = '../data/'
data_f = 'metadata.csv'
df = pd.read_csv('{}/{}'.format(data_dir, data_f))

# Report the table dimensions, then preview the first rows
print(df.shape)
df.head()

(44220, 15)


Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file
0,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535.0,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,custom_license
1,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850.0,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,custom_license
2,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701.0,els-covid,,1980-03-31,"Friedman, Gary D",American Heart Journal,,,False,custom_license
3,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077.0,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,custom_license
4,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285.0,els-covid,Abstract Upper respiratory tract infections ar...,1985-06-28,"Garibaldi, Richard A.",The American Journal of Medicine,,,False,custom_license


In [4]:
# Optional sub-sampling switch for quick experiments.
# Set N_SAMPLES to an integer (e.g. 1000) to keep only the first rows;
# leave it as None to keep every row. The previous version assigned
# n = 1000 and immediately overwrote it with the full row count, so the
# full dataset was always used -- None reproduces that behaviour.
N_SAMPLES = None
n = df.shape[0] if N_SAMPLES is None else min(N_SAMPLES, df.shape[0])
df = df.head(n)

# Load the paper classifier

In [19]:
# Build the classifier from the interest.yaml file passed as km_path
pc = PaperClassifier(
    km_path='../covid/models/paperclassifier/interest.yaml',
)

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Knowledge map held by the classifier
km = pc.get_km()

# Unpack the information the classifier exposes about the knowledge map
(classes,
 subclasses,
 subclasses_kws,
 kws_all) = pc.get_km_info()

# Show the class names, the subclass names and the base keywords
print("Classes: {}".format(classes))
print("Subclasses: {}".format(list(subclasses.keys())))
print("Base keywords: {}".format(kws_all))

Classes: ['risk_factor', 'diagnostic', 'treatment_and_vaccine', 'outcome']
Subclasses: ['risk_factor_common_name', 'gender', 'age', 'disease_comorbidity', 'smoking', 'exercise', 'occupation', 'weather', 'diagnostic_common_name', 'symptom', 'imaging_diagnosis', 'clinical_diagnosis', 'genetic_diagnosis', 'treatment_and_vaccine_common_name', 'treatment', 'outcome_common_name', 'clinical_outcome']
Base keywords: ['risk factor', 'manlike', 'male', 'virile', 'manful', 'manly', 'male person', 'female', 'female person', 'distaff', 'sex', 'gender', 'grammatical gender', 'sexuality', 'age', 'disease comorbidity', 'comorbidity', 'smoking', 'smoke', 'tobacco', 'exercise', 'exercising', 'workout', 'occupation', 'employment status', 'weather', 'temperature', 'symptom', 'fever', 'dry cough', 'lose of smell', 'difficult to breath', 'hard to breath', 'shortness of breath', 'headaches', 'aches and pains', 'sore throat', 'fatigue', 'diarrhea', 'running nose', 'sneezing', 'cough', 'ct', 'ct imaging', 'res

# Classification
- There are four classes based on the interest.yaml:
    - risk_factor
    - diagnostic
    - treatment_and_vaccine
    - outcome
- Each class will have subclasses, e.g., risk_factor: [gender, age, ...]. Refer to interest.yaml for more information
- The classification will classify each paper into classes and subclasses

In [7]:
# Preprocess the dataframe text, then classify every paper
df_p = pc.classify_all(pc.preprocess(df))

# Check: dimensions before and after, followed by a short preview
print(df.shape, df_p.shape, sep='\n')
df_p.head(3)

HBox(children=(FloatProgress(value=0.0, max=35657.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35657.0), HTML(value='')))


(44220, 15)
(35657, 38)


Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,...,diagnostic_common_name,symptom,imaging_diagnosis,clinical_diagnosis,genetic_diagnosis,treatment_and_vaccine_common_name,treatment,outcome_common_name,clinical_outcome,keywords
0,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535.0,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",...,0,0,0,1,1,0,0,0,0,"virus infection,dna"
1,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077.0,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",...,0,0,0,0,0,0,0,0,0,female
2,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285.0,els-covid,Abstract Upper respiratory tract infections ar...,1985-06-28,"Garibaldi, Richard A.",...,0,0,0,0,0,0,0,0,1,"age,death"


### save

In [None]:
# Write the classified dataframe to a CSV file
rdir = '../data/'
fname = 'metadata_step1_paperclassified.csv'
df_p.to_csv('{}/{}'.format(rdir, fname))

### Return the papers that fit certain classes and subclasses

In [21]:
# Query the classified papers for the listed subclasses
df_tmp = pc.query_paper_by_class(
    df_p,
    ['symptom', 'genetic_diagnosis', 'clinical_diagnosis'],
)

# Size of the result, then a preview of its first rows
print(df_tmp.shape)
df_tmp.head()

(43, 38)


Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,...,diagnostic_common_name,symptom,imaging_diagnosis,clinical_diagnosis,genetic_diagnosis,treatment_and_vaccine_common_name,treatment,outcome_common_name,clinical_outcome,keywords
690,9f052309c5fe73c37790a81b6f9f6981b2e40fee,Elsevier,Detection of related positive-strand RNA virus...,10.1016/0168-1702(95)00093-3,,8837898.0,els-covid,Abstract A set of degenerate sense and antisen...,1995-12-31,"Chen, Zongyu; Plagemann, Peter G.W.",...,0,1,0,1,1,0,0,0,0,"fever,porcine reproductive and respiratory syn..."
1277,df2bfbfaa6b85b062822c22e7fdf9adb9b67a486,Elsevier,Arteriviruses,10.1016/B978-012374410-4.00537-9,,,els-covid,"The family Arteriviridae, which consists of fo...",2008-12-31,"Brinton, M.A.; Snijder, E.J.",...,0,1,0,1,1,0,0,0,0,"fever,porcine reproductive and respiratory syn..."
4775,548c8db18a49ac9cff5c210765be6849541fcd8c,Elsevier,Human infections with the emerging avian influ...,10.1016/S0140-6736(13)60903-4,,,els-covid,Summary Background Human infection with avian ...,2013-06-07,"Chen, Yu; Liang, Weifeng; Yang, Shigui; Wu, Na...",...,0,1,0,1,1,0,0,0,0,"age,fever,virus infection,rna,dna"
5117,0329e0f2e3b8ab0eabe2afe0e5214cc88c61080e,Elsevier,"Favipiravir, an anti-influenza drug against li...",10.1016/j.pharmthera.2020.107512,,,els-covid,Abstract Favipiravir has been developed as an ...,2020-02-22,"Shiraki, Kimiyasu; Daikoku, Tohru",...,0,1,0,1,1,1,0,0,0,"fever,virus infection,rna,treatment"
5581,04f29ac6c30bbd12e48aefe4078f8f1aa64c71bf,Elsevier,A multiplex RT-PCR assay for rapid and simulta...,10.1016/j.jviromet.2019.04.001,,30951787.0,els-covid,Abstract A multiplex reverse transcription pol...,2019-07-31,"Zhao, Yan; Liu, Feifei; Li, Qingmei; Wu, Mengf...",...,0,1,0,1,1,0,0,0,0,"fever,diarrhea,porcine reproductive and respir..."


### Analyze the classification
- Use Altair to plot some statistics

In [None]:
import altair as alt

# Disable Altair's maximum-row limit (max=5000 rows) so that charts can be
# built directly from the classified dataframe, which is larger than that
# in the recorded run (35657 rows).
alt.data_transformers.disable_max_rows()

In [None]:
# Vertical-concat container that carries the axis and title font settings
chart = (
    alt.vconcat(data=df_p)
    .configure_axis(labelFontSize=14, titleFontSize=14)
    .configure_title(fontSize=20)
)

# Bar chart counting the papers for each value of `covid_related`
achart = (
    alt.Chart(df_p)
    .mark_bar()
    .encode(x='covid_related', y='count()')
    .properties(
        title='Number of papers related to covid-19',
        width=300,
        height=300,
    )
)

# Append the bar chart to the container and display the result
chart = chart & achart
chart

In [None]:
# Sum the class indicator columns over the covid-related papers
df_tmp = (
    df_p.loc[df_p['covid_related'] == 1, classes]
    .sum()
    .to_frame(name='count')
)
df_tmp['class'] = df_tmp.index

# Container holding the shared font configuration
chart = (
    alt.vconcat(data=df_tmp)
    .configure_axis(labelFontSize=14, titleFontSize=14)
    .configure_title(fontSize=20)
)

# One bar per class, bar height taken from the summed column
achart = (
    alt.Chart(df_tmp)
    .mark_bar()
    .encode(x='class', y='count')
    .properties(
        title='Number of relevant papers in each class',
        width=300,
        height=300,
    )
)

# Append the bar chart to the container and display the result
chart = chart & achart
chart

In [None]:
# Count the covid-related papers in each subclass.
# `subclasses` is used as a mapping elsewhere in this notebook (its .keys()
# are listed when the knowledge map is printed). Recent pandas versions do not
# accept a dict as a column indexer, so select the columns with an explicit
# list of the subclass names instead of passing the mapping itself.
df_tmp = df_p.loc[df_p['covid_related']==1]
df_tmp = pd.DataFrame(df_tmp[list(subclasses.keys())].sum())
df_tmp.columns = ['count']
df_tmp['class'] = df_tmp.index

# plot: the vconcat container carries the shared axis/title font settings
chart = alt.vconcat(data=df_tmp).configure_axis(
    labelFontSize=14,
    titleFontSize=14
).configure_title(fontSize=20)

# one bar per subclass, bar height taken from the summed column
achart = alt.Chart(df_tmp).mark_bar().encode(
    x='class',
    y='count'
).properties(title='Number of relevant papers in each subclass',
             width=600, height=300)
chart &= achart
chart