# Visualizing the Relations
- 11/26/20 Visualize the relations using Plotly. This notebook is based on main_kch_relation_extraction-vis

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from covid.models.relation.extraction import RelationExtractor

import ast
import altair as alt
import pandas as pd
import numpy as np
import yaml

# Disable Altair's maximum-row check so that large DataFrames can be passed to charts.
alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

# Load the YAML class-subclass-keyword data structure
- 11/29/20 This is for Panayoitis to create the UI search bar for users.
- The data structure will look something like this:
    - kidney_disease
        - keyword1
        - keyword2
        - etc
    - risk_factor
        - keyword1
        - keyword2
        - etc
    - treatment_and_vaccine
        - keyword1
        - keyword2
        - etc

In [17]:
# Path to the YAML file holding the class -> subclass -> keyword definitions.
data_path = '../covid/models/paperclassifier/Davids_interest_meshed.yaml'

# Decode the file as UTF-8 explicitly so parsing does not depend on the
# platform's default text encoding.
# NOTE(review): yaml.load with FullLoader is acceptable for this repo-local file;
# yaml.safe_load is the more conservative choice if the file could ever come
# from an untrusted source.
with open(data_path, encoding='utf-8') as f:
    data_yml = yaml.load(f, Loader=yaml.FullLoader)

# Reorganize the information: map each top-level class to the keyword list
# stored under its '<class>_common_name' -> 'kw' entry.
# A comprehension is used so no loop variable is left behind in the kernel namespace.
classes = ['kidney_disease', 'risk_factor', 'treatment_and_vaccine']
data_class_subclass = {c: data_yml[c]['%s_common_name' % c]['kw'] for c in classes}

data_class_subclass

{'kidney_disease': ['ckd',
  'renal insufficiency, chronic',
  'chronic renal insufficiencies',
  'renal insufficiencies, chronic',
  'chronic renal insufficiency',
  'kidney insufficiency, chronic',
  'chronic kidney insufficiency',
  'chronic kidney insufficiencies',
  'kidney insufficiencies, chronic',
  'chronic kidney diseases',
  'chronic kidney disease',
  'disease, chronic kidney',
  'diseases, chronic kidney',
  'kidney disease, chronic',
  'kidney diseases, chronic',
  'chronic renal diseases',
  'chronic renal disease',
  'disease, chronic renal',
  'diseases, chronic renal',
  'renal disease, chronic',
  'renal diseases, chronic',
  'aki',
  'acute kidney injury',
  'acute kidney injuries',
  'kidney injuries, acute',
  'kidney injury, acute',
  'acute renal injury',
  'acute renal injuries',
  'renal injuries, acute',
  'renal injury, acute',
  'renal insufficiency, acute',
  'acute renal insufficiencies',
  'renal insufficiencies, acute',
  'acute renal insufficiency',
  

# Read the relation data file

In [21]:
# Read the classified-paper table that carries the extracted relations,
# then preview its first three rows.
df_r = pd.read_csv(
    filepath_or_buffer='../data/paperclassifier/classified_merged_covid_relation.csv',
)
df_r.head(n=3)

Unnamed: 0.1,Unnamed: 0,sha,title,abstract,publish_time,affiliations,location,text,risk_factor,diagnostic,...,clinical_diagnosis,genetic_diagnosis,treatment_and_vaccine_common_name,treatment,outcome_common_name,clinical_outcome,keywords,affiliations_country,location_country,relations
0,0,0104f6ceccf92ae8567a0102f89cbb976969a774,Association of HLA class I with severe acute r...,BACKGROUND: The human leukocyte antigen (HLA) ...,2003-09-12,,,,0,1,...,0,0,0,0,0,0,fever,,,"[('coronavirus', 'fever', ('part of', 0.206526..."
1,1,5b68a553a7cbbea13472721cd1ad617d42b40c26,A double epidemic model for the SARS propagation,BACKGROUND: An epidemic of a Severe Acute Resp...,2003-09-10,,,,1,0,...,0,0,0,0,0,0,weather,,,"[('coronavirus', 'weather', ('part of', 0.7285..."
2,2,3ed670f60a7be2e3e2a991ea8af1fdd5fa5e2b2c,Cloaked similarity between HIV-1 and SARS-CoV ...,BACKGROUND: Severe acute respiratory syndrome ...,2003-09-21,,,,0,0,...,0,0,0,0,0,0,0,,,


## Visualization

As of 11/26/20, I am planning to generate multiple plots to help users visually understand what the relation data table represents. Here are a couple of figures to be plotted:

1. (Not a plot) Filter the data
    - Select meaningful relationships and filter out the rest; possibly keep only a few interpretable relationships
        - "has part" = "part of" = "coronavirus is related to ..."
        - (10/27/20 ABORT) "said to be the same as" = "instance of" = "coronavirus is ..."
    - Keep only the papers published after the COVID outbreak, i.e., from February 2020 onward
2. Plot X over time, by month (since the COVID outbreak). X can be:
    - the number of occurrences of a relationship (e.g., part of) with an entity (e.g., RNA)
3. Plot the summary count of different relationships with respect to the entity.


In [35]:
from covid.models.relation import vis as rvis
import altair as alt
import plotly.express as px


# Preprocess df
- This is the overall preprocessing step

In [30]:
# data preparation
# Run the preprocessing helper from covid.models.relation.vis on the raw relation table.
# The plotting cells below read the 'publish_time', 'probability' and 'keyword'
# columns from the resulting frame.
df_new = rvis.preprocess_df(df_r)

## Overview plot
Plot the identified relationships over time: each point is one relation, placed by its paper's publish time (x) and its probability (y).



In [31]:
# =======================================================
# Plotly
# =======================================================
# One marker per row of df_new: publish time on x, relation probability on y.
# The row's keyword is used as the hover title.
fig = px.scatter(
    df_new,
    x='publish_time',
    y='probability',
    hover_name=df_new['keyword'],
)

# Title, axis labels and a larger monospace font for readability.
fig.update_layout(
    title='Strength of discovered relationship along paper publication month',
    xaxis_title="Publish Time",
    yaxis_title="Probability",
    font={"family": "Courier New, monospace", "size": 18},
)
fig.show()

## keyword-specific plot
- Choose a particular keyword
- Each relation will have a line plot
- Aggregated probability per month

In [33]:
import plotly.graph_objects as go

# =======================================================
# Data Preparation
# =======================================================
# Keyword whose trend we want to inspect.
kw = 'fever'
# NOTE: `relations` is not referenced by the code below in this cell.
relations = ['is related to']
df_grps = rvis.preproces_for_kwspecific_plot(df_new, kw=kw)

# =======================================================
# Plotly
# =======================================================
# Plot the 'proba_mean' column against 'publish_month', using 'proba_stderr'
# as the size of the vertical error bars.
fig = go.Figure(
    data=[
        go.Scatter(
            x=df_grps['publish_month'],
            y=df_grps['proba_mean'],
            error_y={
                "type": 'data',  # error-bar values are given in data coordinates
                "array": df_grps['proba_stderr'],
                "visible": True,
            },
        )
    ]
)

# Fix the y axis to [0, 1] and label the figure with the chosen keyword.
fig.update_layout(
    yaxis={"range": [0, 1]},
    title="coronavirus - '%s' relationship" % kw,
    xaxis_title="Month",
    yaxis_title="Strength",
    font={"family": "Courier New, monospace", "size": 18},
)
fig.show()

2020-11-29 13:45:13,722 - numexpr.utils - INFO - Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-11-29 13:45:13,722 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.


## Get top keywords for each relationship 

In [34]:
# relations = ['is related to', 'is']

# # loop
# for r in relations:
#     df_sg = df_new.loc[df_new['relation'] == r]
#     df_sg = df_sg.groupby('keyword')['probability'].mean().sort_values(ascending=False).to_frame()
#     df_sg.columns = ['strength']
    
#     # print
#     print('The relationship is:', r)
#     print(df_sg.head(10))
#     print()


## Overview plot for several interesting keyword relationships
- Look at the strength of the relationships over time
- https://altair-viz.github.io/gallery/natural_disasters.html

In [36]:
# =======================================================
# Data Preparation 
# =======================================================
# Keywords whose relationship with coronavirus we want to compare in one figure.
kw_interest = ['sore throat', 'fatigue', 'fever', 'upper respiratory infection', 'lung capacity',
              'hospitalization', 'dry cough', 'sneezing', 'death', 'shortness of breath']
df_new_p = rvis.preprocess_for_multiple_kw_visualization(df_new, kw_interest=kw_interest)

# =======================================================
# Plotly plot
# =======================================================
import plotly.express as px

# One marker per row of df_new_p: publish time on x, probability on y,
# colored by keyword and sized by probability.
# (The leftover `df = px.data.iris()` from the plotly example was removed: it loaded
# an unrelated dataset and bound an unused global name.)
fig = px.scatter(df_new_p, x="publish_time", y="probability", color="keyword",
                 size='probability')

# The y range is set slightly above 1 (upper bound 1.1).
fig.update_layout(
    yaxis=dict(range=[0, 1.1]),
    title="coronavirus - keyword relationship",
    xaxis_title="Publish Time",
    yaxis_title="Strength",
    font=dict(
        family="Courier New, monospace",
        size=18,
    )
)
fig.show()