### Install Dependencies

In [None]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.0/250.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2


### Entities

In [None]:
import pandas as pd
import plotly.express as px
from IPython.display import HTML

# Path of the tab-separated entity table.
entity_file = "graph_data/entities.tsv"

# Parse the table, keeping every column as text (dtype=str).
df = pd.read_csv(
    entity_file,
    dtype=str,
    sep='\t',
)

In [None]:
# Count the entity rows for each 'label' (all resources combined)
grouped_df_by_label = df.groupby(['label']).size().reset_index(name='count')

# Bar chart of the per-label totals; the title names only the grouping actually used.
# text="count" together with the trace update prints each bar's value above it.
fig1 = px.bar(grouped_df_by_label, x='label', y='count', title='Count of Rows by Label', text="count")
fig1.update_traces(texttemplate='%{text}', textposition='outside')

# Render the figure as embedded HTML
HTML(fig1.to_html())

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Tally the entity rows for every (label, resource) combination
grouped_df = (
    df.groupby(['label', 'resource'])
    .size()
    .reset_index(name='count')
)

# One bar per label, with segments coloured by resource
fig2 = px.bar(
    grouped_df,
    x='label',
    y='count',
    color='resource',
    title='Count of Rows by Label and Resource',
)

# Emit the chart as inline HTML
HTML(fig2.to_html())

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Same counts with the axes swapped: one bar per resource, segments coloured by label.
# Reads grouped_df, which must already hold 'label', 'resource' and 'count' columns.
fig3 = px.bar(
    grouped_df,
    x='resource',
    y='count',
    color='label',
    title='Count of Rows by Label and Resource',
)

# Emit the chart as inline HTML
HTML(fig3.to_html())

Output hidden; open in https://colab.research.google.com to view.

In [None]:
import plotly.graph_objects as go
from IPython.display import display

# Count entity rows for every (label, resource) pair
grouped_df = df.groupby(['label', 'resource']).size().reset_index(name='count')

# One pie chart is drawn per distinct label
unique_labels = grouped_df['label'].unique()


for label in unique_labels:
  # Per-resource counts that belong to the current label
  data = grouped_df[grouped_df['label'] == label]
  fig = go.Figure(data=[go.Pie(labels=data['resource'], values=data['count'])])

  # Total rows for this label, summed directly from the counts
  # rather than read back out of the figure object
  total = int(data['count'].sum())

  # Show the total as arrow-less text at x=0.5, y=-0.1; the return value
  # of add_annotation is not needed, so it is not bound to a name
  fig.add_annotation(
      text=f'{total}',
      x=0.5,
      y=-0.1,
      font=dict(size=20),
      showarrow=False
  )
  fig.update_layout(title_text=f'Count of Rows for Label: {label}')
  fig.show()



IPython.utils.traitlets has moved to a top-level traitlets package.



### Relations

In [None]:
import pandas as pd

# Path of the tab-separated relation table.
relation_file = "graph_data/relations.tsv"

# Parse the table with every column as text; on_bad_lines='warn' reports
# malformed lines as warnings instead of raising. Note that this rebinds `df`.
df = pd.read_csv(
    relation_file,
    dtype=str,
    sep='\t',
    on_bad_lines='warn',
)

In [None]:
import plotly.express as px
from IPython.display import HTML

# Build a "<source_type>:<target_type>" key for every relation row
df["source_target"] = df["source_type"] + ":" + df["target_type"]

# Tally the relation rows for every (source_target, resource) combination
grouped_df = (
    df.groupby(['source_target', "resource"])
    .size()
    .reset_index(name='count')
)

# One bar per source-target pair, with segments coloured by resource
fig1 = px.bar(
    grouped_df,
    x='source_target',
    y='count',
    color='resource',
    title='Count of Rows by Source-Target and Resource',
)

# Emit the chart as inline HTML
HTML(fig1.to_html())

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Same counts with the axes swapped: one bar per resource, segments coloured by
# source-target pair. Reads grouped_df, which must already hold 'source_target',
# 'resource' and 'count' columns.
fig2 = px.bar(
    grouped_df,
    x='resource',
    y='count',
    color='source_target',
    title='Count of Rows by Source-Target and Resource',
)

# Emit the chart as inline HTML
HTML(fig2.to_html())

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Count the relation rows for each 'source_target' pair (all resources combined).
# The variable name is kept as-is so any later reference to it keeps working.
grouped_df_by_label = df.groupby(['source_target']).size().reset_index(name='count')

# Plot the per-pair totals computed just above. The earlier version passed
# grouped_df (the per-resource breakdown) to px.bar and left this frame unused.
fig3 = px.bar(grouped_df_by_label, x='source_target', y='count', color="source_target", title='Count of Rows by Source-Target')

# Render the figure as embedded HTML
HTML(fig3.to_html())

Output hidden; open in https://colab.research.google.com to view.

In [None]:
import plotly.graph_objects as go
from IPython.display import display

# Count relation rows for every (source_target, resource) pair
grouped_df = df.groupby(['source_target', 'resource']).size().reset_index(name='count')

# One pie chart is drawn per distinct source-target pair
unique_labels = grouped_df['source_target'].unique()


for label in unique_labels:
  # Per-resource counts that belong to the current source-target pair
  data = grouped_df[grouped_df['source_target'] == label]
  fig = go.Figure(data=[go.Pie(labels=data['resource'], values=data['count'])])

  # Total rows for this pair, summed directly from the counts
  total = int(data['count'].sum())

  # Show the total as arrow-less text at x=0.5, y=-0.1; the return value
  # of add_annotation is not needed, so it is not bound to a name
  fig.add_annotation(
      text=f'{total}',
      x=0.5,
      y=-0.1,
      font=dict(size=20),
      showarrow=False
  )
  # The loop iterates source_target values, so the title says so
  fig.update_layout(title_text=f'Count of Rows for Source-Target: {label}')
  fig.show()


In [None]:
# Directed key "<source_type>:<target_type>" for every relation row
df["source_target"] = df['source_type'] + ":" + df["target_type"]

# Direction-agnostic key: the two types sorted alphabetically before joining, so
# "A:B" and "B:A" share one key. A plain zip over the two columns yields the same
# values as a row-wise df.apply(axis=1) without constructing a Series per row.
df["sorted_source_target"] = [
    ":".join(sorted(type_pair))
    for type_pair in zip(df["source_type"], df["target_type"])
]

# Count the rows for every (sorted_source_target, resource) combination
grouped_df = df.groupby(['sorted_source_target', 'resource']).size().reset_index(name='count')

# Alphabetical order of the keys, used to fix the x-axis category order
sorted_source_target_order = sorted(df['sorted_source_target'].unique())

# One bar per direction-agnostic pair, with segments coloured by resource
fig1 = px.bar(grouped_df, x='sorted_source_target', y='count', color='resource',
              title='Count of Rows by Source-Target and Resource',
              category_orders={'sorted_source_target': sorted_source_target_order})

# Render the figure as embedded HTML
HTML(fig1.to_html())

Output hidden; open in https://colab.research.google.com to view.

### Annotate the drugs from Publications & Survey

In [None]:
import pandas as pd
import plotly.express as px
from IPython.display import HTML

# Path of the tab-separated entity table.
entity_file = "graph_data/entities.tsv"

# Parse the table, keeping every column as text (dtype=str). This rebinds `df`.
df = pd.read_csv(
    entity_file,
    dtype=str,
    sep='\t',
)

In [None]:
# Load the "deduplicated_drugs" worksheet of the drug/symptom workbook
drugs = pd.read_excel(
    "./graph_data/drug_symptoms.xlsx",
    sheet_name="deduplicated_drugs",
)
# Bare expression so the notebook renders the table
drugs

Unnamed: 0,DrugBankID,DrugName,Category
0,DB00001,Lepirudin,
1,DB00005,Etanercept,
2,DB00006,Bivalirudin,
3,DB00009,Alteplase,
4,DB00013,Urokinase,
...,...,...,...
808,DB17289,L-arginine +/- L-citrulline,Vasodilating
809,DB17508,Hafnium oxide,
810,DB17614,Ergothioneine,Mushroom Derivatives
811,DB17735,Ajoene,


In [None]:
# Create a list of IDs with the "DrugBank:" prefix
drugbank_ids = ["DrugBank:" + drug_id for drug_id in drugs['DrugBankID']]

# Use 'isin' to filter the DataFrame based on the condition
filtered_df = df[df["id"].isin(drugbank_ids)]
filtered_df

Unnamed: 0,id,name,label,resource,description,synonyms,pmids,taxid,xrefs
136042,DrugBank:DB00001,Lepirudin,Compound,DrugBank,,Lepirudin Recombinant|Lepirudin|Hirudin varian...,,,CHEMBL:CHEMBL1201666|MESH:C083544|DrugBank:DB0...
136046,DrugBank:DB00005,Etanercept,Compound,DrugBank,,CD120b|Etanercept|Enbrel Sureclick|Enbrel|Tumo...,,,DrugBank:DB00005|UMLS:C0717758|CHEMBL:CHEMBL12...
136047,DrugBank:DB00006,Bivalirudin,Compound,DrugBank,"A synthetic peptide of 20 amino acids, compris...",Bivalirudina|bivalirudinum|Angiomax rtu|bivali...,29461391|29345444|29345439|29345440|29345441|2...,,DrugBank:DB00006|MESH:C074619|UMLS:C0168273|CH...
136050,DrugBank:DB00009,Alteplase,Compound,DrugBank,,Activase|Alteplasa|RT-PA|Cathflo activase|Tiss...,,,CHEMBL:CHEMBL1201593|DrugBank:DB00009
136054,DrugBank:DB00013,Urokinase,Compound,DrugBank,,Urokinase-type plasminogen activator precursor...,,,DrugBank:DB00013|UMLS:C0042071|CHEMBL:CHEMBL12...
...,...,...,...,...,...,...,...,...,...
403633,DrugBank:DB14539,Hydrocortisone acetate,Compound,DrugBank,,21-O-acetylcortisol|Cortisol 21-acetate|Hydroc...,,,
403634,DrugBank:DB14540,Hydrocortisone butyrate,Compound,DrugBank,,"11β,21-dihydroxy-17α-butyryloxy-4-pregnene-3,2...",,,
403635,DrugBank:DB14544,Hydrocortisone valerate,Compound,DrugBank,,Cortisol 17-valerate|Hydrocortisone 17-valerat...,,,
403636,DrugBank:DB14596,Loteprednol etabonate,Compound,DrugBank,,Loteprednol etabonate,,,


In [None]:
# Prefixed IDs that were requested but matched no row of the entity table.
# The "DrugBank:" prefix is stripped again, and the result is sorted so the list
# does not depend on set iteration order. The loop variable avoids shadowing
# the builtin `id`.
missed_ids = sorted(
    prefixed_id.split(":", 1)[1]
    for prefixed_id in set(drugbank_ids) - set(filtered_df['id'])
)

# Rows of the drug sheet for those unmatched IDs, in the sheet's own order
missed_df = drugs[drugs['DrugBankID'].isin(missed_ids)]
missed_df

Unnamed: 0,DrugBankID,DrugName,Category
792,DB16425,Rintatolimod,
793,DB16481,Dociparstat sodium,
795,DB16629,Serdexmethylphenidate,
797,DB16701,Plasminogen,
800,DB16824,Cepharanthine,
801,DB16847,Olprinone,
802,DB16854,Caryophyllene,
803,DB16865,Rosmarinic acid,
804,DB16886,Tanshinone I,
805,DB16921,Oxaloacetic acid,
