In [1]:
import pandas as pd
import requests

## Interaction Novelty

#### Load Data

In [2]:
df = pd.read_excel('data/final_results_test3.xlsx').drop(labels='Unnamed: 0',axis=1)
df.head(5)

Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence,gene_concept,gene_label,gene_match_type,drug_concept,drug_label,drug_match_type
0,37726279,venetoclax,ABCC1,YES,INHIBITING,Genetic and pharmacologic ABCC1 inactivation p...,normalize.gene.hgnc:51,ABCC1,100,normalize.therapy.rxcui:1747556,venetoclax,80
1,37726279,glutathione,ABCC1,YES,ACTIVATING,Consistent with ABCC1-specific export of gluta...,normalize.gene.hgnc:51,ABCC1,100,normalize.therapy.rxcui:4890,glutathione,80
2,37004989,Kynurenine,AhR,YES,ACTIVATING,"An endogenous AhR ligand, kynurenine (Kyn), wa...",normalize.gene.hgnc:348,AHR,100,normalize.therapy.drugbank:DB02070,Kynurenine,80
3,33932119,ONC201,AKT,YES,INHIBITING,"The compensatory, pro-survival PI3K/AKT/mTOR p...",normalize.gene.hgnc:391,AKT1,60,normalize.therapy.iuphar.ligand:9978,ONC201,80
4,26884600,ONC201,AKT,YES,INHIBITING,ONC201 (also called TIC10) is a small molecule...,normalize.gene.hgnc:391,AKT1,60,normalize.therapy.iuphar.ligand:9978,ONC201,80


In [3]:
df[['drug_name','gene_name','pmid']]

Unnamed: 0,drug_name,gene_name,pmid
0,venetoclax,ABCC1,37726279
1,glutathione,ABCC1,37726279
2,Kynurenine,AhR,37004989
3,ONC201,AKT,33932119
4,ONC201,AKT,26884600
...,...,...,...
132,ONC201,TRAIL,26884600
133,melphalan,Trip13,37942576
134,panobinostat,Trip13,37942576
135,galangin,TRPV1,39047882


#### GraphQL

In [4]:

GQL_URL = "https://dgidb.org/api/graphql"

GQL_QUERY = """
query GeneInteractions($genes: [String!]!) {
  genes(names: $genes) {
    nodes {
      name
      conceptId
      interactions {
        drug {
          name
          conceptId
          approved
        }
        gene {
          name
          conceptId
          longName
        }
        interactionScore
        interactionTypes { type directionality }
        interactionAttributes { name value }
        publications { pmid }
        sources { sourceDbName }
      }
    }
  }
}
"""

def fetch_gene_interactions(genes):
    payload = {"query": GQL_QUERY, "variables": {"genes": genes}}
    headers = {"Content-Type": "application/json", "Accept": "application/json"}
    r = requests.post(GQL_URL, json=payload, headers=headers)
    r.raise_for_status()
    return r.json()


In [5]:
genes = list(df['gene_label'].unique())
genes[0:5]

['ABCC1', 'AHR', 'AKT1', 'AKT2', 'PRKAA2']

In [6]:
response = fetch_gene_interactions(genes)

In [7]:
genes_data = response.get("data", {}).get("genes", {}).get("nodes", [])

rows = []
for gene_node in genes_data:
    gene_name = gene_node.get("name")
    gene_id = gene_node.get("conceptId")
    for interaction in gene_node.get("interactions", []):
        drug = interaction.get("drug") or {}
        igene = interaction.get("gene") or {}
        interaction_types = interaction.get("interactionTypes") or []
        interaction_attributes = interaction.get("interactionAttributes") or []
        publications = interaction.get("publications") or []
        sources = interaction.get("sources") or []

        rows.append({
            "gene_name": gene_name,
            "gene_concept_id": gene_id,
            "interaction_gene_name": igene.get("name"),
            "interaction_gene_concept_id": igene.get("conceptId"),
            "interaction_gene_long_name": igene.get("longName"),
            "drug_name": drug.get("name"),
            "drug_concept_id": drug.get("conceptId"),
            "drug_approved": drug.get("approved"),
            "interaction_score": interaction.get("interactionScore"),
            "interaction_types": ";".join(
                f"{t.get('type')}({t.get('directionality')})"
                for t in interaction_types
                if t
            ),
            "interaction_attributes": ";".join(
                f"{a.get('name')}={a.get('value')}"
                for a in interaction_attributes
                if a
            ),
            "pmids": ";".join(str(p.get("pmid")) for p in publications if p),
            "sources": ";".join(s.get("sourceDbName") for s in sources if s),
        })

data = pd.DataFrame(rows)
data.head()

Unnamed: 0,gene_name,gene_concept_id,interaction_gene_name,interaction_gene_concept_id,interaction_gene_long_name,drug_name,drug_concept_id,drug_approved,interaction_score,interaction_types,interaction_attributes,pmids,sources
0,MYD88,hgnc:7562,MYD88,hgnc:7562,MYD88 innate immune signal transduction adaptor,ZANUBRUTINIB,rxcui:2262435,True,4.350317,,,,PharmGKB
1,MYD88,hgnc:7562,MYD88,hgnc:7562,MYD88 innate immune signal transduction adaptor,FENTANYL CITRATE,rxcui:142436,True,1.160084,,,26332828.0,PharmGKB
2,MYD88,hgnc:7562,MYD88,hgnc:7562,MYD88 innate immune signal transduction adaptor,IBRUTINIB,rxcui:1442981,True,1.535406,,Alteration=MYD88:L265P,,CGI;CIViC;PharmGKB
3,MAP2K7,hgnc:6847,MAP2K7,hgnc:6847,mitogen-activated protein kinase kinase 7,DABRAFENIB,rxcui:1424911,True,0.193347,inhibitor(INHIBITORY),Clinical Trial ID=NCT01701037;Cancer Type=Mela...,,MyCancerGenomeClinicalTrial
4,MAP2K7,hgnc:6847,MAP2K7,hgnc:6847,mitogen-activated protein kinase kinase 7,SELUMETINIB,rxcui:2289380,True,0.158193,inhibitor(INHIBITORY),Clinical Trial ID=NCT01586624;Cancer Type=Lung...,,MyCancerGenomeClinicalTrial


#### Assess Pairs

In [8]:
ai_interactions = df[['drug_name','gene_name','pmid']]
ai_interactions.head(5)

Unnamed: 0,drug_name,gene_name,pmid
0,venetoclax,ABCC1,37726279
1,glutathione,ABCC1,37726279
2,Kynurenine,AhR,37004989
3,ONC201,AKT,33932119
4,ONC201,AKT,26884600


In [9]:
ai_interactions['drug_name'] = ai_interactions['drug_name'].apply(lambda x: x.lower())
ai_interactions['gene_name'] = ai_interactions['gene_name'].apply(lambda x: x.lower())
data['drug_name'] = data['drug_name'].apply(lambda x: x.lower())
data['gene_name'] = data['gene_name'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ai_interactions['drug_name'] = ai_interactions['drug_name'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ai_interactions['gene_name'] = ai_interactions['gene_name'].apply(lambda x: x.lower())


In [10]:
ai_interactions['novel?'] = 'Not Evaluated'
for idx, row in ai_interactions.iterrows():
    tdf = data[data['gene_name']==row['gene_name']].reset_index()
    tdf = tdf[tdf['drug_name']==row['drug_name']].reset_index()
    if len(tdf) > 0:
        ai_interactions.at[idx, 'novel?'] = False
    if len(tdf) == 0:
        ai_interactions.at[idx, 'novel?'] = True

ai_interactions['novel?'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ai_interactions['novel?'] = 'Not Evaluated'


novel?
True     121
False     16
Name: count, dtype: int64

In [11]:
ai_interactions

Unnamed: 0,drug_name,gene_name,pmid,novel?
0,venetoclax,abcc1,37726279,True
1,glutathione,abcc1,37726279,True
2,kynurenine,ahr,37004989,True
3,onc201,akt,33932119,True
4,onc201,akt,26884600,True
...,...,...,...,...
132,onc201,trail,26884600,True
133,melphalan,trip13,37942576,True
134,panobinostat,trip13,37942576,True
135,galangin,trpv1,39047882,True


#### Graph

In [12]:
import plotly.graph_objects as go

ai_interactions["novel?"] = ai_interactions["novel?"].astype(bool)

novel_true = ai_interactions["novel?"].sum()
novel_false = (~ai_interactions["novel?"]).sum()

fig = go.Figure()

fig.add_trace(go.Bar(
    x=[novel_false],
    y=["Interactions"],
    name="Already Known",
    orientation="h",
    marker=dict(color="firebrick"),
    text=[novel_false],
    textposition="inside",
    insidetextanchor="middle",
    textfont=dict(color="white", size=14)
))

fig.add_trace(go.Bar(
    x=[novel_true],
    y=["Interactions"],
    name="Novel",
    orientation="h",
    marker=dict(color="royalblue"),
    text=[novel_true],
    textposition="inside",
    insidetextanchor="middle",
    textfont=dict(color="white", size=14)
))

fig.update_layout(
    barmode="stack",
    template="simple_white",
    title=dict(
        text="Novelty of AI Curated Interactions",
        x=0.5,
        font=dict(size=20)
    ),
    xaxis=dict(
        title="Number of Interactions",
        showgrid=True,
        gridcolor="lightgray",
        zeroline=False,
        linecolor="black",
        mirror=False
    ),
    yaxis=dict(
        showgrid=False,
        linecolor="black",
        mirror=False
    ),
    plot_bgcolor="white",
    paper_bgcolor="white",
    bargap=0.05,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="center",
        x=0.5,
        font=dict(size=14)
    )
)

fig.update_traces(marker_line_width=0)

# fig.write_image("graphs/novel_interactions-d1.png", scale=3, width=800, height=300)

fig.show()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ai_interactions["novel?"] = ai_interactions["novel?"].astype(bool)


## PMID

In [54]:
df

Unnamed: 0,id,pmid,citation,created_at,updated_at


In [None]:
import psycopg2
import pandas as pd
import numpy as np

# 1) Prepare PMIDs
pmids = pd.Series(df['pmid']).dropna().astype(str).unique().tolist()

# 2) Connect
conn = psycopg2.connect(
    dbname="dgidb_2025",
    user="mjc014",
    password="",
    host="localhost",
    port="5432"
)

# 3) Query: cast column -> text so it matches the text[] param
query = """
SELECT *
FROM public.publications
WHERE pmid::text = ANY(%s)
ORDER BY id ASC;
"""

df = pd.read_sql(query, conn, params=(pmids,))
conn.close()

print(f"Found {len(df)} PMIDs already in DGIdb")
display(df.head())

# 4) Novelty %
found_pmids = set(df['pmid'].astype(str).unique())
all_pmids   = set(pmids)
novel_pmids = sorted(all_pmids - found_pmids)

novelty_pct = 100.0 * (len(novel_pmids) / max(1, len(all_pmids)))
print(f"Novel PMIDs: {len(novel_pmids)} / {len(all_pmids)} = {novelty_pct:.2f}%")


Found 0 PMIDs already in DGIdb



pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.



Unnamed: 0,id,pmid,citation,created_at,updated_at


Novel PMIDs: 83 / 83 = 100.00%


### Quantify support for existing interactions

In [15]:
ai_interactions[ai_interactions['novel?']==False]

Unnamed: 0,drug_name,gene_name,pmid,novel?
11,enzalutamide,ar,32351687,False
14,enzalutamide,ar,35302608,False
25,venetoclax,bcl2,28663582,False
27,venetoclax,bcl2,31123034,False
28,venetoclax,bcl2,38053201,False
38,ibrutinib,btk,28663582,False
39,ibrutinib,btk,36719376,False
40,pirtobrutinib,btk,36719376,False
50,lenalidomide,crbn,28529032,False
58,onc201,drd2,33932119,False


In [19]:
import pandas as pd
import requests
from collections import defaultdict
from typing import Any

DGIDB_GQL = "https://dgidb.org/api/graphql"

def gql(query: str, variables: dict | None = None, timeout: int = 60) -> dict:
    r = requests.post(
        DGIDB_GQL,
        json={"query": query, "variables": variables or {}},
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )
    r.raise_for_status()
    payload = r.json()
    if payload.get("errors"):
        raise RuntimeError(payload["errors"])
    return payload["data"]

def as_str(x: Any) -> str:
    """Convert DGIdb field values (str/int/None) into a clean string."""
    if x is None:
        return ""
    if isinstance(x, str):
        return x.strip()
    # ints (pmid) and other primitives
    return str(x).strip()


GENES_WITH_INTERACTIONS_Q1 = """
query($names: [String!]!) {
  genes(names: $names) {
    edges {
      node {
        name
        interactions {
          drug { name }
          gene { name }
          interactionClaims {
            source { sourceDbName }
            publications { pmid }
          }
        }
      }
    }
  }
}
"""

GENES_WITH_INTERACTIONS_Q2 = """
query($names: [String!]!) {
  genes(names: $names) {
    edges {
      node {
        name
        interactions {
          drug { name }
          gene { name }
          sources { sourceDbName }
          publications { pmid }
        }
      }
    }
  }
}
"""

def _collect_evidence_from_interaction(inter: dict) -> tuple[set[str], set[str]]:
    sources: set[str] = set()
    pmids: set[str] = set()

    # Shape A: interactionClaims -> source/publications
    claims = inter.get("interactionClaims") or []
    if claims:
        for claim in claims:
            src = as_str(((claim.get("source") or {}).get("sourceDbName")))
            if src:
                sources.add(src)

            for pub in claim.get("publications") or []:
                pmid = as_str(pub.get("pmid"))
                if pmid:
                    pmids.add(pmid)

        return sources, pmids

    # Shape B: sources/publications directly on interaction
    for s in inter.get("sources") or []:
        src = as_str(s.get("sourceDbName"))
        if src:
            sources.add(src)

    for pub in inter.get("publications") or []:
        pmid = as_str(pub.get("pmid"))
        if pmid:
            pmids.add(pmid)

    return sources, pmids


def fetch_dgidb_sources_pmids(pairs_df: pd.DataFrame) -> pd.DataFrame:
    pairs = (
        pairs_df[["drug_name", "gene_name"]]
        .drop_duplicates()
        .assign(
            drug_name=lambda d: d["drug_name"].astype(str).str.strip(),
            gene_name=lambda d: d["gene_name"].astype(str).str.strip().str.upper(),
        )
    )

    genes = sorted(pairs["gene_name"].unique().tolist())

    # Try preferred query, then fallback if schema doesn't have those fields
    try:
        data = gql(GENES_WITH_INTERACTIONS_Q1, {"names": genes})
    except RuntimeError as e:
        if "undefinedField" not in str(e):
            raise
        data = gql(GENES_WITH_INTERACTIONS_Q2, {"names": genes})

    evidence = defaultdict(lambda: {"sources": set(), "pmids": set()})

    gene_edges = (data.get("genes") or {}).get("edges") or []
    for edge in gene_edges:
        gene_node = edge.get("node") or {}
        for inter in gene_node.get("interactions") or []:
            drug = as_str(((inter.get("drug") or {}).get("name")))
            gene = as_str(((inter.get("gene") or {}).get("name"))).upper()
            if not drug or not gene:
                continue

            key = (gene, drug.lower())
            srcs, pmids = _collect_evidence_from_interaction(inter)
            evidence[key]["sources"].update(srcs)
            evidence[key]["pmids"].update(pmids)

    # Emit one row per input pair
    rows = []
    for _, r in pairs.iterrows():
        gene = r["gene_name"].upper()
        drug = r["drug_name"]
        key = (gene, drug.lower())

        srcs = sorted(evidence[key]["sources"])

        # Sort numeric PMIDs numerically when possible
        def pmid_sort_key(p: str):
            return int(p) if p.isdigit() else p

        pmids = sorted(evidence[key]["pmids"], key=pmid_sort_key)

        rows.append(
            {
                "drug_name": drug,
                "gene_name": gene,
                "dgidb_sources": srcs,
                "dgidb_pmids": pmids,
                "dgidb_source_count": len(srcs),
                "dgidb_pmid_count": len(pmids),
            }
        )

    return pd.DataFrame(rows)


# ---- Usage ----
pairz_df = ai_interactions[ai_interactions["novel?"] == False][["drug_name", "gene_name"]]
dgidb_support = fetch_dgidb_sources_pmids(pairz_df)
dgidb_support


Unnamed: 0,drug_name,gene_name,dgidb_sources,dgidb_pmids,dgidb_source_count,dgidb_pmid_count
0,enzalutamide,AR,"[CGI, CIViC, CancerCommons, ChEMBL, DTC, TTD, ...",[25634130],7,1
1,venetoclax,BCL2,"[CGI, CIViC, ChEMBL, TTD]",[],4,0
2,ibrutinib,BTK,"[CGI, CIViC, COSMIC, ChEMBL, DTC, DoCM, MyCanc...","[24869598, 25222877, 27199251]",10,3
3,pirtobrutinib,BTK,"[ChEMBL, PharmGKB]",[],2,0
4,lenalidomide,CRBN,"[CGI, CIViC, ChEMBL, GuideToPharmacology]",[],4,0
5,onc201,DRD2,[TTD],[],1,0
6,gefitinib,EGFR,"[CGI, CIViC, COSMIC, CancerCommons, ChEMBL, Cl...","[19969465, 20151670, 21274259, 22261807, 23993...",16,5
7,erlotinib,EGFR,"[CGI, CIViC, COSMIC, CancerCommons, ClearityFo...","[2302402, 11255078, 12648464, 14990632, 149906...",16,101
8,pevonedistat,NEDD8,[TTD],[],1,0
9,rosiglitazone,PPARG,"[DTC, TEND, TdgClinicalTrial]","[15745794, 16680159, 19719236, 21030263, 22070...",3,12


### Full dataset Novely Histogram

In [None]:
import plotly.graph_objects as go

pairs_all = ai_interactions[["drug_name", "gene_name"]].drop_duplicates()
dgidb_support_all = fetch_dgidb_sources_pmids(pairs_all)

def source_bin(n: int) -> str:
    if n <= 0:
        return "0"
    if n == 1:
        return "1"
    if n == 2:
        return "2"
    return "3+"

dgidb_support_all["source_bin"] = (
    dgidb_support_all["dgidb_source_count"]
    .fillna(0)
    .astype(int)
    .map(source_bin)
)
bin_order = ["0", "1", "2", "3+"]
bin_counts = dgidb_support_all["source_bin"].value_counts().reindex(bin_order, fill_value=0)


fig = go.Figure(
    data=[go.Bar(
        x=bin_counts.index,
        y=bin_counts.values,
        text=bin_counts.values,
        textposition="outside",
        cliponaxis=False,
        hovertemplate="DGIdb sources: %{x}<br>Interactions: %{y}<extra></extra>",
        marker=dict(line=dict(width=0))
    )]
)

fig.update_layout(
    title=dict(text="Sources with Claim Supporting AI Curated Interaction", x=0.5, xanchor="center"),
    template="plotly_white",
    font=dict(family="Arial", size=18, color="black"),
    paper_bgcolor="white",
    plot_bgcolor="rgba(245,247,250,1)",
    margin=dict(l=90, r=30, t=90, b=80),
    width=850,
    height=520,
    bargap=0.30,
)

fig.update_xaxes(
    title_text="# of Supporting Sources",
    categoryorder="array",
    categoryarray=bin_order,
    showline=True,
    linewidth=1,
    linecolor="rgba(0,0,0,0.35)",
    ticks="outside",
    showgrid=False,
)

fig.update_yaxes(
    title_text="# of Interactions",
    showline=True,
    linewidth=1,
    linecolor="rgba(0,0,0,0.35)",
    ticks="outside",
    showgrid=True,
    gridcolor="rgba(0,0,0,0.08)",
    zeroline=False,
    rangemode="tozero",
)

fig.update_traces(textfont=dict(size=14))
fig.show()
fig.write_image("graphs/dgidb_source_count_bins-d1.png", height=425,width=800,scale=3)
