In [None]:
import re

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from Bio import Entrez

# PMID Quality Statistics
Evaluate the quality of the PMIDs in the result set returned by a search and prioritize the results. Things to consider:
- Recency
- Article Type
- Journal
- Impact Factor
- Scientific Field

### Load Data

In [22]:
# Read the interaction results spreadsheet into `df` and preview the first rows.
results_path = 'data/final_results_test3.xlsx'
df = pd.read_excel(results_path)
df.head()

Unnamed: 0.1,Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence,gene_concept,gene_label,gene_match_type,drug_concept,drug_label,drug_match_type
0,50,37726279,venetoclax,ABCC1,YES,INHIBITING,Genetic and pharmacologic ABCC1 inactivation p...,normalize.gene.hgnc:51,ABCC1,100,normalize.therapy.rxcui:1747556,venetoclax,80
1,51,37726279,glutathione,ABCC1,YES,ACTIVATING,Consistent with ABCC1-specific export of gluta...,normalize.gene.hgnc:51,ABCC1,100,normalize.therapy.rxcui:4890,glutathione,80
2,7,37004989,Kynurenine,AhR,YES,ACTIVATING,"An endogenous AhR ligand, kynurenine (Kyn), wa...",normalize.gene.hgnc:348,AHR,100,normalize.therapy.drugbank:DB02070,Kynurenine,80
3,32,33932119,ONC201,AKT,YES,INHIBITING,"The compensatory, pro-survival PI3K/AKT/mTOR p...",normalize.gene.hgnc:391,AKT1,60,normalize.therapy.iuphar.ligand:9978,ONC201,80
4,36,26884600,ONC201,AKT,YES,INHIBITING,ONC201 (also called TIC10) is a small molecule...,normalize.gene.hgnc:391,AKT1,60,normalize.therapy.iuphar.ligand:9978,ONC201,80


In [None]:
# Contact address sent along with the E-utilities requests.
Entrez.email = "mcannon068nw@gmail.com"

# Fetch the PubMed record of every distinct PMID in the results table.
# `retmode` (not `rettype`) is the E-utilities parameter that selects XML output.
handle = Entrez.efetch(db="pubmed", id=df['pmid'].unique().tolist(), retmode="xml")
try:
    # Parsed structure; the articles live under records['PubmedArticle'].
    records = Entrez.read(handle)
finally:
    # Release the HTTP response even if parsing fails.
    handle.close()

In [24]:
# Show the raw MeshHeadingList of the first article that has one, as a sample
# of the structure that is parsed further below.
first_mesh_headings = next(
    (
        entry['MedlineCitation']['MeshHeadingList']
        for entry in records['PubmedArticle']
        if 'MeshHeadingList' in entry['MedlineCitation']
    ),
    None,
)
if first_mesh_headings is not None:
    print(first_mesh_headings)

[{'QualifierName': [], 'DescriptorName': StringElement('Humans', attributes={'UI': 'D006801', 'MajorTopicYN': 'N'})}, {'QualifierName': [StringElement('pharmacology', attributes={'UI': 'Q000494', 'MajorTopicYN': 'N'}), StringElement('therapeutic use', attributes={'UI': 'Q000627', 'MajorTopicYN': 'N'})], 'DescriptorName': StringElement('Sulfonamides', attributes={'UI': 'D013449', 'MajorTopicYN': 'N'})}, {'QualifierName': [], 'DescriptorName': StringElement('ATP-Binding Cassette Transporters', attributes={'UI': 'D018528', 'MajorTopicYN': 'N'})}, {'QualifierName': [], 'DescriptorName': StringElement('Antineoplastic Agents', attributes={'UI': 'D000970', 'MajorTopicYN': 'Y'})}, {'QualifierName': [StringElement('drug therapy', attributes={'UI': 'Q000188', 'MajorTopicYN': 'N'}), StringElement('genetics', attributes={'UI': 'Q000235', 'MajorTopicYN': 'N'})], 'DescriptorName': StringElement('Leukemia, Myeloid, Acute', attributes={'UI': 'D015470', 'MajorTopicYN': 'Y'})}, {'QualifierName': [], 'De

In [31]:
# Annotate every row of `df` with metadata from the PubMed record sharing its PMID:
# publication year, article type(s), journal title and MeSH terms.
for article in records['PubmedArticle']:
    citation = article['MedlineCitation']
    article_info = citation['Article']
    pmid = int(citation['PMID'])

    # Extract article type(s) as a single '; '-separated string
    article_types_str = '; '.join(str(t) for t in article_info['PublicationTypeList'])
    journal = str(article_info['Journal']['Title'])

    # Extract publication year as an int in both branches.
    # MedlineDate is free text (e.g. "2000 Nov-Dec", "1975-1976"), so take the
    # first 4-digit run instead of assuming the first space-separated token is a year.
    pub_date_info = article_info['Journal']['JournalIssue']['PubDate']
    if 'Year' in pub_date_info:
        pub_year = int(pub_date_info['Year'])
    else:
        year_match = re.search(r'\d{4}', str(pub_date_info.get('MedlineDate', '')))
        pub_year = int(year_match.group()) if year_match else None

    # Extract MeSH terms: "Descriptor/Qualifier" per qualifier, or the bare
    # descriptor when the heading has no qualifiers
    mesh_terms = []
    for mh in citation.get('MeshHeadingList', []):
        descriptor = str(mh['DescriptorName'])
        qualifiers = [str(q) for q in mh.get('QualifierName', [])]
        if qualifiers:
            mesh_terms.extend(f"{descriptor}/{q}" for q in qualifiers)
        else:
            mesh_terms.append(descriptor)
    mesh_terms_str = '; '.join(mesh_terms)

    # Assign values to every DataFrame row for this PMID (mask computed once)
    pmid_rows = df['pmid'] == pmid
    df.loc[pmid_rows, 'publish_year'] = pub_year
    df.loc[pmid_rows, 'article_type'] = article_types_str
    df.loc[pmid_rows, 'journal'] = journal
    df.loc[pmid_rows, 'mesh_terms'] = mesh_terms_str

# Nullable integer dtype keeps years as whole numbers even when some are missing.
if 'publish_year' in df.columns:
    df['publish_year'] = pd.to_numeric(df['publish_year'], errors='coerce').astype('Int64')

df


Unnamed: 0.1,Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence,gene_concept,gene_label,gene_match_type,...,drug_match_type,publish_year,article_type,journal,SJR Best Quartile,SJR,H index,Ref. / Doc.,Categories,mesh_terms
0,50,37726279,venetoclax,ABCC1,YES,INHIBITING,Genetic and pharmacologic ABCC1 inactivation p...,normalize.gene.hgnc:51,ABCC1,100,...,80,2023,"Journal Article; Research Support, N.I.H., Ext...",Nature communications,Q1,4.761,577.0,6352,"Biochemistry, Genetics and Molecular Biology (...",Humans; Sulfonamides/pharmacology; Sulfonamide...
1,51,37726279,glutathione,ABCC1,YES,ACTIVATING,Consistent with ABCC1-specific export of gluta...,normalize.gene.hgnc:51,ABCC1,100,...,80,2023,"Journal Article; Research Support, N.I.H., Ext...",Nature communications,Q1,4.761,577.0,6352,"Biochemistry, Genetics and Molecular Biology (...",Humans; Sulfonamides/pharmacology; Sulfonamide...
2,7,37004989,Kynurenine,AhR,YES,ACTIVATING,"An endogenous AhR ligand, kynurenine (Kyn), wa...",normalize.gene.hgnc:348,AHR,100,...,80,2023,"Journal Article; Research Support, N.I.H., Ext...",Molecular metabolism,Q1,3.235,115.0,6490,Cell Biology (Q1); Molecular Biology (Q1),Mice; Animals; Reactive Oxygen Species/metabol...
3,32,33932119,ONC201,AKT,YES,INHIBITING,"The compensatory, pro-survival PI3K/AKT/mTOR p...",normalize.gene.hgnc:391,AKT1,60,...,80,2021,Journal Article,Cancer medicine,Q1,1.201,95.0,4341,"Radiology, Nuclear Medicine and Imaging (Q1); ...","Antineoplastic Agents/pharmacology; Carcinoma,..."
4,36,26884600,ONC201,AKT,YES,INHIBITING,ONC201 (also called TIC10) is a small molecule...,normalize.gene.hgnc:391,AKT1,60,...,80,2016,"Journal Article; Research Support, N.I.H., Ext...",Science signaling,Q1,2.143,187.0,5034,Biochemistry (Q1); Cell Biology (Q1); Molecula...,Activating Transcription Factor 4/genetics; Ac...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,35,26884600,ONC201,TRAIL,YES,ACTIVATING,ONC201 (also called TIC10) is a small molecule...,normalize.gene.hgnc:11925,TNFSF10,60,...,80,2016,"Journal Article; Research Support, N.I.H., Ext...",Science signaling,Q1,2.143,187.0,5034,Biochemistry (Q1); Cell Biology (Q1); Molecula...,Activating Transcription Factor 4/genetics; Ac...
133,56,37942576,melphalan,Trip13,YES,INHIBITING,Combining with melphalan or HDAC inhibitor pan...,normalize.gene.hgnc:12307,TRIP13,100,...,80,2023,"Journal Article; Research Support, Non-U.S. Gov't",Cancer medicine,Q1,1.201,95.0,4341,"Radiology, Nuclear Medicine and Imaging (Q1); ...",Animals; Humans; Mice; Cell Cycle; Cell Cycle ...
134,57,37942576,panobinostat,Trip13,YES,INHIBITING,Combining with melphalan or HDAC inhibitor pan...,normalize.gene.hgnc:12307,TRIP13,100,...,80,2023,"Journal Article; Research Support, Non-U.S. Gov't",Cancer medicine,Q1,1.201,95.0,4341,"Radiology, Nuclear Medicine and Imaging (Q1); ...",Animals; Humans; Mice; Cell Cycle; Cell Cycle ...
135,135,39047882,galangin,TRPV1,YES,INHIBITING,"Molecularly, galangin demonstrated favorable b...",normalize.gene.hgnc:12716,TRPV1,100,...,80,2024,Journal Article,Journal of ethnopharmacology,Q1,1.142,243.0,6432,Drug Discovery (Q1); Pharmacology (Q1),Animals; Flavonoids/pharmacology; Ethanol; TRP...


### SJR Score / H-Index

In [None]:
# Load SJR data (semicolon-delimited file)
sjr = pd.read_csv("data/scimagojr2024.csv", sep=';')

# Journal-level metrics copied from the SJR table onto the results table
sjr_metric_cols = ['SJR Best Quartile', 'SJR', 'H index', 'Ref. / Doc.', 'Categories']


def _normalize_title(titles):
    """Lower-case a Series of journal titles and strip everything except a-z, 0-9 and spaces."""
    return titles.str.lower().str.replace(r'[^a-z0-9 ]', '', regex=True)


# Normalize journal names for matching
sjr['Title_norm'] = _normalize_title(sjr['Title'])

# One lookup row per normalized title: duplicate keys would multiply rows of `df`
# in the left merge, and NaN keys would match articles whose journal is missing.
sjr_lookup = (
    sjr[['Title_norm'] + sjr_metric_cols]
    .dropna(subset=['Title_norm'])
    .drop_duplicates(subset='Title_norm', keep='first')
)

# Merge on normalized title. Metric columns left over from a previous run of this
# cell are dropped first, so re-running does not create *_x / *_y duplicates.
df = (
    df.drop(columns=sjr_metric_cols, errors='ignore')
    .assign(journal_norm=lambda frame: _normalize_title(frame['journal']))
    .merge(sjr_lookup, left_on='journal_norm', right_on='Title_norm', how='left')
    # Clean up the helper key columns
    .drop(columns=['journal_norm', 'Title_norm'])
)


#### Some Definitions from SJR
*via https://www.scimagojr.com/help.php*  
  
**SJR (SCImago Journal Rank) indicator**  
Represents the average number of weighted citations received in the selected year by documents published in the chosen journal during the previous three years. For example, it counts the weighted citations received in year X for documents published in the journal in years X-1, X-2, and X-3. See detailed description of SJR (PDF). The set of journals is ranked according to SJR and divided into four equal groups (Quartiles), with Q1 representing the highest values and Q4 the lowest. The Quartile data is only available for Journals and Book Series.  
  
**H Index**  
The h-index expresses the journal's number of articles (h) that have received at least h citations. It quantifies both journal scientific output and scientific impact; it is also applicable to scientists, countries, etc. (See Hirsch, J.E. (2005)).  
  
**References per Document**  
Average number of references per document in the selected year.

In [28]:
# SJR values may be decimal-comma strings; swap the comma for a dot and convert to
# real floats so that ordering by SJR below is numeric rather than lexicographic
# (as strings, "10.2" would sort before "8.5").
df['SJR'] = pd.to_numeric(
    df['SJR'].map(lambda x: x.replace(',', '.') if isinstance(x, str) else x),
    errors='coerce',
)
df[['journal','SJR','H index']].value_counts().sort_index(level="SJR", ascending=False)

journal                                                                          SJR    H index
European urology                                                                 8.529  274.0      5
Blood                                                                            5.823  540.0      1
Circulation research                                                             4.897  402.0      1
Nature communications                                                            4.761  577.0      2
Autophagy                                                                        4.447  202.0      7
PLoS medicine                                                                    4.279  289.0      2
Cancer research                                                                  3.879  510.0      4
Diabetologia                                                                     3.617  266.0      1
EBioMedicine                                                                     3.591  136.0   

In [68]:
import pandas as pd
import plotly.express as px

# --- Build one (pmid, journal, SJR, H index) row per article ---
journal_pairs = df[["pmid", "journal", "SJR", "H index"]].drop_duplicates(subset=["pmid"])

# --- Count number of unique PMIDs per journal ---
# Group on the journal name only: grouping on SJR / H index as well would silently
# drop every journal whose metrics are missing (groupby discards NaN keys).
journal_counts = (
    journal_pairs.dropna(subset=["journal"])
    .groupby("journal", as_index=False)
    .agg(**{
        "SJR": ("SJR", "first"),
        "H index": ("H index", "first"),
        "count": ("pmid", "nunique"),
    })
    .sort_values("count", ascending=False)
)

# Select top N journals
top_n = 20
journal_top = journal_counts.head(top_n)

# --- Plotly bar chart ---
# Plotly titles need <br> for a line break; a bare "\n" is not rendered as one.
fig = px.bar(
    journal_top,
    x="count",
    y="journal",
    orientation="h",
    text="count",
    title=f"Top {top_n} Journals by Article Count <br>(unique articles n={df['pmid'].nunique()})",
    labels={"count": "Number of Unique Articles", "journal": "Journal"}
)

fig.update_traces(
    texttemplate="%{text}",
    textposition="outside",
    marker=dict(line=dict(width=0.5, color="black"))
)

fig.update_layout(
    template="plotly_white",
    title=dict(x=0.5, font=dict(size=22)),
    xaxis=dict(showgrid=True, gridcolor="lightgray", title_font=dict(size=16)),
    yaxis=dict(title_font=dict(size=16), tickfont=dict(size=12), categoryorder="total ascending"),
    font=dict(size=14),
    margin=dict(l=250, r=40, t=80, b=40)  # wider margin for long journal names
)
fig.write_image('graphs/top_journals_by_article-d1.png', width=1000, height=400, scale=3)
fig.show()


In [71]:
import pandas as pd
import plotly.express as px

def _plot_distribution(data, column, title, axis_label):
    """Build a 30-bin histogram of `data[column]` with a marginal box plot.

    `title` is the figure title and `axis_label` the display label for `column`.
    Returns the styled Plotly figure without showing or saving it.
    """
    fig = px.histogram(
        data,
        x=column,
        nbins=30,
        marginal="box",
        opacity=0.75,
        title=title,
        labels={column: axis_label, "count": "Frequency"},
    )
    fig.update_layout(
        template="plotly_white",
        title=dict(x=0.5, font=dict(size=20)),
        xaxis=dict(showgrid=True, gridcolor="lightgray"),
        yaxis=dict(showgrid=True, gridcolor="lightgray"),
        font=dict(size=14),
        bargap=0.05
    )
    return fig


# Deduplicate by PMID so each article contributes once to the distributions
tdf = df.drop_duplicates(subset=["pmid"]).copy()

# Ensure numerics. Any decimal comma still present in a string SJR value is turned
# into a dot first; otherwise errors='coerce' would silently convert it to NaN.
tdf['SJR'] = pd.to_numeric(
    tdf['SJR'].map(lambda v: v.replace(',', '.') if isinstance(v, str) else v),
    errors='coerce',
)
tdf['H index'] = pd.to_numeric(tdf['H index'], errors='coerce')

# Plot
fig_sjr = _plot_distribution(tdf, "SJR", "Distribution of SJR Scores", "SJR Score")
fig_hindex = _plot_distribution(tdf, "H index", "Distribution of H-index Scores", "H-index")

fig_sjr.show()
fig_sjr.write_image('graphs/sjr_distribution-d1.png', width=1000, height=400, scale=3)

fig_hindex.show()
fig_hindex.write_image('graphs/h_index_distribution-d1.png', width=1000, height=400, scale=3)

**SJR Score Interpretation**:  
The SJR is normalized so that the average SJR score across all journals in a particular subject area is 1.0. A journal with an SJR greater than 1.0 is considered to have above-average citation potential and prestige within its discipline. For example, in this dataset, the SJR of the prestigious hematology journal *Blood* is 5.823.  
  
For the *h-index*, higher is better. For a journal, it is the largest number *h* such that *h* of its papers have each been cited at least *h* times. In this dataset, the h-index of the prestigious hematology journal *Blood* is 540.

### MeSH Headings

In [74]:
# Build (pmid, MeSH term) pairs by splitting each row's '; '-separated term string;
# rows without terms and empty fragments are skipped.
mesh_pairs = [
    (pmid, term.strip())
    for pmid, terms in zip(df['pmid'], df['mesh_terms'])
    if pd.notnull(terms)
    for term in terms.split('; ')
    if term.strip()
]

mesh_df = pd.DataFrame(mesh_pairs, columns=["pmid", "MeSH Term"])

# Dedup: `df` has several rows per article, so each pair must count only once
mesh_df = mesh_df.drop_duplicates(subset=["pmid", "MeSH Term"])

# Number of distinct articles carrying each term
mesh_counts = (
    mesh_df.groupby("MeSH Term")["pmid"]
    .nunique()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)

top_n = 20
mesh_top = mesh_counts.head(top_n)

# Total distinct terms across all articles, taken from the table built above
# rather than from any variable left over from another cell.
n_mesh_terms = mesh_df["MeSH Term"].nunique()

# Plot (Plotly titles need <br> for a line break; a bare "\n" is not rendered as one)
fig = px.bar(
    mesh_top,
    x="count",
    y="MeSH Term",
    orientation="h",
    text="count",
    title=f"Top {top_n} MeSH Terms <br>(MeSH terms n={n_mesh_terms})",
    labels={"count": "Number of Articles", "MeSH Term": "MeSH Term"}
)

fig.update_traces(
    texttemplate="%{text}",
    textposition="outside",
    marker=dict(line=dict(width=0.5, color="black"))
)

fig.update_layout(
    template="plotly_white",
    title=dict(x=0.5, font=dict(size=22)),
    xaxis=dict(showgrid=True, gridcolor="lightgray", title_font=dict(size=16)),
    yaxis=dict(title_font=dict(size=16), tickfont=dict(size=12), categoryorder="total ascending"),
    font=dict(size=14),
    margin=dict(l=150, r=40, t=80, b=40)
)

fig.show()
fig.write_image('graphs/top_mesh_terms-d1.png', width=1000, height=600, scale=3)
