In [None]:
!pip install biopython pandas scholarly
from Bio import Entrez
from collections import defaultdict
import pandas as pd
import json
from scholarly import scholarly
from itertools import combinations
import difflib
!pip install --upgrade scholarly



In [None]:
# Query PubMed for the search term, download the matching records, and tally
# how many of those records each (ForeName, LastName) author appears on,
# together with every affiliation string listed for that author.
Entrez.email = "mforsnes@uw.edu"
search_term = "oncology"
# NOTE(review): NCBI documents an upper bound on how many PubMed IDs a single
# esearch call returns; confirm retmax=100000 yields the intended result set.
handle = Entrez.esearch(db="pubmed", term=search_term, retmax=100000)
try:
    record = Entrez.read(handle)
finally:
    # Close even when parsing fails so the connection is not leaked.
    handle.close()

idlist = record["IdList"]
handle = Entrez.efetch(db="pubmed", id=idlist, rettype="xml")
try:
    records = Entrez.read(handle, validate=False)
finally:
    # This handle was previously left open for the life of the kernel.
    handle.close()

author_publications = defaultdict(int)
author_affiliations = defaultdict(set)

# A distinct loop name keeps `record` bound to the esearch result instead of
# silently rebinding it to the last article.
for article_record in records["PubmedArticle"]:
    if 'MedlineCitation' not in article_record:
        continue
    citation = article_record['MedlineCitation']
    if 'Article' not in citation or 'AuthorList' not in citation['Article']:
        continue

    for author in citation["Article"]["AuthorList"]:
        # Entries lacking either name part are skipped.
        if 'LastName' not in author or 'ForeName' not in author:
            continue
        author_name = " ".join([author["ForeName"], author["LastName"]])
        author_publications[author_name] += 1
        if 'AffiliationInfo' in author and author['AffiliationInfo']:
            for affiliation in author['AffiliationInfo']:
                author_affiliations[author_name].add(affiliation['Affiliation'])

# Most prolific authors first; keep the first 1000 entries.
sorted_author_publications = sorted(author_publications.items(), key=lambda item: item[1], reverse=True)
top_authors = sorted_author_publications[:1000]

def clean_author_name(author_name):
    """Collapse a full name to its first and last whitespace-separated tokens.

    Middle tokens are dropped ("John Q Public" -> "John Public"). A name with
    fewer than two tokens is returned exactly as it was passed in.
    """
    tokens = author_name.split()
    if len(tokens) <= 1:
        return author_name
    first_token, last_token = tokens[0], tokens[-1]
    return f"{first_token} {last_token}"

# One row per top author: a 1-based Author ID (rank in `top_authors`), the
# cleaned first/last name, the publication count, and every affiliation
# string seen for that author joined with "; ".
author_data = []
for author_id, (full_name, publication_count) in enumerate(top_authors, start=1):
    # Affiliations are stored in a set, whose iteration order is not stable
    # between runs; sorting makes the joined string reproducible.
    affiliations = "; ".join(sorted(author_affiliations[full_name]))
    author_data.append((author_id, clean_author_name(full_name), publication_count, affiliations))

df = pd.DataFrame(author_data, columns=['Author ID', 'Author', 'Publications', 'Affiliations'])

def create_coauthor_edges(records, df):
    """Build an undirected co-authorship edge table for the authors in ``df``.

    Parameters
    ----------
    records : mapping
        Parsed PubMed result; ``records["PubmedArticle"]`` is iterated and each
        entry is read via ``['MedlineCitation']['Article']['AuthorList']`` and
        ``['MedlineCitation']['PMID']``.
    df : pandas.DataFrame
        Must provide ``'Author'`` (cleaned names) and ``'Author ID'`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Author ID 1'``, ``'Author ID 2'``, ``'Shared Publications'``.
        Each unordered pair appears once with the smaller ID first, rows are
        sorted by ID pair, and the shared PMIDs are sorted.

    Notes
    -----
    Article authors are mapped to IDs by cleaned name: an exact match is used
    when present, otherwise the closest name with a difflib ratio >= 0.8.
    Earlier versions emitted both ``(a, b)`` and ``(b, a)`` for one pair and
    produced self-loops when the same ID was matched twice on a paper.
    """
    author_id_map = dict(zip(df['Author'], df['Author ID']))
    known_names = list(author_id_map)
    fuzzy_cache = {}

    def resolve_author_id(cleaned_name):
        """Return the Author ID matching ``cleaned_name``, or None."""
        # An identical name scores 1.0 and would be ranked first by
        # get_close_matches anyway, so the lookup avoids scanning every name.
        if cleaned_name in author_id_map:
            return author_id_map[cleaned_name]
        # Fuzzy results are memoised because the same name recurs across papers.
        if cleaned_name not in fuzzy_cache:
            closest = difflib.get_close_matches(cleaned_name, known_names, n=1, cutoff=0.8)
            fuzzy_cache[cleaned_name] = author_id_map[closest[0]] if closest else None
        return fuzzy_cache[cleaned_name]

    # (smaller_id, larger_id) -> PMIDs of the papers both authors appear on.
    edge_publications = defaultdict(set)

    for record in records["PubmedArticle"]:
        citation = record['MedlineCitation'] if 'MedlineCitation' in record else {}
        article = citation['Article'] if 'Article' in citation else {}
        if 'AuthorList' not in article:
            continue

        # A set drops repeated IDs so no author is paired with themselves.
        matched_ids = set()
        for author in article['AuthorList']:
            if 'LastName' in author and 'ForeName' in author:
                full_name = " ".join([author["ForeName"], author["LastName"]])
                author_id = resolve_author_id(clean_author_name(full_name))
                if author_id is not None:
                    matched_ids.add(author_id)

        if len(matched_ids) < 2:
            continue

        pmid = citation["PMID"]
        # Sorting first means every pair comes out as (smaller, larger).
        for pair in combinations(sorted(matched_ids), 2):
            edge_publications[pair].add(pmid)

    edge_data = [
        (first_id, second_id, sorted(pmids, key=str))
        for (first_id, second_id), pmids in sorted(edge_publications.items(), key=lambda item: item[0])
    ]

    return pd.DataFrame(edge_data, columns=['Author ID 1', 'Author ID 2', 'Shared Publications'])

# Derive the co-authorship edge table from the fetched records, then print
# the author table followed by the edge table.
df_coauthor_edges = create_coauthor_edges(records, df)

for frame in (df, df_coauthor_edges):
    print(frame)

     Author ID          Author  Publications  \
0            1        Wei Wang            36   
1            2         Wei Liu            23   
2            3       Ying Wang            22   
3            4       Xin Zhang            22   
4            5        Yu Zhang            21   
..         ...             ...           ...   
995        996    Drew Pardoll             3   
996        997  Brian Gonzalez             3   
997        998      Andrew Wei             3   
998        999      Chunyan Li             3   
999       1000   Tingting Deng             3   

                                          Affiliations  
0    Department of Gastrointestinal, Bariatric and ...  
1    Department of Mammary Medicine, Affiliated Tum...  
2    Department of Cardiovascular Medicine, Mayo Cl...  
3    Department of Radiotherapy, Cancer Center, Sta...  
4    Institute of Spine and Spinal Cord, The First ...  
..                                                 ...  
995  Bloomberg~Kimmel In

In [None]:
def get_author_info(df):
    """Look up each author in ``df`` via ``scholarly`` and collect profile metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``'Author ID'``, ``'Author'`` and ``'Affiliations'`` columns.

    Returns
    -------
    list of dict
        One dict per author for whom ``scholarly.search_author`` yielded a
        result (only the first result is used). Keys: ``Author``,
        ``Author ID``, ``Publications``, ``Affiliations``, ``Citations``,
        ``Years of experience``, ``h-index``, ``i10-index``, ``Interests``,
        ``Average citations per year``.

    Notes
    -----
    Authors with no search result are omitted. Any exception raised while
    processing an author is printed and that author is skipped. Missing
    profile fields and unparseable publication years no longer raise; they
    fall back to defaults instead of discarding the whole author.
    """
    author_info = []
    for _, row in df.iterrows():
        author_id = row['Author ID']
        author_name = row['Author']
        try:
            author = next(scholarly.search_author(author_name), None)
            if author is None:
                continue
            author = scholarly.fill(author)

            publications = author.get('publications', [])

            # Keep only years that parse as integers; a missing or non-numeric
            # value must not cost us the rest of the profile.
            years = []
            for pub in publications:
                raw_year = pub.get('bib', {}).get('pub_year')
                try:
                    years.append(int(raw_year))
                except (TypeError, ValueError):
                    continue
            # Span between the earliest and latest dated publication.
            experience = max(years) - min(years) if years else 0

            citations_per_year = author.get('cites_per_year', {})
            num_years = len(citations_per_year)
            if num_years > 0:
                avg_citations_per_year = int(sum(citations_per_year.values()) / num_years)
            else:
                avg_citations_per_year = 0

            author_info.append({
                "Author": author_name,
                "Author ID": author_id,
                "Publications": len(publications),
                "Affiliations": row['Affiliations'],
                # Defaulted like the other optional metrics below.
                "Citations": author.get('citedby', 0),
                "Years of experience": experience,
                "h-index": author.get('hindex', 0),
                "i10-index": author.get('i10index', 0),
                "Interests": author.get('interests', []),
                "Average citations per year": avg_citations_per_year
            })
        except Exception as e:
            # Best-effort lookup: report the failure and move on to the next author.
            print(f"An error occurred while processing the author {author_name}: {e}")
    return author_info

# Enrich the author table with Google Scholar metrics. `df` is rebound to the
# enriched table, which only contains authors for whom a profile was found.
author_info = get_author_info(df)
# The columns are spelled out so that an empty result (no profile found for
# anyone) still yields a frame with the expected schema; otherwise later
# cells fail on the missing 'Affiliations' / 'Author ID' columns.
df = pd.DataFrame(author_info, columns=[
    "Author",
    "Author ID",
    "Publications",
    "Affiliations",
    "Citations",
    "Years of experience",
    "h-index",
    "i10-index",
    "Interests",
    "Average citations per year",
])
print(df)

In [None]:
def create_affiliation_edges(df):
    """Link every pair of authors in ``df`` that list a common affiliation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``'Author ID'`` and ``'Affiliations'`` columns, the latter
        holding affiliation strings joined with ``'; '``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Author ID 1'``, ``'Author ID 2'``, ``'Shared Affiliations'``.
        Pairs follow row order (earlier row first) and each shared list is
        sorted.

    Notes
    -----
    Blank affiliation entries are ignored. Previously an empty
    ``'Affiliations'`` string split into ``['']``, which linked every pair of
    authors that had no affiliation at all. Non-string values (for example
    NaN) are treated as "no affiliations".
    """
    # Split each author's affiliations once instead of once per pair.
    parsed_rows = []
    for author_id, raw_affiliations in zip(df['Author ID'], df['Affiliations']):
        if isinstance(raw_affiliations, str):
            names = {part for part in raw_affiliations.split('; ') if part.strip()}
        else:
            names = set()
        parsed_rows.append((author_id, names))

    # combinations() walks row positions, so each unordered pair is visited
    # exactly once regardless of what the frame's index labels look like.
    edge_data = []
    for (author_id1, names1), (author_id2, names2) in combinations(parsed_rows, 2):
        common_affiliations = names1 & names2
        if common_affiliations:
            edge_data.append((author_id1, author_id2, sorted(common_affiliations)))

    df_affiliation_edges = pd.DataFrame(edge_data, columns=['Author ID 1', 'Author ID 2', 'Shared Affiliations'])
    return df_affiliation_edges

# Build the shared-affiliation edge table from the current `df` and print it.
# `df_affiliation_edges` is kept at module level for the graph cell below.
df_affiliation_edges = create_affiliation_edges(df)
print(df_affiliation_edges)


In [None]:
import networkx as nx
import matplotlib.pyplot as plt

def _draw_author_graph(graph, title, seed=42):
    """Draw ``graph`` with a spring layout under ``title`` and return the layout.

    The layout is seeded so that re-running the cell reproduces the same figure.
    """
    layout = nx.spring_layout(graph, seed=seed)
    nx.draw_networkx_nodes(graph, layout, node_size=100)
    nx.draw_networkx_edges(graph, layout, edge_color='gray', alpha=0.5)
    nx.draw_networkx_labels(graph, layout, font_size=8)
    plt.title(title)
    plt.axis('off')
    plt.show()
    return layout


# Undirected graphs whose nodes are Author IDs; only authors that appear in at
# least one edge become nodes.
G_coauthorship = nx.Graph()
G_coauthorship.add_edges_from(
    zip(df_coauthor_edges['Author ID 1'], df_coauthor_edges['Author ID 2'])
)

G_affiliation = nx.Graph()
G_affiliation.add_edges_from(
    zip(df_affiliation_edges['Author ID 1'], df_affiliation_edges['Author ID 2'])
)

pos = _draw_author_graph(G_coauthorship, 'Co-authorship Graph')
pos = _draw_author_graph(G_affiliation, 'Affiliation Graph')

coauthorship_centrality = nx.degree_centrality(G_coauthorship)
affiliation_centrality = nx.degree_centrality(G_affiliation)

# Authors without any edge are absent from the graph, so .map() yields NaN for
# them; an author with no connections has degree centrality 0.
# NOTE(review): centrality is normalised by the number of nodes in each graph,
# which excludes edge-less authors - confirm that is the intended denominator.
df['Coauthorship Centrality'] = df['Author ID'].map(coauthorship_centrality).fillna(0.0)
df['Affiliation Centrality'] = df['Author ID'].map(affiliation_centrality).fillna(0.0)

print(df)

In [None]:
from google.colab import files

# Write the final author table to disk (without the index column) and hand
# the file to the browser through Colab's download helper.
csv_name = 'df.csv'
df.to_csv(csv_name, index=False)
files.download(csv_name)