In [35]:
from bs4 import BeautifulSoup 
import requests 
import re
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter

In [36]:
folder_path = "Works/"
papers = pd.read_csv(folder_path + "IC2S2_papers.csv")

In [37]:
def pair_exists(df, author_a, author_b):
    mask = ((df["author_1"] == author_a) & (df["author_2"] == author_b)) | \
           ((df["author_1"] == author_b) & (df["author_2"] == author_a))
    match_index = df[mask].index
    return match_index, mask.any()  # Returns True if the pair exists

In [38]:
edge_dict = {}
for authors in papers["author_ids"].values:
    cleaned_string = authors.replace("[", "").replace("]", "").replace("'", "")
    authors = np.array(cleaned_string.split(", "))
    
    for i,author1 in enumerate(authors):
        for j in range(i+1,len(authors)):
            author2 =    authors[j]
            author_pair = tuple(sorted([author1,author2]))
            if author_pair in edge_dict:
                edge_dict[author_pair] += 1
            else:
                edge_dict[author_pair] = 1
df = pd.DataFrame([{"author_1": pair[0], "author_2": pair[1], "weight": weight} for pair, weight in edge_dict.items()])
df

In [39]:
#df.to_csv("Authors/author_edgelist.csv", index = False)

In [40]:
df = pd.read_csv("Authors/author_edgelist.csv")

In [41]:
G = nx.Graph()

edges = [(row['author_1'], row['author_2'], row['weight']) for index, row in df.iterrows()]

G.add_weighted_edges_from(edges)

In [42]:
nodes_N = G.nodes
author_N = len(nodes_N) #the number of authors
author_N

14158

In [43]:
weight_sum = int(sum(df["weight"].values))
weight_sum

80381

In [44]:
nx.is_connected(G)

False

The graph is disconnected. There are isolated groups of nodes with no path connecting them.

In [45]:
list_of_connected_comp = list(nx.connected_components(G))
print(len(list_of_connected_comp)) # number of connected components

223


In [46]:
list(nx.isolates(G)) 

[]

There are 223 connected components and no isolated nodes in the network. It was expected that there was no isolated nodes as the network was created form a edgelist.

In [47]:
density = weight_sum/((author_N*(author_N-1))/2)
density #very low density

0.00080206628383695

223 connected componnents suggests that G is poorly connected (low connectivity). This is underlined by the low density. This means that the data suggests that the CSS reasearchers are poorly connected and work in around 233 isolated groups.

In [50]:
# Get the degree of each node (as a list of values)
degrees = [degree for node, degree in G.degree()]

# Compute the required statistics
average_degree = np.mean(degrees)
median_degree = np.median(degrees)
counter = Counter(degrees)
mode_degree, mode_count = counter.most_common(1)[0]
minimum_degree = np.min(degrees)
maximum_degree = np.max(degrees)


In [51]:
# Calculate node strength (weighted degree) for each node
strengths = [strength for node, strength in G.degree(weight='weight')]

# Compute the required statistics
average_strength = np.mean(strengths)
median_strength = np.median(strengths)
counter_strength = Counter(strengths)
mode_strength, mode_count = counter_strength.most_common(1)[0]
minimum_strength = np.min(strengths)
maximum_strength = np.max(strengths)

In [53]:
# Print the computed degree statistics
print("Average degree:", average_degree)
print("Median degree:", median_degree)
print("Mode degree:", mode_degree)
print("Minimum degree:", minimum_degree)
print("Maximum degree:", maximum_degree)

print()

# Print the computed node strength statistics
print("Average strength:", average_strength)
print("Median strength:", median_strength)
print("Mode strength:", mode_strength)
print("Minimum strength:", minimum_strength)
print("Maximum strength:", maximum_strength)

Average degree: 6.523944059895466
Median degree: 5.0
Mode degree: 3
Minimum degree: 1
Maximum degree: 593

Average strength: 11.3548523802797
Median strength: 6.0
Mode strength: 3
Minimum strength: 1
Maximum strength: 1295


We can tell from the degree information that on average each author collaborates with around 6 to 7 other authors. We can tell from the median and mode and most authors work in small networks. We can also see that there is a large range of collaborations $[1..593]$, with some authors only having 1 collaboration while others have hundreds.

When looking at the strength of the connections we can see that the average is 11.35 which indicates that people prefer to repeat collaborations rather than to make new connections. We can also see this from the median and mode where we can see that most authors collaborate with small groups. We can also see from the maximum that some authors co-author many papers, suggesting that they have a large network connection.

In [49]:
weighted_degree_dict = dict(G.degree(weight="weight"))

most_connected_weighted_nodes = sorted(weighted_degree_dict.items(), key=lambda x: x[1], reverse=True)

url = 'https://api.openalex.org/authors/'

print("Top 5:")
for id_url, weighted_degree in most_connected_weighted_nodes[:5]:
    index = id_url.find("A")
    id = id_url[index:]
    response = requests.get(url + id).json()
    name = response["display_name"]

    print(f"Node {id_url} with weight: {weighted_degree}")
    print(name)
    
    field = response["topics"][0]["field"]["display_name"]
    subfield = response["topics"][0]["subfield"]["display_name"]
    print(f"Field: {field}, Subfield: {subfield}")
    print("")

Top 5:
Node https://openalex.org/A5100381753 with weight: 1295
Shuicheng Yan
Field: Computer Science, Subfield: Computer Vision and Pattern Recognition

Node https://openalex.org/A5007176508 with weight: 937
Alex Pentland
Field: Social Sciences, Subfield: Transportation

Node https://openalex.org/A5021346979 with weight: 845
Filippo Menczer
Field: Computer Science, Subfield: Information Systems

Node https://openalex.org/A5048877432 with weight: 641
Bruno Lepri
Field: Social Sciences, Subfield: Transportation

Node https://openalex.org/A5011228873 with weight: 637
Alessandro Flammini
Field: Physics and Astronomy, Subfield: Statistical and Nonlinear Physics



The last one is odd... 
Explain the results: