In [1]:
import pickle
from neo4j import GraphDatabase
from dotenv import dotenv_values
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from cdlib import algorithms, viz, evaluation

from hp_nlp_graph.neo4j import add_metrics_to_neo4j

Note: to be able to use all crisp methods, you need to install some additional packages:  {'infomap', 'bayanpy', 'wurlitzer', 'graph_tool'}
Note: to be able to use all crisp methods, you need to install some additional packages:  {'ASLPAw', 'pyclustering'}
Note: to be able to use all crisp methods, you need to install some additional packages:  {'infomap', 'wurlitzer'}


In [2]:
# Number of per-book folders (numbered from 1) to read under ./data/processed/.
NUMBER_OF_BOOKS = 7

# Maps book number -> whatever object was pickled for that book's chapters.
# NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
interactions = {}
for book_number in range(1, NUMBER_OF_BOOKS + 1):
    pickle_path = f"./data/processed/{book_number}/interactions_by_chapter.pkl"
    with open(pickle_path, "rb") as pickle_file:
        interactions[book_number] = pickle.load(pickle_file)

In [3]:
# Build one edge-list frame per (book, chapter), then aggregate across the series.
dfs = []
for book_number, interactions_by_chapter in interactions.items():
    for chapter, distances in interactions_by_chapter.items():
        # Keys of `distances` become the "characters" column, values the "weight".
        chapter_df = pd.DataFrame.from_dict(
            dict(distances), orient="index", columns=["weight"]
        ).reset_index(names=["characters"])
        # Each key is expanded into two columns, so keys are assumed to be
        # 2-item (source, target) pairs — TODO confirm against the producer.
        chapter_df[["source", "target"]] = pd.DataFrame(
            chapter_df.characters.tolist(), index=chapter_df.index
        )
        chapter_df["chapter"] = chapter
        # Tag rows with the book they came from (previously hard-coded to 1,
        # which mislabelled every row from books 2 onwards).
        chapter_df["book"] = book_number
        chapter_df = chapter_df[["source", "target", "weight", "chapter", "book"]]
        dfs.append(chapter_df)

# Series-wide edge list: total weight per ordered (source, target) pair,
# heaviest pairs first.
# NOTE(review): the grouping is on the ordered pair, so (A, B) and (B, A) stay
# separate rows if both orderings can occur upstream — confirm.
df = (
    pd.concat(dfs)
    .groupby(["source", "target"])
    .weight.sum()
    .sort_values(ascending=False)
    .reset_index()
)

In [4]:
# Persist the aggregated edge list (the positional index is not written).
df.to_csv(path_or_buf="./data/processed/series/interactions.csv", index=False)

In [5]:
# Build the undirected series graph from the aggregated edge list.
# nx.Graph keeps a single edge per unordered pair, and adding an existing edge
# again replaces its data, so a row (A, B) followed by a row (B, A) would keep
# only the later weight. Weights are accumulated instead; when every unordered
# pair occurs once this produces the same graph as nx.from_pandas_edgelist.
G = nx.Graph()
for source, target, weight in zip(df["source"], df["target"], df["weight"]):
    if G.has_edge(source, target):
        G[source][target]["weight"] += weight
    else:
        G.add_edge(source, target, weight=weight)

In [6]:
# Per-node metrics for the series graph; each call returns a node -> value mapping.
# NOTE(review): the same "weight" attribute is passed to betweenness_centrality,
# where networkx interprets edge weights as distances for shortest paths. If a
# larger weight means a stronger tie here, that measure is inverted — confirm.

# Plain and weight-summed degree.
degree = dict(G.degree())
weighted_degree = dict(G.degree(weight="weight"))

# Centralities that ignore the edge weights.
degree_centrality = nx.degree_centrality(G)
closeness_centrality = nx.closeness_centrality(G)

# Centralities that read the "weight" edge attribute.
eigen_centrality = nx.eigenvector_centrality(G, weight="weight")
betweenness_centrality = nx.betweenness_centrality(G, weight="weight")
pagerank = nx.pagerank(G, weight="weight")

# HITS returns a (hubs, authorities) pair.
hub_centrality, authority_centrality = nx.hits(G)

# Key order here fixes the column order of the resulting table.
metrics = {
    "eigen_centrality": eigen_centrality,
    "betweenness_centrality": betweenness_centrality,
    "degree_centrality": degree_centrality,
    "closeness_centrality": closeness_centrality,
    "pagerank": pagerank,
    "hub": hub_centrality,
    "authority": authority_centrality,
    "degree": degree,
    "weighted_degree": weighted_degree,
}

# One row per node, one column per metric; the index is labelled "name".
metrics_df = pd.DataFrame(metrics).rename_axis("name")
metrics_df

Unnamed: 0_level_0,eigen_centrality,betweenness_centrality,degree_centrality,closeness_centrality,pagerank,hub,authority,degree,weighted_degree
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Harry Potter,6.211080e-01,0.150211,0.704403,0.747075,0.200125,1.772332e-01,1.772332e-01,336,57819
Ronald Weasley,5.288815e-01,0.095374,0.503145,0.643052,0.088294,1.509292e-01,1.509292e-01,240,27028
Hermione Granger,4.923366e-01,0.048571,0.440252,0.617017,0.075943,1.404985e-01,1.404985e-01,210,23832
Albus Dumbledore,1.559530e-01,0.073078,0.379455,0.589948,0.033282,4.450573e-02,4.450573e-02,181,9434
Rubeus Hagrid,1.186841e-01,0.038142,0.264151,0.543650,0.019631,3.386915e-02,3.386915e-02,126,5751
...,...,...,...,...,...,...,...,...,...
Yvonne,4.242206e-07,0.000000,0.002096,0.315317,0.000318,1.210461e-07,1.210461e-07,1,1
Florence,1.283312e-05,0.000218,0.004193,0.376614,0.000320,3.661841e-06,3.661841e-06,2,2
Ragnuk,3.561001e-05,0.000979,0.004193,0.402828,0.000320,1.016120e-05,1.016120e-05,2,2
Alberic Grunnion,3.233921e-05,0.000953,0.008386,0.425311,0.000538,9.229169e-06,9.229169e-06,4,4


In [7]:
def _membership_frame(clustering, column):
    """Turn a cdlib clustering into a one-column frame indexed by node name.

    Args:
        clustering: object exposing ``to_node_community_map()`` (as returned by
            the ``cdlib.algorithms`` calls below).
        column: name of the single column holding the community label.

    Returns:
        DataFrame indexed by node (index name ``"name"``) with one column.
        Assumes every node maps to exactly one community id, since only one
        column is declared — TODO confirm for overlapping methods.
    """
    frame = pd.DataFrame.from_dict(
        dict(clustering.to_node_community_map()),
        orient="index",
        columns=[column],
    )
    frame.index.name = "name"
    return frame


# Run four community-detection algorithms on the same graph and keep both the
# raw clustering object and its per-node label frame.
# NOTE(review): no random seed is passed to any algorithm, so labels may differ
# between runs if the underlying implementations are randomized — confirm.

# Louvain and Leiden are given the "weight" edge attribute.
louvain_comms = algorithms.louvain(G, weight="weight")
louvain = _membership_frame(louvain_comms, "louvain")

leiden_comms = algorithms.leiden(G, weights="weight")
leiden = _membership_frame(leiden_comms, "leiden")

# Girvan-Newman and spectral are called without weights, with hard-coded
# level / kmax parameters.
girvan_newman_comms = algorithms.girvan_newman(G, level=5)
girvan_newman = _membership_frame(girvan_newman_comms, "girvan_newman")

spectral_comms = algorithms.spectral(G, kmax=8)
spectral = _membership_frame(spectral_comms, "spectral")

In [8]:
# Attach each node's community label from the four algorithms to its metrics.
# Label columns left over from an earlier run of this cell are dropped first,
# so re-running does not collide with columns that are already present; on a
# fresh kernel the drop is a no-op.
community_frames = [louvain, leiden, girvan_newman, spectral]
community_columns = [
    column for frame in community_frames for column in frame.columns
]
metrics_df = metrics_df.drop(columns=community_columns, errors="ignore").join(
    community_frames
)

In [9]:
# Persist metrics plus community labels; the node-name index is written as the
# first column. (Plain string: the f-prefix had no placeholders.)
metrics_df.to_csv("data/processed/series/metrics.csv")

In [10]:
# Copy every column of metrics_df onto the graph as a node attribute with the
# same name, keyed by node.
for col, node_values in metrics_df.items():
    nx.set_node_attributes(G, node_values.to_dict(), name=col)

In [11]:
# Export the attributed graph as GEXF. (Plain string: the f-prefix had no
# placeholders.)
nx.write_gexf(G, "data/processed/series/graph.gexf")

In [15]:
# Export the attributed graph as GML. The file was previously named
# "graph.adjlist" even though write_gml emits GML, not an adjacency list; the
# extension now matches the content. Update any reader that still expects the
# old file name.
nx.write_gml(G, "data/processed/series/graph.gml")

In [21]:
from networkx.readwrite import json_graph
import json

In [22]:
# Export the graph in networkx node-link form as JSON. (Plain string: the
# f-prefix had no placeholders.)
with open("data/processed/series/graph.json", "w") as f:
    json.dump(json_graph.node_link_data(G), f)