In [1]:
import pickle
from neo4j import GraphDatabase
from dotenv import dotenv_values
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from cdlib import algorithms, viz, evaluation
from networkx.readwrite import json_graph
import json

from hp_nlp_graph.neo4j import add_metrics_to_neo4j

Note: to be able to use all crisp methods, you need to install some additional packages:  {'bayanpy', 'infomap', 'wurlitzer', 'graph_tool'}
Note: to be able to use all crisp methods, you need to install some additional packages:  {'pyclustering', 'ASLPAw'}
Note: to be able to use all crisp methods, you need to install some additional packages:  {'infomap', 'wurlitzer'}


In [2]:
NUMBER_OF_BOOKS = 7
interactions = {}
for book_number in range(1, NUMBER_OF_BOOKS + 1):
    with open(f"./data/processed/{book_number}/interactions_by_chapter.pkl", "rb") as f:
        interactions[book_number] = pickle.load(f)

In [3]:
dfs = []
for book_number, interactions_by_chapter in interactions.items():
    for chapter, distances in interactions_by_chapter.items():
        df = pd.DataFrame.from_dict(
            dict(distances), orient="index", columns=["weight"]
        ).reset_index(names=["characters"])
        df[["source", "target"]] = pd.DataFrame(df.characters.tolist(), index=df.index)
        df["chapter"] = chapter
        df["book"] = 1
        df = df[["source", "target", "weight", "chapter", "book"]]
        dfs.append(df)
df = (
    pd.concat(dfs)
    .groupby(["source", "target"])
    .weight.sum()
    .sort_values(ascending=False)
    .reset_index()
)

In [4]:
df.to_csv("./data/processed/series/interactions.csv", index=False)

In [5]:
G = nx.from_pandas_edgelist(df, "source", "target", ["weight"], create_using=nx.Graph())

In [6]:
G.remove_nodes_from(["Harry Potter", "Ronald Weasley", "Hermione Granger"])

In [7]:
eigen_centrality = nx.eigenvector_centrality(G, weight="weight")
betweenness_centrality = nx.betweenness_centrality(G, weight="weight")
degree_centrality = nx.degree_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
pagerank = nx.pagerank(G, weight="weight")
hub_centrality, authority_centrality = nx.hits(G)
degree = dict(nx.degree(G))
weighted_degree = dict(nx.degree(G, weight="weight"))

metrics = {
    "eigen_centrality": eigen_centrality,
    "betweenness_centrality": betweenness_centrality,
    "degree_centrality": degree_centrality,
    "closeness_centrality": closeness_centrality,
    "pagerank": pagerank,
    "hub": hub_centrality,
    "authority": authority_centrality,
    "degree": degree,
    "weighted_degree": weighted_degree,
}
metrics_df = pd.DataFrame.from_dict(metrics)
metrics_df.index.name = "name"
metrics_df

Unnamed: 0_level_0,eigen_centrality,betweenness_centrality,degree_centrality,closeness_centrality,pagerank,hub,authority,degree,weighted_degree
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Albus Dumbledore,4.911296e-01,0.080056,0.375527,0.555708,0.058499,8.344666e-02,8.344666e-02,178,4824
Rubeus Hagrid,1.756018e-01,0.047762,0.259494,0.508697,0.025263,2.983804e-02,2.983804e-02,123,1918
Tom Riddle,3.352511e-01,0.043867,0.289030,0.521201,0.030719,5.696054e-02,5.696054e-02,137,2453
Severus Snape,3.476001e-01,0.031619,0.253165,0.506778,0.032643,5.905945e-02,5.905945e-02,120,2880
Sirius Black,2.266102e-01,0.014300,0.196203,0.480773,0.023314,3.850430e-02,3.850430e-02,93,2081
...,...,...,...,...,...,...,...,...,...
Joey Jenkins,3.914662e-81,0.000000,0.000000,0.000000,0.000329,-0.000000e+00,-1.179354e-18,0,0
Marsh,3.914662e-81,0.000000,0.000000,0.000000,0.000329,-0.000000e+00,-0.000000e+00,0,0
Graham Pritchard,3.020844e-12,0.000000,0.002110,0.175245,0.001048,5.131320e-13,5.131322e-13,1,1
Golgomath,1.146387e-04,0.000000,0.002110,0.328085,0.000341,1.947813e-05,1.947813e-05,1,1


In [9]:
louvain_comms = algorithms.louvain(G, weight="weight")
louvain = pd.DataFrame.from_dict(
    dict(louvain_comms.to_node_community_map()),
    orient="index",
    columns=["louvain"],
)
louvain.index.name = "name"

leiden_comms = algorithms.leiden(G, weights="weight")
leiden = pd.DataFrame.from_dict(
    dict(leiden_comms.to_node_community_map()),
    orient="index",
    columns=["leiden"],
)
leiden.index.name = "name"

girvan_newman_comms = algorithms.girvan_newman(G, level=5)
girvan_newman = pd.DataFrame.from_dict(
    dict(girvan_newman_comms.to_node_community_map()),
    orient="index",
    columns=["girvan_newman"],
)
girvan_newman.index.name = "name"

# spectral_comms = algorithms.spectral(G, kmax=8)
# spectral = pd.DataFrame.from_dict(
#     dict(spectral_comms.to_node_community_map()),
#     orient="index",
#     columns=["spectral"],
# )
# spectral.index.name = "name"

In [10]:
metrics_df = metrics_df.join(
    [
        louvain,
        leiden,
        girvan_newman,
        #   spectral
    ]
)

In [11]:
metrics_df.to_csv(f"data/processed/series/metrics_wo_trio.csv")

In [12]:
for col in metrics_df.columns:
    nx.set_node_attributes(G, metrics_df[col].to_dict(), col)

In [13]:
nx.write_gexf(G, f"data/processed/series/graph_wo_trio.gexf")

In [14]:
with open(f"data/processed/series/graph_wo_trio.json", "w") as f:
    json.dump(json_graph.node_link_data(G), f)

In [15]:
Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
G0 = G.subgraph(Gcc[0])

In [16]:
for comp in nx.connected_components(G0):
    print(comp)

{'Enid (disambiguation)', 'Ogden', 'Doris Purkiss', 'Lancelot (Healer)', 'Tenebrus', 'Libatius Borage', 'Uric the Oddball', 'Poppy Pomfrey', 'Percy Weasley', 'Fluffy', 'Ali Bashir', 'Magorian', 'Urg the Unclean', 'Fawkes', 'Vasily Dimitrov', 'Stubby Boardman', 'Ragnok', 'Arnold', 'Pomona Sprout', 'Enid Smeek', 'Stewart Ackerley', 'John Dawlish', 'Ciceron Harkiss', 'Neville Longbottom', 'Unicorn', 'Eileen Prince', 'Fabian Prewett', 'Eric Munch', 'Borgin', 'Golgomath', 'Urquhart', 'Gregory the Smarmy', 'Xenophilius Lovegood', 'Tofty', 'Natalie McDonald', 'Abraxas Malfoy', 'Barnabas Deverill', 'Leprechaun', 'Reginald Cattermole', 'Patricia Stimpson', 'Rabastan Lestrange', 'Dai Llewellyn', 'Godelot', 'Father Christmas', 'Miranda Goshawk', 'Snatchers', 'Wendelin the Weird', 'Jugson', 'Evan Rosier', 'Yvonne', 'Aurora Sinistra', 'Lev Zograf', 'Marvolo Gaunt', 'Basil', 'Ivor Dillonsby', 'Bathilda Bagshot', 'Charles Weasley', 'Percival Dumbledore', 'Dudley Dursley', 'Benjy Fenwick', 'Fridwulfa'

In [17]:
with open(f"data/processed/series/graph_wo_trio.json", "w") as f:
    json.dump(json_graph.node_link_data(G0), f)

In [18]:
with open(f"data/processed/series/graph_wo_trio.json", "w") as f:
    json.dump(nx.cytoscape_data(G0)  , f)