In [None]:
import os
import nltk
import csv
import math

import matplotlib.pyplot as plt

import networkx as nx

from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from wordcloud import WordCloud

from bs4 import BeautifulSoup
from community import community_louvain
from fa2 import ForceAtlas2

In [None]:
# Create character tuple list: one (name, parentage, house, occupation) tuple per CSV row

characters = []

# newline="" is what the csv module expects, so it can handle line endings itself
with open("HP_characters.csv", "r", encoding="utf8", newline="") as sent_file:
    csv_file = csv.reader(sent_file, delimiter=",")
    for row in csv_file:
        # csv.reader yields an empty list for a blank line; skip it instead of failing on row[0]
        if not row:
            continue
        # spaces become underscores in the character name
        name = row[0].replace(' ', '_')
        # lower to make sure they are spelled the same way
        parentage = row[1].lower()
        house = row[2].lower()
        occupation = row[3].lower()
        characters.append((name, parentage, house, occupation))

# Graph with weights
Making a graph for each book. The nodes are the characters in the book, and edges connect characters that appear together in the same window of consecutive sentences within a chapter. Each edge has a weight corresponding to the number of windows in which those two characters appear together. Nodes have the attributes parentage, house and occupation. 

In [None]:
"""
Input: A list of character names with their attributes, 
       the path of the book,
       how many sentences to look at at a time
Output: A weighted graph
"""
def weighted_temporal_graphs(character_list, path, sentence_no):
    """
    Build a weighted co-occurrence graph of the characters in one book.

    Every file in the folder whose name contains "replaced" is read as one chapter.
    A character becomes a node (with the attributes parentage, house and occupation)
    the first time its name occurs in a chapter text. Each chapter is split on ". "
    and scanned in consecutive, non-overlapping windows of sentence_no sentences.
    Every pair of nodes whose names both occur in a window gets its edge weight
    increased by one.

    Input: A list of character tuples (name, parentage, house, occupation),
           the path of the folder holding the chapter files of the book,
           how many sentences to look at at a time (at least 1)
    Output: A weighted graph without isolated nodes
    Raises: ValueError if sentence_no is smaller than 1
    """
    if sentence_no < 1:
        # a window that does not advance would never reach the end of the chapter
        raise ValueError("sentence_no must be at least 1")

    G = nx.Graph()

    # Go through each chapter in the book; sorted so the node order does not depend
    # on the arbitrary order in which the file system lists the files
    for chapter in sorted(os.listdir(path)):
        # only look at the files where aliases have been replaced with character names
        if "replaced" not in chapter:
            continue

        # Get text
        # NOTE(review): read with the platform default encoding - confirm it matches the chapter files
        with open(os.path.join(path, chapter)) as f:
            text = f.read()

        # Put all characters from the chapter in the graph if they are not already there
        for character in character_list:
            if character[0] in text and character[0] not in G:
                G.add_node(character[0], parentage=character[1],
                           house=character[2], occupation=character[3])

        # Split the text in sentences
        sentences = text.split(". ")

        # Look at the specified amount of sentences at a time
        for start in range(0, len(sentences), sentence_no):
            current = " ".join(sentences[start:start + sentence_no])

            # One substring search per node instead of one per pair of nodes
            present = [name for name in G.nodes if name in current]

            # Visit every unordered pair exactly once so a shared text piece adds exactly 1
            # to the weight; looping over ordered pairs would count it twice on an
            # undirected graph
            for index, character_source in enumerate(present):
                for character_target in present[index + 1:]:
                    if G.has_edge(character_source, character_target):
                        G[character_source][character_target]['weight'] += 1
                    else:
                        G.add_edge(character_source, character_target, weight=1)

    # Remove nodes without edges
    G.remove_nodes_from(list(nx.isolates(G)))
    return G

In [None]:
# nx.compose reference:
# https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.operators.binary.compose.html#networkx.algorithms.operators.binary.compose

def combine_graphs(g1, g2):
    """
    Input: two graphs to combine
    Output: the combined graph, where an edge that exists in both graphs
            gets the sum of its two weights
    """
    merged = nx.compose(g1, g2)

    # only the edges present in both graphs need a summed weight
    summed_weights = {}
    for shared_edge in g1.edges & g2.edges:
        summed_weights[shared_edge] = g1.edges[shared_edge]['weight'] + g2.edges[shared_edge]['weight']

    nx.set_edge_attributes(merged, summed_weights, 'weight')
    return merged


In [None]:
""" 
Input: a graph
Output: A list of the summed weights for the edges for each node,
        this list is ordered as the list of nodes returned from graph.nodes
"""
def get_weight_sums(graph):
    """
    Input: a graph whose edges all carry a "weight" attribute
    Output: A list with, for each node, the summed weight of the edges touching it,
            ordered as the nodes returned from graph.nodes.
            A self-loop is counted once for its node.
    """
    # Nodes are matched by equality (dictionary lookup), not by object identity,
    # and every edge is visited a single time
    totals = {node: 0 for node in graph.nodes}
    for source, target in graph.edges:
        weight = graph[source][target]["weight"]
        totals[source] += weight
        if target != source:
            totals[target] += weight
    return [totals[node] for node in graph.nodes]

In [None]:
"""
Input: a graph,
       title for the plot, default is empty string
Output: plots the graph using forceAtlas

"""

def draw_network(graph, title=""):
    """
    Input: a graph whose edges carry a "weight" attribute,
           title for the plot, default is empty string
    Output: nothing is returned; the graph is plotted with a ForceAtlas2 layout,
            where the size of a node is the sum of the weights of its edges
    """
    # Adjusting figure size (changes the global matplotlib setting, so later figures use it too)
    plt.rcParams['figure.figsize'] = [10, 10]

    forceatlas2 = ForceAtlas2(
                            # Behavior alternatives
                            outboundAttractionDistribution=True,  # Dissuade hubs
                            linLogMode=False,  # NOT IMPLEMENTED
                            adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
                            edgeWeightInfluence=1.0,

                            # Performance
                            jitterTolerance=1.0,  # Tolerance
                            barnesHutOptimize=True,
                            barnesHutTheta=1.2, # original 1.2
                            multiThreaded=False,  # NOT IMPLEMENTED

                            # Tuning
                            scalingRatio=2.0,
                            strongGravityMode=True,
                            gravity=0.1, # original 0.5

                            # Log
                            verbose=True)

    positions = forceatlas2.forceatlas2_networkx_layout(graph, pos=None, iterations=2000)
    # node sizes follow the order of graph.nodes, which is the order get_weight_sums returns
    nx.draw_networkx_nodes(graph, positions, node_color="red", node_size=get_weight_sums(graph), alpha=0.4)
    nx.draw_networkx_edges(graph, positions, edge_color="green", alpha=0.05)
    plt.axis('off')
    plt.title(title)
    # caption placed just below the axes
    plt.figtext(.5, -0.05, "The size of a node indicates the sum of its weights.", ha="center")

    plt.show()

In [None]:
# Making graphs of book 1 where the number of sentences per window is changed
graph_list = [
    weighted_temporal_graphs(characters, "B1/", window_size)
    for window_size in (5, 10, 20, 30, 40, 50)
]



In [None]:
# Plotting the networks built with the different numbers of sentences;
# sentence_len[i] is the window size that graph_list[i] was built with
sentence_len = [5, 10, 20, 30, 40, 50]

for i in range(len(graph_list)):
    draw_network(graph_list[i], f"Book one network with interval of {sentence_len[i]} sentences")

In [None]:
# Making a network separately for each of the seven books (folders B1/ ... B7/),
# always with windows of 5 sentences
book_graphs = [
    weighted_temporal_graphs(characters, f"B{book_no}/", 5)
    for book_no in range(1, 8)
]

In [None]:
# Drawing the network of each book, numbered from 1
for book_no, graph in enumerate(book_graphs, start=1):
    draw_network(graph, f"Network of book {book_no}")

In [None]:
# Combining the networks of the books cumulatively:
# combined_nx[k] is the network of books 1 up to and including k+1
combined_nx = [book_graphs[0]]
for book_index in range(1, 7):
    combined_nx.append(combine_graphs(combined_nx[book_index - 1], book_graphs[book_index]))

In [None]:
# Drawing the cumulative networks; the first one only contains book 1
for i, graph in enumerate(combined_nx):
    title = "Network of book 1" if i == 0 else "Network of book 1-" + str(i+1)
    draw_network(graph, title)

In [None]:
# Print number of nodes and edges in each combined network

for i, graph in enumerate(combined_nx):
    # the first network only holds book 1, the others hold books 1 up to i+1
    books = "book 1" if i == 0 else "book 1-" + str(i + 1)
    print(f"There are {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges in the network of {books}")

In [None]:
# Plot showing how the number of nodes and edges increases throughout the book series
networks = [1, 2, 3, 4, 5, 6, 7]
no_nodes = [graph.number_of_nodes() for graph in combined_nx]
no_edges = [graph.number_of_edges() for graph in combined_nx]

# nodes on the left y-axis (red), edges on a second y-axis sharing the same x-axis (blue)
ax1 = plt.subplot()
l1, = ax1.plot(networks, no_nodes, color='red')
ax2 = ax1.twinx()
l2, = ax2.plot(networks, no_edges, color='blue')

ax1.set_xlabel('No. of books combined')
for axis, label, colour in ((ax1, "No. of nodes", 'red'), (ax2, "No. of edges", 'blue')):
    axis.tick_params(axis="y", labelcolor=colour)
    axis.set_ylabel(label)

plt.legend([l1, l2], ["No. of nodes", "No. of edges"])

plt.show()

In [None]:
# Plotting the number of edges belonging to ten characters throughout the series:
# Harry Potter, Ron Weasley, Hermione Granger, Albus Dumbledore, Severus Snape, Voldemort,
# Hagrid, Draco Malfoy, Ginny Weasley and Neville Longbottom
char = ["Harry_Potter", "Ronald_Weasley", "Hermione_Granger", "Albus_Dumbledore", "Severus_Snape", "Tom_Riddle", 
        "Rubeus_Hagrid", "Draco_Malfoy", "Ginevra_Weasley", "Neville_Longbottom"]

networks = [1, 2, 3, 4, 5, 6, 7]

# one list per character with its number of edges in each cumulative network
no_edges = [[len(list(graph.edges(c))) for graph in combined_nx] for c in char]

for name, e_list in zip(char, no_edges):
    plt.plot(networks, e_list, label=name.replace('_', ' '))
plt.xlabel("No. of books combined")
plt.ylabel("No. of edges")
plt.title("Evolution of the network of ten main characters")
plt.legend(fontsize=8)
plt.show()

## Communities

In [None]:
"""
Input: a graph to divide in communities
Output: A list with the different communities
"""

def communities(graph):
    """
    Input: a graph to divide in communities
    Output: A list with one list of nodes per community found by the Louvain
            best partition, with the largest community first
    """
    partition = community_louvain.best_partition(graph)

    # collect the nodes of every community id in one pass over the partition
    members_by_id = {}
    for node, com_id in partition.items():
        members_by_id.setdefault(com_id, []).append(node)

    groups = [members_by_id[com_id] for com_id in set(partition.values())]
    groups.sort(key=len, reverse=True)
    return groups
    

In [None]:
"""
Input: a community and the graph it is extracted from
Output: A dictionary of all the characters in the community,
        the parentages, houses and occupations belonging to the characters.
        Each character gets the sum of its edge weights within the community as its value.
        Each parentage, house and occupation is counted as the total no. of characters
        in the community belonging to that parentage, house or occupation.
"""

# Make the frequency dictionary for a word cloud: character names with their
# weights, plus a count for every attribute value shared by the characters

def wordcloud_dict(community, graph):
    """
    Input: a community (list of nodes) and the graph it is extracted from
    Output: A dictionary mapping each character name (underscores shown as spaces)
            to the sum of its edge weights inside the community, and each
            parentage, house and occupation to the number of characters in the
            community having it. The attribute value 'other' is left out.
    """
    subG = graph.subgraph(community)
    weight_by_node = dict(zip(list(subG.nodes), get_weight_sums(subG)))
    attribute_maps = [nx.get_node_attributes(subG, attribute)
                      for attribute in ('parentage', 'house', 'occupation')]

    cloud_freq = {}
    for character in community:
        # look up the attributes first, then store the character's own weight
        labels = [attribute_map[character] for attribute_map in attribute_maps]
        cloud_freq[character.replace('_', ' ')] = weight_by_node[character]

        for label in labels:
            if label == 'other':
                continue
            cloud_freq[label] = cloud_freq.get(label, 0) + 1

    return cloud_freq

In [None]:
"""
Input: a list of communities and the graph they're extracted from
Output: a list with a dictionary for each community
"""

def make_com_dicts(com_list, graph):
    """
    Input: a list of communities and the graph they're extracted from
    Output: a list with the word cloud dictionary of each community, in the same order
    """
    return [wordcloud_dict(com, graph) for com in com_list]
    

In [None]:
"""
Input: a list of dictionaries
Output: Wordclouds plotted for the community dictionaries given
"""
def draw_word_cloud(dicts):
    """
    Input: a list of dictionaries mapping a word to its frequency
    Output: nothing is returned; one word cloud per dictionary is plotted,
            two per row
    """
    # Set the size before the figure is created; changing it afterwards
    # does not resize a figure that already exists
    plt.rcParams['figure.figsize'] = [15, 20]
    fig = plt.figure()

    # keep the five rows used for up to ten clouds, add rows when there are more
    rows = max(5, (len(dicts) + 1) // 2)

    for i, frequencies in enumerate(dicts):
        ax = fig.add_subplot(rows, 2, i + 1)
        wordcloud = WordCloud(background_color='black', width=2200,
                              height=1800, collocations=False).generate_from_frequencies(frequencies)

        ax.imshow(wordcloud)
        ax.axis('off')
    plt.show()

In [None]:
# Communities of the network combining all seven books (last entry of combined_nx)
all_communities = communities(combined_nx[6])

In [None]:
# Number of communities found in the combined network
print(len(all_communities))

In [None]:
# Communities of each individual book, in book order
book_communities = [communities(graph) for graph in book_graphs]

In [None]:
# For every book: print its number, its communities and the size of each community
for book_no, com in enumerate(book_communities, start=1):
    print(f"Book no.: {book_no}")
    print(com)
    for part in com:
        print(len(part))

In [None]:
# Word clouds for the communities of all books combined (network of books 1-7)
dictionary = make_com_dicts(all_communities, combined_nx[6])
draw_word_cloud(dictionary)

In [None]:
# Word clouds for the communities of book 1
dictionary = make_com_dicts(book_communities[0], book_graphs[0])
draw_word_cloud(dictionary)

In [None]:
# Word clouds for the communities of book 2
dictionary = make_com_dicts(book_communities[1], book_graphs[1])
draw_word_cloud(dictionary)

In [None]:
# Word clouds for the communities of book 3
dictionary = make_com_dicts(book_communities[2], book_graphs[2])
draw_word_cloud(dictionary)

In [None]:
# Word clouds for the communities of book 4
dictionary = make_com_dicts(book_communities[3], book_graphs[3])
draw_word_cloud(dictionary)

In [None]:
# Word clouds for the communities of book 5
dictionary = make_com_dicts(book_communities[4], book_graphs[4])
draw_word_cloud(dictionary)

In [None]:
# Word clouds for the communities of book 6
dictionary = make_com_dicts(book_communities[5], book_graphs[5])
draw_word_cloud(dictionary)

In [None]:
# Word clouds for the communities of book 7
dictionary = make_com_dicts(book_communities[6], book_graphs[6])
draw_word_cloud(dictionary)

### Probably not going to be used. This part is only needed if we want to extract the text belonging to the characters in a community.

In [None]:
# Putting together the text from the pages belonging to each community
# NOTE(review): this cell used to read `partition_list`, which only exists as a local
# variable inside communities(). all_communities (the communities of the combined
# network) is assumed to be the intended list - confirm.
community_texts = []

# all_communities is sorted largest first, so the slice keeps at most the ten
# largest communities and also works when there are fewer than ten
for sublist in all_communities[:10]:
    com_txt = []
    for character in sublist:
        # the with-block closes the file again after reading
        with open("./characters/" + character + ".txt") as f:
            raw = f.read()
        tokens = nltk.wordpunct_tokenize(BeautifulSoup(raw, 'html.parser').get_text())
        # keep purely alphabetic tokens, lower-cased
        com_txt.extend(w.lower() for w in tokens if w.isalpha())
    community_texts.append(com_txt)

# A set gives fast membership tests; the separate name keeps the imported
# `stopwords` corpus reader from being overwritten
english_stopwords = set(nltk.corpus.stopwords.words('english'))

# the words of each community with the English stop words removed
community_strings = [
    [w for w in txt if w not in english_stopwords]
    for txt in community_texts
]

In [None]:
# The distinct words of each community, in the same order as community_strings
unique_terms = [list(set(community_words)) for community_words in community_strings]

In [None]:
# communities from fandom
# NOTE(review): this cell used to iterate `partition`, which only exists as a local
# variable inside communities(). The character -> community mapping is rebuilt from
# all_communities instead, so the community number written is the position of the
# community in that size-sorted list - confirm this is the intended labelling.
with open("communities_from_fandom.csv", "w") as f:
    for community, members in enumerate(all_communities):
        for character in members:
            f.write(character + "," + str(community) + "\n")

# TF-IDF

In [None]:
def idf(word, unique_list):
    """
    Input: a word and a list of term collections (one per document)
    Output: the smoothed inverse document frequency log(N / (1 + df)) + 1,
            where N is the number of collections and df the number of
            collections that contain the word
    """
    document_count = len(unique_list)
    containing = sum(1 for terms in unique_list if word in terms)
    return math.log(document_count / (1 + containing)) + 1

In [None]:
# Creating the TF-IDF list: for every community a list of (word, tf * idf) tuples
tfidf_list = []

for community_words in community_strings:
    fdist = FreqDist(community_words)
    total_terms = len(community_words)
    # tf is the share of the community's words that is this word
    tfidf_list.append([
        (word, (fdist[word] / total_terms) * idf(word, unique_terms))
        for word in fdist
    ])


In [None]:
# Word clouds of the TF-IDF scores, one per community; the plotting itself is
# the same as for the community dictionaries, so the shared helper is reused
draw_word_cloud([dict(tfidf) for tfidf in tfidf_list])