**NOTE: This notebook is written for the Google Colab platform. However it can also be run (possibly with minor modifications) as a standard Jupyter notebook.** 

The notebook is inspired by similar analyses from other authors (such as [Character Networks Visualization for Les Misérables](https://studentwork.prattsi.org/infovis/labs/character-networks-visualization-for-les-miserables/) and [Game of Thrones – Co-occurrence Network of Characters](https://studentwork.prattsi.org/infovis/labs/game-of-thrones-co-occurrence-network-of-characters/)). However, the code is original and – unlike the other works – the entire process, including the plotting, is done in Python.



In [None]:
#@title -- Installation of Packages -- { display-mode: "form" }
import sys
!{sys.executable} -m pip install umap-learn python-louvain textblob
!{sys.executable} -m pip install git+https://github.com/michalgregor/class_utils.git

In [None]:
#@title -- Import of Necessary Packages -- { display-mode: "form" }
from collections import defaultdict
from sklearn.preprocessing import minmax_scale, MinMaxScaler
import numpy as np
import pandas as pd
import itertools
import nltk
import re

import community # package python-louvain
import networkx as nx
from umap import UMAP
from textblob import TextBlob

from IPython.display import display
from matplotlib.colors import to_hex
import matplotlib.pyplot as plt
import ipywidgets as widgets

In [None]:
#@title -- Downloading Data -- { display-mode: "form" }
from class_utils.download import download_file_maybe_extract
download_file_maybe_extract("https://www.dropbox.com/s/424vr9du2f480d9/three_musketeers.txt?dl=1", directory="data")

# also create a directory for storing any outputs
import os
os.makedirs("output", exist_ok=True)

# We also need some data from the nltk package (sentence splitter, POS tagger,
# named-entity chunker and its word list).
# Newer NLTK releases (3.9+) load the renamed '*_tab' / '*_eng' resources
# instead of the original pickled ones, so we request both sets to keep the
# notebook working regardless of the installed NLTK version.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(nltk_resource)

In [None]:
#@title -- Auxiliary Functions -- { display-mode: "form" }
def refine_tags(tree, words, case_sensitive=True, infuse_text=False, tag=None):
    """Lazily re-tag the tokens of a (possibly nested) tagged sentence.

    Arguments:
        tree: an iterable whose items are either ``(word, tag)`` tuples or
            sub-trees exposing ``label()`` and iteration (``nltk.Tree``).
        words: either a set of words, all of which receive ``tag``, or a
            dict mapping each word to its new tag. A dict value of ``None``
            means "keep the token's original tag".
        case_sensitive: if False, words are matched case-insensitively.
        infuse_text: if True, the tag of every matched token becomes
            ``"<tag>:<word>"`` instead of just ``"<tag>"``.
        tag: the tag applied to all words when ``words`` is a set.

    Yields:
        The items of ``tree`` in order: unmatched tokens unchanged, matched
        tokens with their new tag, and sub-trees rebuilt as ``nltk.Tree``
        objects whose tokens were refined with the same settings.
    """
    # Unique sentinel: lets us tell "word not listed" apart from
    # "word listed with tag None".
    missing = object()

    if isinstance(words, (set, frozenset)):
        words = {it: tag for it in words}

    if not case_sensitive:
        words = {k.lower(): v for k, v in words.items()}

    for item in tree:
        if not isinstance(item, tuple):
            # A sub-tree: refine it recursively. The matching options have to
            # be forwarded, otherwise nested tokens would be matched
            # case-sensitively and without text infusion regardless of what
            # the caller asked for.
            children = refine_tags(item, words,
                                   case_sensitive=case_sensitive,
                                   infuse_text=infuse_text)
            yield nltk.Tree(item.label(), list(children))
            continue

        word, original_tag = item
        key = word if case_sensitive else word.lower()
        new_tag = words.get(key, missing)

        # not in dict: yield the original item
        if new_tag is missing:
            yield item
            continue

        # tag is None: do not change the original tag
        if new_tag is None:
            new_tag = original_tag

        if infuse_text:
            new_tag = "{}:{}".format(new_tag, word)

        yield (word, new_tag)

def refine_chunks(chunked_sents, wordset, grammar, case_sensitive=True,
                  infuse_text=False, tag=None):
    """Re-tag the given words in every sentence and chunk the result again.

    Each sentence from ``chunked_sents`` is passed through ``refine_tags``
    (with ``wordset``, ``case_sensitive``, ``infuse_text`` and ``tag``) and
    materialised as a list; the refined sentences are then handed to an
    ``nltk.RegexpParser`` built from ``grammar``.

    Returns:
        Whatever ``RegexpParser.parse_sents`` returns for the refined
        sentences.
    """
    retagged = (
        list(refine_tags(sentence, wordset,
                         tag=tag,
                         infuse_text=infuse_text,
                         case_sensitive=case_sensitive))
        for sentence in chunked_sents
    )
    chunk_parser = nltk.RegexpParser(grammar)
    return chunk_parser.parse_sents(retagged)

def mix_colors(colors):
    """Blend several colours by averaging them component-wise.

    ``colors`` is a sequence of equally long colour tuples; the result is a
    tuple holding the mean of each component.
    """
    stacked = np.asarray(colors)
    blended = stacked.mean(axis=0)
    return tuple(blended)

def compute_cooccurence(entity_occurences, context=15, multi_count=False):
    """Count how often each pair of entities occurs within the same context.

    Arguments:
        entity_occurences: dict mapping an entity to an iterable of the
            integer positions at which it occurs (in any order).
        context: two occurrences co-occur when their positions differ by at
            most this many units.
        multi_count: if True, every pair of positions within ``context`` of
            each other is counted. If False (the default), each occurrence
            takes part in at most one co-occurrence, i.e. occurrences are
            paired up one-to-one.

    Returns:
        A dict mapping ``(entity1, entity2)`` – one key per unordered pair,
        in the iteration order of ``entity_occurences`` – to the number of
        co-occurrences found (possibly 0).
    """
    # The sweep below relies on sorted positions; sort every sequence once
    # rather than once per pair.
    sorted_occurences = [(ent, sorted(occ))
                         for ent, occ in entity_occurences.items()]
    cooccurence = {}

    for (ent1, occ1), (ent2, occ2) in itertools.combinations(
        sorted_occurences, 2
    ):
        num_occ = 0
        jstart = 0

        for pos1 in occ1:
            for j in range(jstart, len(occ2)):
                pos2 = occ2[j]

                if pos1 < pos2 - context:
                    # occ2[j] will only get larger: go on with the next pos1
                    break

                if pos1 > pos2 + context:
                    # pos1 will only get larger, so this j can never match
                    # again and is skipped in future iterations
                    jstart = j + 1
                    continue

                # we have found a co-occurrence
                num_occ += 1

                if not multi_count:
                    # Consume both occurrences: skip this j from now on and
                    # move on to the next pos1, so that neither of them is
                    # counted more than once.
                    jstart = j + 1
                    break

        cooccurence[(ent1, ent2)] = num_occ

    return cooccurence
  
# Palette indexed by community number: six distinct colours followed by
# sixteen identical grey entries; channels are rescaled from 0-255 to 0-1.
_distinct_colors = [
    (0, 150, 117),
    (0, 196, 255),
    (115, 192, 0),
    (255, 85, 132),
    (204, 173, 170),
    (155, 116, 216),
]
_grey_fillers = [(150, 150, 150)] * 16

color_list = np.array(_distinct_colors + _grey_fillers) / 255.0

## Character Networks based on the Number of Co-occurrences

In this notebook we are going to perform an interesting analysis, which joins approaches from natural language processing on one hand and from graph theory and network analysis on the other hand. Our goal will be to create a network of characters from a given book, to perform analyses on it and to plot the results. The "closeness" of the characters in the network will be determined by their number of co-occurrences in the same context. Once we have formed the network, we will show how to compute various things, such as the centrality of the nodes, how to detect communities (groups of characters which are somehow interrelated), etc. This kind of analysis has a lot of practical applications these days, including social network analysis, for instance.

Our overall procedure will be as follows:

* To identify named entities in the text: in our case we will care about the book's characters.
* Determine their closeness by counting how many times they co-occur in the same context.
* Construct a network using this information.
* Analyze the network and visualize the results.
### Loading the Text

The first step, of course, will be to load the text of the book. Since we are only working with a single book, the volume of our textual data will not be too large and we can afford to read all the data into memory at once. Otherwise we would be forced to split it into smaller chunks and process it piece by piece.



In [None]:
# Read the whole book into memory as a single string.
with open('data/three_musketeers.txt', 'r', encoding="utf8") as book_file:
    sample = book_file.read()

### Preprocessing the Text

The next step is to preprocess the text. We will first apply some very basic standardization: e.g. by replacing various special kinds of quotation marks and other punctuation using some canonical kinds. We will then split the text into individual sentences. The sentences will be tokenized: split into smaller units such as words and phrases.



In [None]:
# Typographic single quotes (‘ and ’) are mapped to the plain apostrophe '.
trans_table = str.maketrans("‘’", "''")
sample = sample.translate(trans_table)

# Sentence segmentation of the normalised text.
sentences = nltk.sent_tokenize(sample)

# Show three sentences as an example.
for example_sentence in itertools.islice(sentences, 15, 18):
    print(example_sentence, "\n")

In [None]:
# Split every sentence into its tokens.
tokenized_sentences = list(map(nltk.word_tokenize, sentences))

# Show the tokens of three example sentences.
for example_tokens in itertools.islice(tokenized_sentences, 15, 18):
    print(example_tokens, "\n")

We will then tag all the tokens, which gives us additional information about their role in the sentence – e.g. whether they represent a noun, a verb, and so on.



In [None]:
# Part-of-speech tagging of all tokenised sentences.
tagged_sentences = nltk.pos_tag_sents(tokenized_sentences)

# Show the tagged tokens of three example sentences.
for example_tagged in itertools.islice(tagged_sentences, 15, 18):
    print(example_tagged, "\n")

We will then try to extract more information about the structure of the text using chunking.



In [None]:
# Named-entity chunking of three example sentences only.
example_tagged_sentences = tagged_sentences[15:18]
chunked_sentences_sample = nltk.ne_chunk_sents(example_tagged_sentences)

for example_tree in chunked_sentences_sample:
    print(example_tree, "\n")

### Entity Recognition

Having applied chunking, we are ready to do named entity recognition. Given that the `nltk` package uses one of the traditional methods, which are not known for their excellent accuracy or robustness, and also given the character of each particular text, it may be necessary to apply some additional manual corrections after entity recognition, or even to go back and preprocess the input data some more to get good results.

In any case, we will also need to keep track of the position where each entity occurred, so that we are later able to compute the number of co-occurrences.



In [None]:
# We apply named entity recognition.
chunked_sentences = nltk.ne_chunk_sents(tagged_sentences)

# If there is a title before the name of an entity, we will try to pull it in.
titles = {"professor", "madame", "madam", "mr", "mr.", "mrs", "mrs.",
          "miss", "uncle", "aunt", "lord"}
rechunked_sentences = refine_chunks(chunked_sentences, titles, tag="TIT",
                          grammar = r"""
                              PERSON: {<TIT><PERSON>}
                          """,
                          case_sensitive=False)

# We collect all the occurrences of the individual entities.
# We also keep track of the final form of our pre-processed text.
entity_occurences = defaultdict(list)
chunk_list = []
num_chunks = 0

for sent_tree in rechunked_sentences:    
    for node in sent_tree:
        # a regular tagged token
        if isinstance(node, tuple):
            chunk_list.append(node[0])
        
        # a sub-tree corresponding to an entity
        else:
            identifier = " ".join((leaf[0] for leaf in node.leaves())).strip()    
            entity_occurences[identifier].append(num_chunks)
            chunk_list.append(identifier)
        
        num_chunks += 1

### Merging Different Names of the Same Entity

Naturally, we cannot expect that the entity recognition process will be fully automatic. It is common for some book characters to be referenced by several different names (nicknames, endearments, etc.). It is usually not going to be possible to merge these different names of the same entity correctly without having a full understanding of the text (which the existing methods just don't have). We will therefore have to do at least part of this work by hand.

We will therefore display the recognized named entities, sorting them according to the number of occurrences. Where necessary, we will merge several different names into a single entity. This merging could be partly automated, e.g. if we had some other source of information, such as a web page containing a list of characters with all their different nicknames.



In [None]:
# List the recognised entities, most frequent first.
entities_by_frequency = sorted(entity_occurences.items(),
                               key=lambda it: -len(it[1]))

for entity_name, positions in entities_by_frequency:
    print("{}x\t{}".format(len(positions), entity_name))

We will do the manual merging using a dictionary of the following form:

```
entity_dict = {
    'entity_identifier': {'Entity's Name', 'Alternative Name 1', 'Alternative Name 2', ...},

    ...
}
```
Entities that occur very few times may be unimportant: we can drop them from the dictionary completely at this stage, if we so wish.



In [None]:
# Hand-curated merge table: each key is the identifier of an entity we keep,
# each value is the set of recognised names that are merged into it.
# Recognised names that appear in none of the sets are dropped later on.
# NOTE(review): "d'Artagnan" is only reachable through the alias "Gascon" and
# "Winter" is merged into "Milady" – presumably deliberate, given how the
# names get tokenised/recognised; verify against the entity list printed above.
entity_dict = {
    "Athos": {"Athos", "Monsieur Athos"},
    "Milady": {"Milady", "Winter", "MILADY"},
    "Porthos": {"Porthos", "Monsieur Porthos", "PORTHOS"},
    "Aramis": {"Aramis", "Monsieur Aramis", "ARAMIS"},
    "Felton": {"Felton"},
    "Bonacieux": {"Bonacieux", "Madame Bonacieux", "Monsieur Bonacieux"},
    "Treville": {"Treville"},
    "Planchet": {"Planchet"},
    "Buckingham": {"Buckingham"},
    "Grimaud": {"Grimaud"},
    "Bazin": {"Bazin"},
    "Mousqueton": {"Mousqueton"},
    "La Rochelle": {"La Rochelle"},
    "Richelieu": {"Richelieu", "Eminence", "Cardinal", "Monsieur Cardinal"},
    "d'Artagnan": {"Gascon"},
    "Rochefort": {"Rochefort"},
    "de Chevreuse": {"Madame de Chevreuse"},
    "Madame Coquenard": {"Madame Coquenard", "Coquenard"},
    "Monsieur Dessessart": {"Monsieur Dessessart"},
    "Louis XIII": {"Louis XIII", "Louis"},
    "Louis XIV": {"Louis XIV"},
}

We will now invert the dictionary (so that it maps all the alternative names to the same character ID).



In [None]:
# Invert entity_dict: every alternative name maps to the set of entity
# identifiers it belongs to (a name may be listed under several entities).
reverse_entity_dict = {}

for entity, names in entity_dict.items():
    for name in names:
        reverse_entity_dict.setdefault(name, set()).add(entity)

If necessary, we can test whether we have perhaps forgotten any important entities:



In [None]:
# Recognised names that are not covered by any entry of the merge table.
forgotten_entity_occurences = {
    name: positions
    for name, positions in entity_occurences.items()
    if name not in reverse_entity_dict
}

# Print them, most frequent first.
forgotten_by_frequency = sorted(forgotten_entity_occurences.items(),
                                key=lambda it: -len(it[1]))

for name, positions in forgotten_by_frequency:
    print("{}x\t{}".format(len(positions), name))

We transform the list of occurrences using the inverted entity dictionary.



In [None]:
# Collect the positions of every recognised name under the identifier(s) of
# the entity it was merged into.
translated_occurences = defaultdict(list)

for entity, occurences in entity_occurences.items():
    # names that are missing from the inverted dictionary are simply skipped
    for translated_entity in reverse_entity_dict.get(entity, ()):
        translated_occurences[translated_entity].extend(occurences)

We use a predefined function to find the co-occurrences:



In [None]:
# Count the co-occurrences of every pair of merged entities; the context size
# and the counting mode are left at compute_cooccurence's defaults.
cooccurence = compute_cooccurence(translated_occurences)

We store the results in CSV files: the nodes (the entities) and the edges (labelled by the numbers of co-occurrences) go into separate files.



In [None]:
# The csv module takes care of quoting, so entity names containing commas or
# quotes cannot corrupt the files.
import csv

# Node table: one row per entity with its total number of occurrences.
# The encoding is fixed to UTF-8 (instead of the platform default) so that
# the files can be read back reliably; newline="" is what the csv module
# expects, and lineterminator keeps the plain "\n" line endings.
with open("output/nodes.csv", "w", encoding="utf8", newline="") as nodes_file:
    nodes_writer = csv.writer(nodes_file, lineterminator="\n")
    nodes_writer.writerow(["Id", "Label", "Occurences"])

    for entity, occurences in translated_occurences.items():
        nodes_writer.writerow([entity, entity, len(occurences)])

# Edge table: one row per entity pair that co-occurs at least once.
with open("output/edges.csv", "w", encoding="utf8", newline="") as edges_file:
    edges_writer = csv.writer(edges_file, lineterminator="\n")
    edges_writer.writerow(["Source", "Target", "Type", "id", "weight"])

    # the id is the pair's index among all pairs, including skipped ones
    for i, ((ent1, ent2), num_cooccur) in enumerate(cooccurence.items()):
        if num_cooccur > 0:
            edges_writer.writerow([ent1, ent2, "Undirected", i, num_cooccur])

### Constructing, Analyzing and Visualizing the Graph

We will now construct the graph that has resulted from our textual analysis and apply graph analysis to it. We will do all these steps in Python. However, it would be just as easy to use the CSV files that we have created and build the graph using an external tool, such as [gephi](https://gephi.org/), the well-known software for graph visualization.

Let us now create the graph using the Python package `networkx` and add all the nodes found in `nodes.csv`. We are interested in the nodes' id, character name (to be used as label) and the total number of occurrences of the character in the text, so we add these as data to the node:



In [None]:
# Start from an empty undirected graph.
G = nx.Graph()

# One node per row of nodes.csv, carrying its label and occurrence count.
nodes = pd.read_csv("output/nodes.csv")
node_rows = zip(nodes["Id"], nodes["Label"], nodes["Occurences"])

for node, label, occurences in node_rows:
    G.add_node(node, label=label, occurences=occurences)

Next we read the `edges.csv` file and add all the edges to the graph. We weight the edges by the number of co-occurrences, as recorded in the CSV file:



In [None]:
# One weighted edge per row of edges.csv (weight = number of co-occurrences).
edges = pd.read_csv("output/edges.csv")
G.add_weighted_edges_from(
    zip(edges["Source"], edges["Target"], edges["weight"])
)

If the graph contains some isolated nodes (nodes that have no edges associated to them), we remove them to make the graph more readable. We also convert node labels to integers to make them easier to work with:



In [None]:
# Drop nodes without any edges (the list is materialised first, so the graph
# is not modified while it is being iterated).
isolated_nodes = list(nx.isolates(G))
G.remove_nodes_from(isolated_nodes)

# Relabel the remaining nodes with consecutive integers.
G = nx.convert_node_labels_to_integers(G)

#### Page-Rank to Determine Node "Importance"

When visualizing graphs, there are many ways to encode relevant information about the graph into the plot. One of these is to use the size of the nodes. In our case we will compute the centrality of each node using an indicator known as PageRank. Loosely speaking, this will give us a measure of how important the node is in the graph. Once we have the page-ranks, we will rescale them into a reasonable range and base the size of the nodes on the results. We will also use the same approach to scale the fonts of the nodes' labels.



In [None]:
# PageRank of every node, rescaled to node sizes between 80 and 1000.
page_rank = nx.pagerank(G)
sizes = minmax_scale([page_rank[node] for node in G.nodes()], (80, 1000))

# Scaler mapping values to font sizes between 12 and 20; note that it is
# fitted on the rescaled node sizes, not on the raw page-rank values.
font_scaler = MinMaxScaler((12, 20)).fit(np.reshape(sizes, (-1, 1)))

#### Width and Transparency of the Edges

To determine the width of the edges, we will merely rescale the edges' weights: i.e. the numbers of co-occurrences. The same values, but at a different scale will be used to determine the transparency of the edges. Making some of the edges less solid will have the effect of visually decluttering the plot.



In [None]:
# Edge weights (numbers of co-occurrences) rescaled to line widths 1..5.
raw_weights = [weight for _, _, weight in G.edges(data='weight')]
widths = minmax_scale(raw_weights, (1, 5))

# The same values rescaled once more to opacities 0.3..1.
edge_alpha = minmax_scale(widths, (0.3, 1))

#### Detection of Communities in the Graph

Next, we will use community detection from package `community` to partition nodes into communities. Roughly speaking, it is possible to think about this as a graph equivalent of clustering.



In [None]:
# parts maps every node to the index of its community.
parts = community.best_partition(G, resolution=1.0)
num_parts = np.max(list(parts.values()))

# communities[c] lists the nodes assigned to community c.
communities = [
    [node_id for node_id, community_id in parts.items() if community_id == c]
    for c in range(num_parts + 1)
]

The identified communities will be used to determine node and edge colours:



In [None]:
def cmap(c):
    """Return the palette colour of community number ``c``."""
    return color_list[c]

# Every node takes the colour of its community.
colors = np.asarray([to_hex(cmap(parts.get(node))) for node in G.nodes()])

# Every edge takes the average of the colours of its two end points.
edge_colors = []
for src, dest, _weight in G.edges(data='weight'):
    blended = mix_colors([cmap(parts.get(src)), cmap(parts.get(dest))])
    edge_colors.append(to_hex(blended, keep_alpha=True))

#### The 2-D Layout

Since we are laying out the graph in a 2-dimensional plot, we will naturally need to assign some positions to all the nodes. Ideally, this should be done so that closely related nodes are plotted near each other and the edges do not overlap too much. There are a number of methods for finding graph layouts. However, not all of them will yield good results when the graphs are complex and there is a large number of connections. To find our layout, we will therefore slightly misuse UMAP – a method primarily intended for dimensionality reduction – and we will compute our layout using it.

UMAP takes the distances between the nodes as its input. In our case, we will produce a distance matrix by inverting the weights of the edges (the numbers of co-occurrences). This will make UMAP group nodes that co-occur a lot together.

To get the nodes from the same community to stick together, we pick the initial positions of all the nodes so that the communities are mapped to equidistant points along the circumference of a circle (all nodes that belong to the same community are initially mapped to that same position).



In [None]:
# set up the distance matrix for UMAP: we first accumulate similarities
# (edge weights normalised by the largest weight) ...
num_nodes = G.number_of_nodes()
dist_mat = np.zeros([num_nodes, num_nodes])
max_weight = edges["weight"].max()

for n1, n2, w in G.edges(data="weight"):
    invdist = w / max_weight
    dist_mat[n1, n2] += invdist
    dist_mat[n2, n1] += invdist

# ... and then flip them into distances: strongly connected nodes end up
# close together, unconnected ones at the maximum distance.
dist_mat = dist_mat.max() - dist_mat
# The flip also turns the (zero) diagonal into the maximum distance; the
# distance of a node to itself has to be zero in a distance matrix.
np.fill_diagonal(dist_mat, 0.0)

# set up the initial node positions for UMAP: the communities are placed at
# equidistant points on a circle of radius 100
num_communities = len(communities)
community_angles = 2 * np.pi * np.arange(num_communities) / num_communities
community_centers = np.stack(
    [np.sin(community_angles) * 100, np.cos(community_angles) * 100], axis=1)

# all nodes of a community start at the centre of that community
init = np.zeros([num_nodes, 2])
for c, mem in enumerate(communities):
    for n in mem:
        init[n] = community_centers[c]

# run UMAP
umap = UMAP(
    metric='precomputed',
    init=init,
    min_dist=10,
    spread=50,
)

# networkx expects the positions as a dict: node -> coordinates
pos = umap.fit_transform(dist_mat)
pos = {ipos: p for ipos, p in enumerate(pos)}

#### Plotting the Graph



And finally, all that remains is to plot the graph itself.



In [None]:
plt.figure(figsize=[14, 10])

# nodes
node_collection = nx.draw_networkx_nodes(
     G,
     pos = pos,
     node_size = sizes,
     node_color = colors,
     linewidths = 1,
)

# edges
edge_collection = nx.draw_networkx_edges(
    G,
    pos,
    width = widths,
    edge_color = edge_colors,
)

# transparency of edges: keep the RGB part of every edge colour and replace
# its alpha channel
edge_alpha_colors = [tuple(col[:3]) + (al,) for al, col
                        in zip(edge_alpha, edge_collection.get_colors())]
edge_collection.set_color(edge_alpha_colors)

# node labels (spaces are replaced by line breaks to keep the labels narrow)
text_collection = nx.draw_networkx_labels(G, pos,
    labels = {node[0]: node[1].replace(" ", "\n")
                  for node in G.nodes(data="label")})

# font sizes: font_scaler was fitted on the rescaled node sizes, so it has to
# be fed the node's size (not its raw page-rank, which lies far outside the
# fitted range and would give all labels practically the same font size)
node_sizes = dict(zip(G.nodes(), sizes))

for node, textobj in text_collection.items():
    node_size = np.reshape([node_sizes[node]], (1, -1))
    # transform returns a 1x1 array; hand matplotlib a plain float
    textobj.set_fontsize(float(font_scaler.transform(node_size)[0, 0]))

# minor re-styling of the plot: black outlines around the nodes
node_collection.set_edgecolor("k")
plt.axis('off')
plt.tight_layout()

## Sentiment Context of the Characters

Text analysis methods also allow us to estimate the sentiment of a text (positive, negative, etc.). Simple approaches to sentiment analysis are implemented, for instance, by the `TextBlob` package. More accurate results could be obtained using, e.g., one of the methods based on deep learning, but the results obtained using this method should be sufficient for a rough analysis.

Let us therefore try to extract the contexts in which the names of the individual characters occur and determine their sentiment. We will then plot the prevailing sentiment that each character is associated with in the book.



In [None]:
# context radius (in chunks) around every occurrence of an entity
blob_radius = 10

names = []
sentiments = []
num_occurences = []

# we extract the contexts and average their polarities per entity
for ent, occurences in translated_occurences.items():
    sentiment = 0

    for occ in occurences:
        # Clamp the start of the window: a negative start index would be
        # interpreted relative to the end of chunk_list and silently produce
        # an empty (or unrelated) context for occurrences near the beginning.
        context_start = max(0, occ - blob_radius)
        context_chunks = chunk_list[context_start:occ+blob_radius]
        sentiment += TextBlob(" ".join(context_chunks)).polarity

    sentiment /= len(occurences)

    names.append(ent)
    sentiments.append(sentiment)
    num_occurences.append(len(occurences))

# we track the averaged sentiment but also the number of occurrences
data = np.asarray([sentiments, num_occurences]).transpose()
entity_sentiments = pd.DataFrame(data, columns=["sentiment", "occurences"], index=names)

In the resulting plot we will only show entities that occur a larger number of times. In addition, entities with fewer occurrences will be drawn in a more transparent colour to set them apart.



In [None]:
# Keep only the entities that occur more than 25 times.
is_frequent = entity_sentiments["occurences"] > 25
s = entity_sentiments[is_frequent]

# Bar opacity: log of the occurrence count rescaled to 0.3..1.0.
s_occ = minmax_scale(np.log(s["occurences"].values), (0.3, 1.0))
s_vals = s["sentiment"].values
s_index = s["sentiment"].index

# Symmetric normalisation around zero (not used by the plotting calls below).
abs_max = np.abs(s_vals).max()
norm = plt.Normalize(vmin=-abs_max, vmax=abs_max)

# Blue bars for non-negative sentiment, red bars for negative sentiment.
bar_positions = range(len(s_vals))
bar_colors = []
for val, occ in zip(s_vals, s_occ):
    if val >= 0:
        bar_colors.append((0, 0, 1, occ))
    else:
        bar_colors.append((1, 0, 0, occ))

plt.figure(figsize=(12, 6))
plt.bar(bar_positions, s_vals, color=bar_colors)

plt.xticks(bar_positions, s_index, rotation=90)
plt.ylabel("sentiment")

# Leave room for the rotated tick labels and draw the grid behind the bars.
plt.subplots_adjust(left=0.12, right=0.9, top=0.9, bottom=0.3)
plt.grid(linestyle='--')
plt.gca().set_axisbelow(True)