# Data Collection
## Imports

In [None]:
import csv
import urllib.request
import json
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from urllib.parse import quote
import os
import re
import nltk
from nltk.tokenize import word_tokenize, FreqDist
import math
import networkx as nx
from networkx.algorithms import community
from nltk.corpus import stopwords
from wordcloud import WordCloud
from bs4 import BeautifulSoup
from community import community_louvain
from fa2 import ForceAtlas2
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

## Choosing an online source
>We have decided to use the [fandom](https://harrypotter.fandom.com/wiki/Main_Page) wiki instead of the regular [Wikipedia](https://www.wikipedia.org/) to find the articles for our characters. The main reason is that the regular Wikipedia often covers multiple characters in one article, e.g. [Ministry of Magic](https://en.wikipedia.org/wiki/Ministry_of_Magic) contains 23 characters, whereas the fandom wiki has a dedicated article for each character.

## Creating a list of characters
>To create a list of characters we combined [Wikipedia's list of Harry Potter characters](https://en.wikipedia.org/wiki/List_of_Harry_Potter_characters) with the [half- and full-blood lists](https://harrypotter.fandom.com/wiki/Category:Individuals_by_parentage) from the fandom wiki and the characters from [Buzzfeed's Harry Potter Character Quiz](https://www.buzzfeed.com/sarahaspler/there-are-over-700-harry-potter-characters-and-i). The reason for combining these was to include as many characters as possible without having to go through all of the articles on the fandom wiki. However, we found that we did have to check all of the articles from the combined list manually, because some of the names in the list were not unique, were spelled incorrectly, did not match the article name, etc. Hence we decided on the following criteria for the final list:

- Characters are represented by the article name.
- Characters must be from the actual books.
- Characters must have an appearance in at least one book.


>To clarify, a character has an appearance in a book if they are represented in the book by some interaction with other characters. This is in contrast to characters who are only mentioned, which may just be a case of another character saying their name in conversation with some third character. These criteria also weed out characters that only appear in video games, the Fantastic Beasts franchise, etc. The reason for this initial sorting is that we want to use the books, so we remove a lot of noise by excluding characters that have no text related to them in the books.

In [None]:
# Moving the characters from our .csv file into a list of tuples:
# (name, parentage, house, occupation, loyalty)
characters = []

# newline="" is what the csv module documentation recommends for files handed to csv.reader
with open("HP_characters.csv", "r", encoding="utf8", newline="") as characters_file:
    # The reader must not be bound to the name `csv`: that would shadow the
    # imported module and make this cell fail the second time it is run.
    reader = csv.reader(characters_file, delimiter=",")
    for row in reader:
        # Article names use underscores where the .csv file has spaces
        name = row[0].replace(' ', '_')
        # Parentage, House, Occupation and loyalty will be our attributes in the nodes
        parentage = row[1]
        house = row[2]
        occupation = row[3]
        loyalty = row[4]
        characters.append((name, parentage, house, occupation, loyalty))

## Downloading the files
>We are using the API from the course to download the articles:

In [None]:
# (character tuple, length of the raw API response) for every downloaded article
lengths = []

#https://www.reddit.com/r/learnpython/comments/muwu7v/scraping_fandomwiki_pages/
baseurl = "https://harrypotter.fandom.com/api.php?"
action = "action=query"
# These query parts are the same for every character, so build them once
content = "prop=revisions&rvprop=content"
dataformat = "format=json"

for character in characters:
    # Set up the query for the character.
    # The title is percent-encoded so names containing characters that are
    # special in a URL (e.g. "&" or non-ASCII letters) still form a valid query.
    title = "titles=" + quote(character[0])
    query = "{}{}&{}&{}&{}".format(baseurl, action, content, title, dataformat)

    # Since we have checked the articles, we know that urlopen will succeed.
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(query) as wikiresponse:
        wikidata = wikiresponse.read()
    wikitext = wikidata.decode('utf-8')
    lengths.append((character, len(wikitext)))

    try:
        jsonobj = json.loads(wikitext)
    except ValueError:
        print(f'Decoding JSON has failed for {character}, moving on...')
        # Actually move on: without this the code below would reuse the
        # previous character's jsonobj (or fail with a NameError on the first one).
        continue
    # Get the number for the article
    pages = jsonobj['query']['pages']
    num = list(pages.keys())[0]
    # Get the wikitext
    wikitext = pages[num]['revisions'][0]['*']
    f_name = character[0] + '.txt'

    # Write to a file with that name.
    # NOTE(review): no explicit encoding is given, so the platform default is used;
    # the cell that reads these files back also relies on the default.
    with open("characters/" + f_name, 'w') as f:
        f.write(wikitext)

## Checking the files

In [None]:
# Sanity check of the downloads: rank the articles by length once
# and show the ten shortest and the ten longest
lengths_by_size = sorted(lengths, key=lambda entry: entry[1])
print(lengths_by_size[:10])
print(lengths_by_size[-10:])

In [None]:
# Line plot of the histogram over article lengths
# This was used in our video presentation
lengths_s = [length for _, length in lengths]

number_of_bins = 10
histogram = np.histogram(lengths_s, number_of_bins)

# np.histogram returns (counts, bin edges); the left edge of every bin is the x value
y_values = histogram[0]
x_values = histogram[1][:-1]

plt.plot(x_values, y_values)
plt.xlabel('Length')
plt.ylabel('Number of articles')
plt.title('Plot of length of articles')
plt.figtext(0.5, -0.05, "text", ha='center')
plt.show()

shortest = np.min(lengths_s)
print(f"Minimum length = {shortest}")

In [None]:
# The same article-length histogram, drawn with logarithmic x and y axes
# This was used in our video presentation
number_of_bins = 10
histogram = np.histogram(lengths_s, number_of_bins)

# Counts per bin against the left edge of each bin
y_values = histogram[0]
x_values = histogram[1][:-1]

plt.plot(x_values, y_values)
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Length')
plt.ylabel('Number of articles')
plt.title('Log-Log plot of length of articles')
plt.figtext(0.5, -0.05, "text", ha='center')
plt.show()

shortest = np.min(lengths_s)
print(f"Minimum length = {shortest}")

In [None]:
# Arithmetic mean of the article lengths
average_length = sum(lengths_s) / len(lengths_s)
print(f"Average length of an article: {average_length}")

# Alias extraction
> To do our analysis we have to find all the aliases that each character can have in the books. The fandom wiki pages have a section where these are listed, and we extract them using regular expressions. We then create a dictionary with the found aliases, go through the book texts and replace all the aliases with the full character names. 
Some of the characters have aliases that are not listed in the wiki, or aliases that collide with those of other characters, so as explained in the following code we had to make some manual adjustments to the dictionary. 

In [None]:
def find_aliases(text, alias_dict, name):
    """Extract the aliases from a fandom wiki page and store them in ``alias_dict``.

    The aliases are read from the ``|alias = ...`` field of the page, one alias
    per line. Every kept alias is cleaned, lower-cased and stored as a key with
    ``name`` as its value.

    Input: text       - the wiki text of one character page
           alias_dict - dictionary (alias -> character name), updated in place
           name       - the character name the aliases belong to
    Output: None, the dictionary is modified in place. An alias that is already
            a key is printed and left untouched, so the clash can be resolved
            manually.
    """
    # The alias field runs from "|alias = " up to the next "|" and can span
    # several lines; its first character must not be a newline.
    match = re.search(r'\|alias = ([^\n].*?)\|', text, re.DOTALL)
    if match is None:
        return

    # Lines containing one of these are not real aliases:
    # disguises, the slur "Mudblood", and deliberately wrong names.
    skip_markers = ("disguise", "Mudblood", "the name he told", "the name she told")

    # Each line of the field is one alias
    for alias in match.group(1).split("\n"):
        if alias == '':
            continue
        if any(marker in alias for marker in skip_markers):
            continue

        # Cleaning of the aliases: drop a trailing " (explanation)", templates,
        # list bullets, link brackets and quotation marks
        processed_alias = alias.split(" (")[0]
        if '{{' in processed_alias:
            processed_alias = processed_alias.split("{{")[0]
        processed_alias = processed_alias.replace('*', '')
        processed_alias = processed_alias.replace('[[', '').replace(']]', '')
        processed_alias = processed_alias.replace('"', '')

        key = processed_alias.lower()
        # An empty key must never be stored: str.replace('', name) would
        # insert the name between every character of a text.
        if key == '':
            continue

        # Compare on the lower-cased key, which is the form that is stored.
        # This print is to see which aliases might belong to several characters
        if key in alias_dict:
            print("Processed alias: " + processed_alias + ", belonging to: " + name)
        else:
            alias_dict[key] = name

In [None]:
alias_dictionary = {}
not_added = []
# Walk through every downloaded character page and collect its aliases
for file_ in os.listdir("./characters"):
    # Drop the last four characters (the ".txt" suffix) to get the article name
    name = file_[:-4]
    with open("./characters/" + file_, 'r') as f:
        text = f.read()
    first_name = name.partition("_")[0].lower()
    full_name = name.replace('_', ' ').lower()
    # Only the first character seen with a given first name gets it as a key;
    # later ones are remembered so the clashes can be reported below
    if first_name not in alias_dictionary:
        alias_dictionary[first_name] = name
    else:
        not_added.append((first_name, name))
    # The full name of Stebbins in the wiki fandom pages is Stebbins_(1990s_Hogwarts_student)
    # This full name won't be in the books
    if first_name != 'stebbins':
        alias_dictionary[full_name] = name
    find_aliases(text, alias_dictionary, name)

# For every first name that was already taken, show who holds it in the dict
# and which character was not added
for taken_first_name, skipped_name in not_added:
    print("Dict has: " + alias_dictionary[taken_first_name] + " instead of: " + skipped_name)

### Notes on aliases/first names that collide:
**The following characters only appear in the last chapter of B7:**
- James Potter II 
- Edward Lupin
- Rose Granger-Weasley
- Lily L. Potter

- Frank Bryce only appears in first chapter of B4

For the above characters we will change the dictionary when processing the corresponding chapters.

**The following characters share a first name and appear in different books:**
- Marcus Flint (appears in B1, B2, B3) and Marcus Belby (appears in B6), equally important
- Graham Montague (in B3, B5) more important than Graham Pritchard (in B4)
- Frank Longbottom (in B5) more important than Frank Bryce (in B4)
- Avery II (in B4, B5) more important than Avery I (in B6)

**The following characters share a first name and appear in the same books:**
- Graham Montague (in B3, B5) more important than Graham Montague's father (in B5) and mother (in B5)
- Hermione Granger (in all books) more important than Hermione Granger's Father(in B2, B5) and Hermione Granger's Mother (in B2, B5)
- Dennis Creevey (in B4-B6) more important than Dennis (in B5)
- Ernest Macmillan (in B2, B4-B7) more important than Ernest Prang (in B3, B6)
- Fat Friar (in B1, B2, B5) and Fat Lady (in all books) equally important
- Mary Cattermole (in B7) equally important as Mary Macdonald (in B7)
- Evans sister's father (in B7) equally important as Evans sister's mother (in B7)

For those who do not appear in the same books we will adjust the dictionary to fit the book being processed. 
For those who appear in the same books we will have to find another solution and prioritize the most important characters. 

"Mr. and Mrs. Dursley" often appears in this form. It needs to be replaced with Vernon_Dursley and Petunia_Dursley.
"The Potters" should be replaced with James_Potter_I and Lily_J._Potter.



In [None]:
# Manual corrections of the alias dictionary: point ambiguous first names at the
# character that is most important or appears in most books, and add aliases
# that are used in the books but are not listed on the wiki fandom pages
alias_dictionary.update({
    'james': 'James_Potter_I',
    'frank': 'Frank_Longbottom',
    'marcus': 'Marcus_Flint',
    'avery': 'Avery_II',
    'hermione': 'Hermione_Granger',
    'mr. dursley': 'Vernon_Dursley',
    'mrs. dursley': 'Petunia_Dursley',
    'mcgonagall': 'Minerva_McGonagall',
    'dumbledore': 'Albus_Dumbledore',
    'ernest': 'Ernest_Macmillan',
    'dennis': 'Dennis_Creevey',
    'mr mason': 'Mason',
    'mrs mason': "Mason's_wife",
    'pince': "Irma_Pince",
    'vulchanov': "Pyotr_Vulchanov",
    'volkov': "Ivan_Volkov",
    'ivanova': "Clara_Ivanova",
    'grubbly-plank': "Wilhelmina_Grubbly-Plank",
    'mrs. figg': "Arabella_Figg",
    'mrs figg': "Arabella_Figg",
    'hooch': "Rolanda_Hooch",
    'ollivander': "Garrick_Ollivander",
    'mr ollivander': "Garrick_Ollivander",
    'mr. ollivander': "Garrick_Ollivander",
    'quirrell': "Quirinus_Quirrell",
    'trelawney': "Sybill_Trelawney",
    ' tonks': " Nymphadora_Tonks",
    'belby': "Marcus_Belby",
    'montague': "Graham_Montague",
    'warrington': "Cassius_Warrington",
    'pritchard': "Graham_Pritchard",
    # NOTE(review): key is spelled "gregorovitch" but the value "Gregorowitch" -
    # confirm that the value matches the article name in the character list
    'gregorovitch': "Mykew_Gregorowitch",
    'bode': "Broderick_Bode",
    'carmichael': "Eddie_Carmichael",
    'mr granger': "Hermione_Granger's_father",
    'sinistra': "Aurora_Sinistra",
    'bole': "Lucian_Bole",
    'yaxley': "Corban_Yaxley",
    'greyback': "Fenrir_Greyback",
    'vector': "Septima_Vector",
})

# Drop first names that collide with other characters or ordinary words and for
# which we have no solution, i.e. rose also in 'the sun rose' and evans also in
# evans sister's father. A missing key is simply ignored.
for ambiguous_key in ('rose', 'graham', 'mary', 'evans'):
    alias_dictionary.pop(ambiguous_key, None)

In [None]:
# remove generic words used as keys (a missing key is ignored)
for generic_key in ('mr', 'mrs', 'fat', 'pig'):
    alias_dictionary.pop(generic_key, None)

# The wiki lists this alias with an explanation attached; keep only the name
voldemort_key = 'lord voldemort (self-proclaimed title and chosen name) '
if voldemort_key in alias_dictionary:
    alias_dictionary['lord voldemort'] = 'Tom_Riddle'
    del alias_dictionary[voldemort_key]

#if a key is short, put white space around it to make sure it won't be found as a substring in another word
# i.e al is a key but also appears in the word normal
# Iterate over a snapshot of the keys: entries are popped and inserted inside
# the loop, which is not allowed while iterating over the dictionary itself.
for alias in list(alias_dictionary):
    if len(alias) < 4:
        alias_dictionary[' ' + alias + ' '] = " " + alias_dictionary.pop(alias) + " "

# Some manual adjustments to problems we experienced when looking over the texts after replacements.
# "ron" followed by punctuation is not caught by the padded key ' ron '.
# Look the name up explicitly instead of relying on a variable left over from
# the loop above, and keep the punctuation so sentence boundaries survive.
ron_name = alias_dictionary.get(' ron ', ' Ronald_Weasley ').strip()
for punctuation in (',', "'", '.', '?', '!'):
    alias_dictionary[' ron' + punctuation] = " " + ron_name + punctuation

# These longer names also appear inside other words, so they get padded as well.
# A key that is missing (e.g. because this cell already ran) is skipped.
for embedded_name in ('ivan', 'bill', 'stan', 'michael', 'dora', 'bella'):
    if embedded_name in alias_dictionary:
        alias_dictionary[' ' + embedded_name + ' '] = " " + alias_dictionary.pop(embedded_name) + " "

In [None]:
#https://www.geeksforgeeks.org/python-program-to-sort-dictionary-by-key-lengths/

# Order the aliases from the longest key to the shortest. The replacement later
# walks the dictionary in this order, so a long name is replaced before any
# shorter alias contained in it can cause a wrong replacement.
test_dict_list = sorted(alias_dictionary.items(), key=lambda item: len(item[0]), reverse=True)

# Rebuild the dictionary in the sorted order
alias_dictionary = dict(test_dict_list)

# Show the result
print(alias_dictionary.items())

In [None]:
# Replace names in chapters so they are called our character names and not aliases

# Phrases that stand for several people (or a title plus surname). They are
# expanded, in this order, before the alias dictionary is applied.
_GROUP_REPLACEMENTS = (
    ('mr. and mrs. dursley', 'Vernon_Dursley and Petunia_Dursley'),
    ('the potters', 'James_Potter_I and Lily_J._Potter'),
    ('the dursleys', 'Vernon_Dursley and Petunia_Dursley'),
    ('mr weasley', 'Arthur_Weasley'),
    ('mrs weasley', 'Molly_Weasley'),
    ('mr and mrs mason', "Mason and Mason's_wife"),
    ('masons', "Mason and Mason's_wife"),
    ('mr and mrs montague', "Graham_Montague's_father and Graham_Montague's_mother"),
    ('weird sisters', "Myron_Wagtail and Donaghan_Tremlett"),
    ('the grangers', "Hermione_Granger's_father and Hermione_Granger's_mother"),
    ('mr and mrs granger', "Hermione_Granger's_father and Hermione_Granger's_mother"),
    ('the carrows', "Alecto_Carrow and Amycus_Carrow"),
    ('the lestranges', "Rabastan_Lestrange, Rodolphus_Lestrange and Bellatrix_Lestrange"),
)

# Some characters only appear in one chapter and their names collide with other
# characters, so these chapters get extra/changed aliases. Keyed on the chapter
# file name, which already contains the book number.
_CHAPTER_ALIAS_OVERRIDES = {
    'B7_Ch37.txt': {
        'james': 'James_Potter_II',
        'edward': 'Edward_Lupin',
        'rose': 'Rose_Granger-Weasley',
        'lily': 'Lily_L._Potter',
        'albus': 'Albus_Potter',
        'wife': 'Astoria_Malfoy',
    },
    'B1_Ch4.txt': {
        'mother and father': "Evans_sister's_father and Evans_sister's_mother",
    },
    'B5_Ch4.txt': {
        'portrait': "Walburga_Black",
    },
}


def replace_aliases(srcdir, dstdir):
    """Replace every alias in the chapters of one book with the character name.

    Each file in ``srcdir`` whose name does not contain 'replace' is read,
    flattened to one lower-cased line, run through the group replacements and
    the global ``alias_dictionary`` (in dictionary order), and written to
    ``dstdir`` as ``replaced_<chapter part of the file name>``.

    Chapter-specific aliases are applied to a per-chapter copy of the
    dictionary, so they can never leak into other chapters and the global
    ``alias_dictionary`` is left unchanged by this function.

    Input: srcdir - directory with the original chapter files (e.g. 'B7_Ch37.txt')
           dstdir - existing directory the replaced chapters are written to
    Output: None, the files are written to dstdir
    """
    for chap in os.listdir(srcdir):
        # do not look in files where aliases already have been replaced
        if 'replace' in chap:
            continue
        with open(srcdir + '/' + chap, 'r') as f:
            clean_text = f.read()

        # remove all tabs and new lines, lower all text
        clean_text = clean_text.replace('\t', ' ').replace('\n', ' ').replace('  ', ' ')
        clean_text = clean_text.lower()
        # replace a contraction of 2 names with both their names
        for phrase, names in _GROUP_REPLACEMENTS:
            clean_text = clean_text.replace(phrase, names)

        # Changed keys keep their position in the copy, new keys are tried last
        chapter_aliases = dict(alias_dictionary)
        chapter_aliases.update(_CHAPTER_ALIAS_OVERRIDES.get(chap, {}))
        for alias, name in chapter_aliases.items():
            clean_text = clean_text.replace(alias, name)

        # Write to a file named after the chapter part, e.g. 'replaced_ch37.txt'
        filename = chap.split('_')[1].lower()
        dst_filename = dstdir + '/replaced_' + filename

        with open(dst_filename, 'w') as f:
            f.write(clean_text)

In [None]:
# Loop over chapters, book by book, because some aliases have to point to a
# different character depending on the book

#Book 1
replace_aliases('./Chapters_withouth_replacement/B1', './B1')

#Book 2
replace_aliases('./Chapters_withouth_replacement/B2', './B2')

#Book 3
# The value is padded like the key, so the words around "pig" are not glued together
alias_dictionary[' pig '] = ' Pigwidgeon '
alias_dictionary['ernest'] = 'Ernest_Prang'
replace_aliases('./Chapters_withouth_replacement/B3', './B3')
alias_dictionary['ernest'] = 'Ernest_Macmillan'

#Book 4
alias_dictionary['frank'] = 'Frank_Bryce'
replace_aliases('./Chapters_withouth_replacement/B4', './B4')
alias_dictionary['frank'] = 'Frank_Longbottom'

#Book 5
alias_dictionary['mr and mrs potter'] = "Fleamont_Potter and Euphemia_Potter"
alias_dictionary['rose'] = "Rose_Zeller"
alias_dictionary['dennis'] = 'Dennis'
replace_aliases('./Chapters_withouth_replacement/B5', './B5')
# Remove the book 5 specific aliases again (a missing key is ignored)
alias_dictionary.pop('rose', None)
alias_dictionary.pop('mr and mrs potter', None)
# Restore the article name with an underscore, the form used everywhere else
alias_dictionary['dennis'] = 'Dennis_Creevey'

#Book 6
alias_dictionary['avery'] = 'Avery_I'
replace_aliases('./Chapters_withouth_replacement/B6', './B6')
# NOTE(review): 'avery' still points to Avery_I while book 7 is processed -
# confirm this is intended, the other per-book changes are all restored

#Book 7
replace_aliases('./Chapters_withouth_replacement/B7', './B7')

# Graph with weights
Making a graph for each book. The nodes are the characters in the book, and edges connect characters that are mentioned in the same text piece (a fixed number of consecutive sentences within a chapter). The weight of an edge reflects how often the two characters appear together in such a text piece. Nodes have the attributes parentage, house, occupation and loyalty. 

In [None]:
def weighted_temporal_graphs(character_list, path, sentence_no):
    """Create a graph with weighted edges for one book.

    Every chapter file in ``path`` whose name contains "replaced" is split into
    text pieces of ``sentence_no`` sentences. Two characters that are both
    mentioned in a text piece get an edge between them, and the weight of the
    edge is the number of text pieces in which the two appear together.

    Input: character_list - list of tuples
                            (name, parentage, house, occupation, loyalty)
           path           - the directory of the book
           sentence_no    - how many sentences to look at at a time
    Output: A weighted graph. Characters that never share a text piece with
            another character are printed and removed.
    """
    from itertools import combinations

    G = nx.Graph()

    # Go through each chapter in the book
    for chapter in os.listdir(path):
        # only look at the files where aliases have been replaced with character names
        if "replaced" not in chapter:
            continue

        # Get text
        with open(os.path.join(path, chapter)) as f:
            text = f.read()

        # Put all characters from the chapter in the graph if they are not already there
        for character in character_list:
            if character[0] in text and character[0] not in G:
                G.add_node(character[0], parentage=character[1],
                           house=character[2], occupation=character[3], loyalty=character[4])

        # Split the text in sentences and look at sentence_no of them at a time
        sentences = text.split(". ")
        for start in range(0, len(sentences), sentence_no):
            current = " ".join(sentences[start:start + sentence_no])

            # Find the characters mentioned in this text piece once, then
            # connect every unordered pair of them. Using unordered pairs makes
            # each co-occurrence count exactly once towards the weight.
            present = [node for node in G.nodes if node in current]
            for character_source, character_target in combinations(present, 2):
                if G.has_edge(character_source, character_target):
                    G[character_source][character_target]['weight'] += 1
                else:
                    G.add_edge(character_source, character_target, weight=1)

    # Remove nodes without edges
    isolated = list(nx.isolates(G))
    print(isolated)
    G.remove_nodes_from(isolated)
    return G

In [None]:
#https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.operators.binary.compose.html#networkx.algorithms.operators.binary.compose

def combine_graphs(g1, g2):
    """Merge two weighted graphs into one.

    Input: two graphs to combine
    Output: the composed graph; an edge that exists in both graphs gets the
            sum of its two weights
    """
    merged = nx.compose(g1, g2)

    # Only edges present in both inputs need their weight recomputed
    summed_weights = {}
    for shared_edge in g1.edges & g2.edges:
        summed_weights[shared_edge] = g1.edges[shared_edge]['weight'] + g2.edges[shared_edge]['weight']
    nx.set_edge_attributes(merged, summed_weights, 'weight')

    return merged


In [None]:
def get_weight_sums(graph):
    """Get the sum of the edge weights for each node in a graph.

    Input: a graph whose edges all have a "weight" attribute
    Output: A list of the summed weights of the edges of each node,
            this list is ordered as the list of nodes returned from graph.nodes
    """
    # One pass over the edges instead of scanning all edges for every node
    totals = {node: 0 for node in graph.nodes}
    for source, target in graph.edges:
        weight = graph[source][target]["weight"]
        totals[source] += weight
        # A self-loop is counted once for its node, not twice
        if target != source:
            totals[target] += weight
    return [totals[node] for node in graph.nodes]

In [None]:
# https://stackoverflow.com/questions/5294955/how-to-scale-down-a-range-of-numbers-with-a-known-min-and-max-value
def scaled_weights(a, b, G):
    """Scale the weight sums of the nodes linearly into the range [a, b].

    This is used when drawing the networks so we won't get too small or too big nodes.

    Input: a = minimum value for scaled weights
           b = maximum value for scaled weights
           G = graph
    Output: A list with the scaled weights, ordered like the nodes of G.
            An empty graph gives an empty list; if all nodes have the same
            weight sum every node gets the value a.
    """
    weights = get_weight_sums(G)
    if not weights:
        return []

    max_weight = max(weights)
    min_weight = min(weights)
    weight_range = max_weight - min_weight
    # All weight sums equal: there is nothing to scale (and nothing to divide by)
    if weight_range == 0:
        return [a for _ in weights]

    # Floor division is kept from the original scaling
    return [((b - a) * (w - min_weight) // weight_range) + a for w in weights]

In [None]:
# https://stackoverflow.com/questions/13070461/get-indices-of-the-top-n-values-of-a-list

def get_nodes_extreme(G, n, f):
    """Get the n nodes with the highest or the lowest weight sums.

    Input: G = graph
           n = number of nodes to pick
           f = flag, True = top, False = bottom
    Output: A tuple of three lists:
            the names of the n nodes,
            their scaled node sizes,
            their indices in the node list of G
    """
    # Indices of the nodes ordered by ascending weight sum
    ranking = np.argsort(get_weight_sums(G))
    n_indices = ranking[-n:] if f else ranking[0:n]

    sc_weights = scaled_weights(50, 800, G)
    node_names = list(G.nodes())

    names = [node_names[i] for i in n_indices]
    weights = [sc_weights[i] for i in n_indices]
    indices = list(n_indices)
    return (names, weights, indices)

In [None]:
def draw_network(graph, title=""):
    """Draw a network using a ForceAtlas2 layout.

    The ten nodes with the highest weight sums, the ten with the lowest and
    the remaining nodes are drawn in three different colours. The size of a
    node is its scaled weight sum.

    Input: a graph,
           title for the plot, default is empty string
    Output: None, the graph is plotted
    """
    # Adjusting figure size
    plt.rcParams['figure.figsize'] = [10, 10]

    forceatlas2 = ForceAtlas2(
                            # Behavior alternatives
                            outboundAttractionDistribution=True,  # Dissuade hubs
                            linLogMode=False,  # NOT IMPLEMENTED
                            adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
                            edgeWeightInfluence=1.0,

                            # Performance
                            jitterTolerance=1.0,  # Tolerance
                            barnesHutOptimize=True,
                            barnesHutTheta=1.2, # original 1.2
                            multiThreaded=False,  # NOT IMPLEMENTED

                            # Tuning
                            scalingRatio=2.0,
                            strongGravityMode=True,
                            gravity=0.1, # original 0.5

                            # Log
                            verbose=True)

    positions = forceatlas2.forceatlas2_networkx_layout(graph, pos=None, iterations=2000)
    nx.draw_networkx_edges(graph, positions, edge_color="black", alpha=0.1)

    # Setting the size of the nodes
    # Making 3 lists: top n max_weights, bottom n min_weights, rest
    max_nodes, max_sizes, max_indices = get_nodes_extreme(graph, 10, True)
    min_nodes, min_sizes, min_indices = get_nodes_extreme(graph, 10, False)

    # Every node that is neither in the top nor in the bottom group keeps its
    # own scaled size; names and sizes are collected together so they stay aligned
    indices_to_remove = set(max_indices) | set(min_indices)
    G_scaled_weights = scaled_weights(50, 800, graph)
    rest_nodes = []
    rest_sizes = []
    for i, node in enumerate(graph.nodes()):
        if i not in indices_to_remove:
            rest_nodes.append(node)
            rest_sizes.append(G_scaled_weights[i])

    # Small graphs may have no "rest" group at all
    if rest_nodes:
        nx.draw_networkx_nodes(graph, positions, nodelist=rest_nodes, node_color='#efbc2f', node_size=rest_sizes, edgecolors='black', alpha=1)
    nx.draw_networkx_nodes(graph, positions, nodelist=min_nodes, node_color='#366447', node_size=min_sizes, edgecolors='black', alpha=1)
    nx.draw_networkx_nodes(graph, positions, nodelist=max_nodes, node_color='#a6332e', node_size=max_sizes, edgecolors='black', alpha=1)

    plt.axis('off')
    plt.title(title)
    plt.figtext(.5, -0.05, "The size of a node indicates the scaled sum of its weights.", ha="center")
    # Used to save the fig for the paper
    #plt.savefig('networkCombined.png', format='png', transparent=True)
    plt.show()

In [None]:
# Build the network of book 1 once per text-piece size (number of sentences)
graph_list = [
    weighted_temporal_graphs(characters, "B1/", sentences_per_piece)
    for sentences_per_piece in (5, 10, 20, 30, 40, 50)
]

In [None]:
# Draw the book 1 networks for the different text-piece sizes
# to judge which number of sentences gives a useful network
sentence_len = [5, 10, 20, 30, 40, 50]

for i, graph in enumerate(graph_list):
    draw_network(graph, f"Book one network with interval of {sentence_len[i]} sentences")

>By looking at the different networks we can see that the network quickly becomes very dense with bigger text pieces, so we choose to use 5 sentences for the further graph creation. 

In [None]:
# One network per book (B1/ ... B7/), each built from text pieces of 5 sentences
book_graphs = [
    weighted_temporal_graphs(characters, f"B{book_number}/", 5)
    for book_number in range(1, 8)
]

In [None]:
# Figure of the book 1 network, saved for the paper
draw_network(book_graphs[0], title="Network of book 1")

In [None]:
# Draw the network of every book, numbered from 1
for book_number, graph in enumerate(book_graphs, start=1):
    draw_network(graph, f"Network of book {book_number}")

In [None]:
# Cumulative networks: entry k is the combination of books 1 .. k+1
combined_nx = [book_graphs[0]]
for next_book in book_graphs[1:7]:
    combined_nx.append(combine_graphs(combined_nx[-1], next_book))

In [None]:
# Figure of the network of all seven books combined, for the paper
draw_network(combined_nx[6], title="Combined network of all books")

In [None]:
# Draw every cumulative network; the first one only contains book 1
for i, graph in enumerate(combined_nx):
    title = "Network of book 1" if i == 0 else "Network of book 1-" + str(i + 1)
    draw_network(graph, title)

In [None]:
# Finding the character with the biggest percentage increase in its number of edges,
# measured from the first book in which it has edges to the combined network of all books
edge_increase = []
startG = book_graphs[0]
endG = combined_nx[6]
for char in characters:
    # Number of edges in the first of the seven books where the character has any
    edgeStart = 0
    for book_graph in book_graphs[:7]:
        edgeStart = len(book_graph.edges(char[0]))
        if edgeStart != 0:
            break
    # Characters without edges in any book are printed and skipped
    if edgeStart == 0:
        print(char[0])
        continue
    edgeEnd = len(endG.edges(char[0]))
    increase = edgeEnd - edgeStart
    percentage = increase * 100 / edgeStart
    edge_increase.append((char[0], edgeStart, edgeEnd, percentage))

print(max(edge_increase, key=lambda item: item[3]))

In [None]:
# Info for the paper: number of edges of the three main characters
# in book 1 and in the combined network of all books
for main_character in ('Harry_Potter', 'Ronald_Weasley', 'Hermione_Granger'):
    print(len(book_graphs[0].edges(main_character)))
    print(len(combined_nx[6].edges(main_character)))

In [None]:
# Plot how the number of edges of ten main characters grows as the books are combined
char = ["Harry_Potter", "Ronald_Weasley", "Hermione_Granger", "Albus_Dumbledore", "Severus_Snape", "Tom_Riddle",
        "Rubeus_Hagrid", "Draco_Malfoy", "Ginevra_Weasley", "Neville_Longbottom"]
colors = ['#a6332e', '#efbc2f', '#3c4e91', '#366447', '#aaaaaa', '#946b2d', 'orchid', '#d3a625', 'orangered', 'green']
font = 15
networks = [1, 2, 3, 4, 5, 6, 7]
plt.rcParams["figure.figsize"] = (10,11)

# One list per character: its number of edges in each cumulative network
no_edges = [[len(list(graph.edges(c))) for graph in combined_nx] for c in char]

# One line per character, labelled with the name without underscores
for character_name, e_list, line_color in zip(char, no_edges, colors):
    plt.plot(networks, e_list, label=character_name.replace('_', ' '), color=line_color)
plt.title("Evolution of the network of ten main characters")
plt.xlabel("No. of books combined", fontsize=font)
plt.ylabel("No. of edges", fontsize=font)
plt.xticks(fontsize=font)
plt.yticks(fontsize=font)
plt.legend(fontsize=font)
plt.savefig('plotMainCharacters.png', format='png', transparent=True)
plt.show()

In [None]:
# Growth of node and edge counts as books are combined, drawn in one plot with
# a separate y-axis for each series.
# Not used in paper as it was difficult to read
networks = [1, 2, 3, 4, 5, 6, 7]
no_nodes = [graph.number_of_nodes() for graph in combined_nx]
no_edges = [graph.number_of_edges() for graph in combined_nx]

# Left axis (red) carries the node counts
ax1 = plt.subplot()
l1 = ax1.plot(networks, no_nodes, color='red')[0]
ax1.set_xlabel('No. of books combined')
ax1.set_ylabel("No. of nodes")
ax1.tick_params(axis="y", labelcolor='red')

# Right axis (blue) shares the x-axis and carries the edge counts
ax2 = ax1.twinx()
l2 = ax2.plot(networks, no_edges, color='blue')[0]
ax2.set_ylabel("No. of edges")
ax2.tick_params(axis="y", labelcolor='blue')

plt.legend([l1, l2], ["No. of nodes", "No. of edges"])

plt.show()

In [None]:
# Plot showing how many nodes there are in each book (bars) and in the
# cumulative networks of the combined books (line)
font = 20
networks = [1, 2, 3, 4, 5, 6, 7]
no_combinedNodes = [graph.number_of_nodes() for graph in combined_nx]
no_bookNodes = [graph.number_of_nodes() for graph in book_graphs]

plt.plot(networks, no_combinedNodes, color = '#efbc2f', alpha = 1, label = 'Combined books')
plt.bar(networks, no_bookNodes, color = '#3c4e91', edgecolor = 'black', alpha = 0.8)
plt.xlabel('Book no.', fontsize=font)
plt.ylabel('Number of nodes', fontsize=font)
plt.legend(fontsize = font)
plt.xticks(fontsize=font)
plt.yticks(fontsize=font)
plt.savefig('plotNodes.png', format='png', transparent=True)
plt.show()

In [None]:
# Plot showing how many edges there are in each book (bars) and in the
# cumulative networks of the combined books (line)
font = 15
networks = [1, 2, 3, 4, 5, 6, 7]
no_combinedEdges = [graph.number_of_edges() for graph in combined_nx]
no_bookEdges = [graph.number_of_edges() for graph in book_graphs]

plt.plot(networks, no_combinedEdges, color = '#366447', alpha = 1, label = 'Combined books')
plt.bar(networks, no_bookEdges, color = '#a6332e', edgecolor = 'black', alpha = 0.8)
plt.xlabel('Book no.', fontsize=font)
# This figure shows edge counts, so the axis is labelled accordingly
plt.ylabel('Number of edges', fontsize=font)
plt.legend(fontsize = font)
plt.xticks(fontsize=font)
plt.yticks(fontsize=font)
plt.savefig('plotEdges.png', format='png', transparent=True)
plt.show()

# Communities

In [None]:
"""
Function to divide the graph in communities using the Louvain algorithm. 
Input: a graph to divide in communities
Output: A list with the different communities
"""

def communities(graph):
    """Split a graph into communities with the Louvain algorithm.

    Input: the graph to partition.
    Output: a list of communities, each a list of node names, ordered from
            the largest community to the smallest.
    """
    partition = community_louvain.best_partition(graph)

    # Group the nodes by community id in a single pass over the partition
    members_by_community = {}
    for node, community_id in partition.items():
        members_by_community.setdefault(community_id, []).append(node)

    # Start from ascending community id so that equally sized communities keep
    # a deterministic relative order under the stable sort below
    partition_list = [members_by_community[cid] for cid in sorted(members_by_community)]
    partition_list.sort(key=len, reverse=True)
    return partition_list

In [None]:
# Builds the frequency dictionary behind a word cloud of character names and
# their attributes for one community.
# This was an experiment: the result did not turn out as we wanted, so it is
# not used in the paper.

def wordcloud_dict(community, graph):
    """Build word-cloud frequencies for one community.

    Input: a community (list of character names) and the graph it was
           extracted from.
    Output: a dictionary holding
            - every character (underscores replaced by spaces) mapped to its
              entry in get_weight_sums for the community subgraph, and
            - every parentage, house and occupation other than 'other' mapped
              to the number of characters in the community that have it.
    """
    subG = graph.subgraph(community)
    node_order = list(subG.nodes)
    # presumably one weight per node, in the same order as subG.nodes — the
    # lookup below indexes it by node position
    weights = get_weight_sums(subG)
    attribute_maps = [
        nx.get_node_attributes(subG, 'parentage'),
        nx.get_node_attributes(subG, 'house'),
        nx.get_node_attributes(subG, 'occupation'),
    ]

    cloud_freq = {}
    for character in community:
        attribute_values = [attributes[character] for attributes in attribute_maps]

        cloud_freq[character.replace('_', ' ')] = weights[node_order.index(character)]

        # Count each attribute value once per character; 'other' is ignored
        for value in attribute_values:
            if value != 'other':
                cloud_freq[value] = cloud_freq.get(value, 0) + 1

    return cloud_freq

In [None]:
# Part of the name/attribute word-cloud experiment.
# The result did not turn out as we wanted, so it is not used in the paper.

def make_com_dicts(com_list, graph):
    """Build one word-cloud frequency dictionary per community.

    Input: a list of communities and the graph they were extracted from.
    Output: a list with one dictionary per community, in the same order.
    """
    return [wordcloud_dict(com, graph) for com in com_list]
    

In [None]:
# Part of the name/attribute word-cloud experiment.
# The result did not turn out as we wanted, so it is not used in the paper.

def draw_word_cloud(dicts):
    """Plot one word cloud per frequency dictionary in a two-column grid.

    Input: a list of dictionaries mapping words to frequencies.
    Output: none; shows a figure with one word cloud per dictionary.
    """
    # The figure size must be set before the figure is created to take effect
    plt.rcParams['figure.figsize'] = [15, 20]
    fig = plt.figure()

    n_clouds = len(dicts)
    # Keep the 5x2 layout for up to ten clouds and add rows beyond that
    rows = max(5, math.ceil(n_clouds / 2))
    # Size the colormap from the number of clouds instead of reading a
    # `partition` variable from outside the function.
    # Different colormaps https://matplotlib.org/stable/tutorials/colors/colormaps.html
    colormap = plt.get_cmap('hsv', n_clouds + 29)

    for i, frequencies in enumerate(dicts):
        ax = fig.add_subplot(rows, 2, i + 1)
        wordcloud = WordCloud(background_color='black', width=2200,
                              height=1800, collocations=False,
                              colormap=colormap
                              ).generate_from_frequencies(frequencies)

        ax.imshow(wordcloud)
        ax.axis('off')
    plt.show()

In [None]:
# Communities of the combined network of all books
all_communities = communities(combined_nx[6])

In [None]:
# Communities of each individual book, in the same order as book_graphs
book_communities = [communities(graph) for graph in book_graphs]

In [None]:
# For every book print its number, its communities and each community's size
for book_no, book_coms in enumerate(book_communities, start=1):
    print('Book no.: ' + str(book_no))
    print(book_coms)
    for members in book_coms:
        print(len(members))

In [None]:
# None of the word clouds below are used in the paper. They were made to see
# how word clouds built from character names and attributes would turn out.
# Each pair of lines builds the frequency dictionaries for one network's
# communities and then plots them.

# Word clouds for all books combined
dictionary = make_com_dicts(all_communities, combined_nx[6])
draw_word_cloud(dictionary)

# Word clouds for book 1:
dictionary1 = make_com_dicts(book_communities[0], book_graphs[0])
draw_word_cloud(dictionary1)

# Word clouds for book 2:
dictionary2 = make_com_dicts(book_communities[1], book_graphs[1])
draw_word_cloud(dictionary2)

# Word clouds for book 3:
dictionary3 = make_com_dicts(book_communities[2], book_graphs[2])
draw_word_cloud(dictionary3)

# Word clouds for book 4:
dictionary4 = make_com_dicts(book_communities[3], book_graphs[3])
draw_word_cloud(dictionary4)

# Word clouds for book 5:
dictionary5 = make_com_dicts(book_communities[4], book_graphs[4])
draw_word_cloud(dictionary5)

# Word clouds for book 6:
dictionary6 = make_com_dicts(book_communities[5], book_graphs[5])
draw_word_cloud(dictionary6)

# Word clouds for book 7:
dictionary7 = make_com_dicts(book_communities[6], book_graphs[6])
draw_word_cloud(dictionary7)

### Extracting texts from books belonging to communities

In [None]:
"""
Function to extract the text belonging to a community. 
We look in the book the community belongs to, and then for each character 
find the text pieces where they are mentioned and saves this text piece. 
Input: A list of communities.
       the directory of the book,
       how many sentences to read at a time
Output: A list with the strings belonging to each community
"""


def extract_com_texts(com_list, book_dir, sentence_no):
    """Collect the book text in which each community's characters are mentioned.

    Every file in book_dir is split on ". " and walked in consecutive windows
    of sentence_no sentences. A window is tokenized and added to a community's
    text once if it contains the name of at least one of its characters.

    Input: com_list    - list of communities (lists of character names); only
                         the first 10 are processed
           book_dir    - directory of the book's chapter files, with a
                         trailing separator
           sentence_no - number of sentences to read at a time (at least 1)
    Output: one list of tokens per processed community, without English
            stopwords and single-character tokens
    Raises: ValueError if sentence_no is smaller than 1.
    """
    if sentence_no < 1:
        # A non-positive window never advances through the chapter
        raise ValueError("sentence_no must be at least 1")

    selected = com_list[:10]
    if not selected:
        return []

    # Read and split every chapter once instead of once per community
    chapter_sentences = []
    for chap in os.listdir(book_dir):
        with open(book_dir + chap) as f:
            chapter_sentences.append(f.read().split(". "))

    # A set gives constant-time stopword checks
    english_stopwords = set(nltk.corpus.stopwords.words('english'))

    community_strings = []
    for community in selected:
        com_txt = []
        for sentences in chapter_sentences:
            for start in range(0, len(sentences), sentence_no):
                current = " ".join(sentences[start:start + sentence_no])
                # any() adds the window at most once, however many of the
                # community's characters it mentions
                if any(char in current for char in community):
                    com_txt.extend(word_tokenize(current))
        community_strings.append(
            [w for w in com_txt if w not in english_stopwords and len(w) > 1]
        )

    return community_strings
    

In [None]:
# Find the strings for the communities in each book.
# Arguments: the book's communities, the directory holding that book's
# chapter files, and the number of sentences read at a time (5).
com_strings1 = extract_com_texts(book_communities[0], './B1/', 5)
com_strings2 = extract_com_texts(book_communities[1], './B2/', 5)
com_strings3 = extract_com_texts(book_communities[2], './B3/', 5)
com_strings4 = extract_com_texts(book_communities[3], './B4/', 5)
com_strings5 = extract_com_texts(book_communities[4], './B5/', 5)
com_strings6 = extract_com_texts(book_communities[5], './B6/', 5)
com_strings7 = extract_com_texts(book_communities[6], './B7/', 5)

### Extract wikitext for each community

In [None]:
# Function used to find the wikitexts belonging to each character in a community
# Not used as we went with the text from the books instead
def extract_com_wikitexts(com_list, directory):
    """Collect the wiki text of every character in each community.

    Input: com_list  - list of communities (lists of character names); only
                       the first 10 are processed
           directory - directory holding one 'clean_<character>.txt' file per
                       character, with a trailing separator
    Output: one list of tokens per processed community, without English
            stopwords (compared case-insensitively) and tokens of two
            characters or fewer
    """
    # A set gives constant-time stopword checks
    english_stopwords = set(nltk.corpus.stopwords.words('english'))

    community_strings = []
    for community in com_list[:10]:
        com_txt = []
        for char in community:
            with open(directory + 'clean_' + char + '.txt') as f:
                text = f.read()
            # Strip any markup before tokenizing
            plain_text = BeautifulSoup(text, 'html.parser').get_text()
            com_txt.extend(nltk.word_tokenize(plain_text))

        community_strings.append(
            [w for w in com_txt if w.lower() not in english_stopwords and len(w) > 2]
        )

    return community_strings

In [None]:
# Wiki-text word lists for the communities of book 1, read from ./characters/
com_wikistrings1 = extract_com_wikitexts(book_communities[0], './characters/')

In [None]:
"""
Funtion to make a list of each unique word for the communities
Input: List of community strings
Output: A list with the unique terms for each community
"""
def unique(com_str):
    """Reduce each community's word list to its distinct words.

    Input: a list of word lists, one per community.
    Output: a list with, for each community, a list of its unique terms
            (in no particular order).
    """
    return [list(set(community_words)) for community_words in com_str]

In [None]:
# Distinct terms per community for each book's community texts
unique_terms1 = unique(com_strings1)
unique_terms2 = unique(com_strings2)
unique_terms3 = unique(com_strings3)
unique_terms4 = unique(com_strings4)
unique_terms5 = unique(com_strings5)
unique_terms6 = unique(com_strings6)
unique_terms7 = unique(com_strings7)

In [None]:
# Distinct terms per community for the wiki texts
unique_wikiterms1 = unique(com_wikistrings1)

# TF-IDF

In [None]:
"""
A function to calculate the IDF value for a word
Input: A word,
       a list with the uniqe term for each community
Output: The IDF value found for the word
"""

def idf(word, unique_list):
    """Compute the smoothed inverse document frequency of a word.

    Each community's collection of unique terms counts as one document.

    Input: a word and a list with the unique terms of each community.
    Output: log(N / (1 + n)) + 1, where N is the number of communities and n
            is the number of communities whose terms contain the word.
    """
    communities_with_word = sum(1 for terms in unique_list if word in terms)
    return math.log(len(unique_list) / (1 + communities_with_word)) + 1

In [None]:
"""
Function to calculate the TF-IDF value for each word belonging to a community
Input: A list with the community string,
       a list with the uniqe terms for each community
Output: A list containg the words and their TF-IDF values for each community
"""

def tfidf(community_str, unique_words):
    """Compute the TF-IDF value of every word in every community.

    Input: community_str - list with the word list of each community
           unique_words  - list with the unique terms of each community
    Output: a list with, for each community, a list of (word, tf-idf) tuples.
            The term frequency is the word's count divided by the number of
            words in the community; the IDF comes from idf().
    """
    # Sets make the membership tests inside idf() constant-time
    term_sets = [set(terms) for terms in unique_words]

    tfidf_list = []
    for community_words in community_str:
        fdist = FreqDist(community_words)
        total_terms = len(community_words)
        tfidf_list.append([
            (word, (count / total_terms) * idf(word, term_sets))
            for word, count in fdist.items()
        ])

    return tfidf_list

In [None]:
# Creating the tfidf list for each book from its community texts and the
# matching unique terms
tfidf_list1 = tfidf(com_strings1, unique_terms1)
tfidf_list2 = tfidf(com_strings2, unique_terms2)
tfidf_list3 = tfidf(com_strings3, unique_terms3)
tfidf_list4 = tfidf(com_strings4, unique_terms4)
tfidf_list5 = tfidf(com_strings5, unique_terms5)
tfidf_list6 = tfidf(com_strings6, unique_terms6)
tfidf_list7 = tfidf(com_strings7, unique_terms7)

In [None]:
# TF-IDF list for the wiki texts of the communities
tfidf_wikilist1 = tfidf(com_wikistrings1, unique_wikiterms1)

## Wordclouds for communities with book text

In [None]:
"""
Function to create and plot the word clouds for the communities
Input: A list with the words and their TF-IDF values for each community
Output: A wordcloud plot for each community with their corresponding words
"""

def wordCloud(tfidf_list):
    """Create and plot one word cloud per community in a two-column grid.

    Input: a list with, for each community, (word, TF-IDF value) pairs.
    Output: none; shows a figure with one word cloud per community.
    """
    # The figure size must be set before the figure is created to take effect
    plt.rcParams['figure.figsize'] = [15, 20]
    fig = plt.figure()

    # Keep the 5x2 layout for up to ten communities and add rows beyond that
    rows = max(5, math.ceil(len(tfidf_list) / 2))

    for i, scored_words in enumerate(tfidf_list):
        ax = fig.add_subplot(rows, 2, i + 1)
        wordcloud = WordCloud(background_color='white', width=2200,
                              height=1800, collocations=False
                              ).generate_from_frequencies(dict(scored_words))

        ax.imshow(wordcloud)
        ax.axis('off')
    plt.show()

In [None]:
# Word clouds of the communities of books 1 through 7, one figure per book
for book_tfidf in (tfidf_list1, tfidf_list2, tfidf_list3, tfidf_list4,
                   tfidf_list5, tfidf_list6, tfidf_list7):
    wordCloud(book_tfidf)

## Wordclouds for communities with wiki text

In [None]:
# Word clouds of the communities based on the wiki texts
wordCloud(tfidf_wikilist1)

# Sentiment Analysis
>We want to be able to analyse the sentiment of a character throughout the books. Our idea for achieving this is to use concordance from nltk with the character name, this way we will get all of the context surrounding a character. For each of these occurrences we can compute the sentiment for the context, and use that sentiment as a representative for the character. 

In [None]:
# Variables to be used for the sentiment analysis.
# Their product is passed as the `width` of the nltk concordance lines.
average_length_of_sentence = 114  # presumably measured in characters — confirm
sentences_for_con = 3

## Sentiment calculations from LabMT1.0 vs VADER-Sentiment
>We have considered two options for calculating our sentiment. Either we could use the LabMT1.0 data set to find the sentiment of a portion of text by assigning each word in that text a value based on LabMT1.0, and then taking the average of those words. Or we could use <a href="https://github.com/cjhutto/vaderSentiment/blob/master/README.rst">VADER-Sentiment</a>. We wanted to experiment with the VADER solution, since our initial findings for sentiment using LabMT1.0 had very similar values around 5.5. To experiment we have made a graph of sentiments for each chapter of Book 7, for Harry Potter, Voldemort, and Snape for both methods:

###  Sentiment for concordance of character, LabMT1.0
> First we create a list of tuples containing each word and the average happiness for that word. This allows us to go through a portion of text and look up the average happiness for each word.

In [None]:
# Load the LabMT1.0 word list from its tab-separated file into a list of
# (word, average happiness) tuples; the score is kept as the string read
# from the file.
with open("LabMT1.0.tsv", "r", encoding="utf8") as sent_file:
    tsv_reader = csv.DictReader(sent_file, delimiter="\t")
    sent_list_labmt10 = [
        (row["word"], row["happiness_average"]) for row in tsv_reader
    ]

>We now define a function to calculate the average sentiment for a set of tokens:

In [None]:
# Defining function to calculate sentiment from a frequency distribution of tokens
def sentiment_labmt10(tokens):
    """Return the occurrence-weighted average LabMT1.0 happiness of the tokens.

    Input: a mapping from token to number of occurrences (e.g. an nltk
           FreqDist).
    Output: the sum of score * occurrences over all tokens found in
            sent_list_labmt10, divided by the total occurrences of those
            tokens.
    Raises: ZeroDivisionError if none of the tokens is in sent_list_labmt10.
    """
    # Total sentiment score
    sent_sum_labmt10 = 0.0
    # Total number of occurrences of rated words
    occ_sum_labmt10 = 0

    # Walk the rated word list once and look each word up in the mapping,
    # instead of scanning the whole word list again for every token
    for word, score in sent_list_labmt10:
        if word in tokens:
            occ = tokens[word]
            sent_sum_labmt10 += (float(score) * occ)
            occ_sum_labmt10 += occ
    return sent_sum_labmt10 / occ_sum_labmt10


>And a function to calculate the sentiment for a concordance list found by nltk, and define a list of stopwords to be filtered out:

In [None]:
# English stopwords from nltk, filtered out before computing LabMT1.0 sentiment
stopwords_labmt10 = nltk.corpus.stopwords.words('english')

In [None]:
# Finding sentiment from a concordance list
def con_sentiment_labmt10(con_list):
    """Return the mean LabMT1.0 sentiment over a list of concordance lines.

    For each line the alphabetic tokens to the left and right of the match
    are lower-cased, stopwords are dropped, and the remainder is scored with
    sentiment_labmt10.

    Input: a list of concordance lines with `left` and `right` token lists.
    Output: the average of the per-line sentiments, or None when the list is
            empty (the character had no appearances in the chapter).
    """
    line_scores = []
    for line in con_list:
        context = [tok.lower() for tok in line.left if tok.isalpha()]
        context += [tok.lower() for tok in line.right if tok.isalpha()]
        kept = [w for w in context if w not in stopwords_labmt10]

        # Score the frequency distribution of the remaining context words
        line_scores.append(sentiment_labmt10(nltk.FreqDist(kept)))

    if not line_scores:
        return None # Character had no appearances in chapter
    return sum(line_scores) / len(line_scores)

>We then run our test as described previously:

In [None]:
# Define character list with the three characters
char_list_labmt10 = ["Harry_Potter", "Severus_Snape", "Tom_Riddle"]

# Create a list of the chapters of book 7
# NOTE(review): os.listdir returns names in arbitrary order, so the chapters
# (and the x-axis below) are not guaranteed to follow the book's chapter order.
chapters_labmt10 = os.listdir("B7")

# Read and tokenize every "replaced" chapter once, so the work is not
# repeated for each character
chapter_texts_labmt10 = []
for chapter in chapters_labmt10:
    if "replaced" in chapter:
        with open("B7/" + chapter) as f:
            raw = f.read()
        chapter_texts_labmt10.append(nltk.Text(nltk.word_tokenize(raw)))

# List of tuples (character, [sentiments for each chapter for that character])
sentiment_by_character_labmt10 = []

# NOTE(review): nltk's Text.concordance_list defaults to lines=25, so at most
# 25 occurrences per chapter appear to be scored — confirm this is intended.
for character in char_list_labmt10:
    sentiments = []
    for text in chapter_texts_labmt10:
        # Make concordance for that character in that chapter
        con = text.concordance_list(character, width = sentences_for_con * average_length_of_sentence)
        # Calculate sentiments and append to the list for that character
        sentiments.append(con_sentiment_labmt10(con))
    # Append the character with its full sentiment list
    sentiment_by_character_labmt10.append((character, sentiments))

# https://stackoverflow.com/questions/4971269/how-to-pick-a-new-color-for-each-plotted-line-within-a-figure-in-matplotlib
color = iter(cm.rainbow(np.linspace(0, 1, len(char_list_labmt10))))

for name, sentiments in sentiment_by_character_labmt10:
    c = next(color)
    plt.plot(sentiments, c=c, label=name)

plt.legend()
plt.xlabel('Chapter')
plt.ylabel('Average sentiment')
plt.title('Average sentiment by chapter for selected characters')
plt.figtext(.5, -0.1, "Plot of the average sentiment for Harry, Snape, and Voldemort in book 7 when calculating sentiment from LabMT1.0.", ha="center")

# NOTE(review): this binds the pyplot module itself, not a copy of the figure
plt_labmt = plt

### Sentiment for concordance of character, vaderSentiment
>For VADER we use the same approach and code, but we have to redefine how we calculate sentiment from concordance, since we just have to pass a string to the analyzer:

In [None]:
# Finding sentiment from a single concordance line
def con_sentiment(con):
    """Return the VADER compound score of the context around one match.

    Input: a concordance line with `left` and `right` token lists.
    Output: the "compound" value of analyzer.polarity_scores for the left and
            right tokens joined with spaces.
    """
    context = " ".join(con.left + con.right)
    # polarity_scores returns a dictionary of scores; only "compound" is used
    return analyzer.polarity_scores(context)["compound"]

In [None]:
# Define character list with the three characters
char_list_vader = ["Harry_Potter", "Severus_Snape", "Tom_Riddle"]

# Create a list of the chapters of book 7
# NOTE(review): os.listdir returns names in arbitrary order, so the chapters
# (and the x-axis below) are not guaranteed to follow the book's chapter order.
chapters_vader = os.listdir("B7")

# Read and tokenize every "replaced" chapter once, so the work is not
# repeated for each character
chapter_texts_vader = []
for chapter in chapters_vader:
    if "replaced" in chapter:
        with open("B7/" + chapter) as f:
            raw = f.read()
        chapter_texts_vader.append(nltk.Text(nltk.word_tokenize(raw)))

# List of tuples (character, [sentiments for each chapter for that character])
sentiment_by_character_vader = []

# NOTE(review): nltk's Text.concordance_list defaults to lines=25, so at most
# 25 occurrences per chapter appear to be scored — confirm this is intended.
for character in char_list_vader:
    sentiments = []
    for text in chapter_texts_vader:
        cons = text.concordance_list(character, width = sentences_for_con * average_length_of_sentence)
        line_scores = [con_sentiment(con) for con in cons]
        # None marks a chapter in which the character does not appear
        sentiments.append(sum(line_scores) / len(line_scores) if line_scores else None)
    sentiment_by_character_vader.append((character, sentiments))

# https://stackoverflow.com/questions/4971269/how-to-pick-a-new-color-for-each-plotted-line-within-a-figure-in-matplotlib
color_vader = iter(cm.rainbow(np.linspace(0, 1, len(char_list_vader))))

for name, sentiments in sentiment_by_character_vader:
    c = next(color_vader)
    plt.plot(sentiments, c=c, label=name)

# Dashed lines at +/-0.05 mark the neutral region of the compound score
plt.axhline(y = 0.05, color ="purple", linestyle = '--', label="Neutral region")
plt.axhline(y = -0.05, color ="purple", linestyle = '--')

plt.legend()
plt.xlabel('Chapter')
plt.ylabel('Average sentiment')
plt.title('Average sentiment by chapter for selected characters')
plt.figtext(.5, -0.1, "Plot of the average sentiment for Harry, Snape, and Voldemort in book 7 when calculating sentiment with VADER.", ha="center")

# NOTE(review): this binds the pyplot module itself, not a copy of the figure
plt_vader = plt

### Conclusion: LabMT1.0 vs vaderSentiment
>Both results are similar, but we see an advantage in using VADER when we consider the sentiment for Harry Potter. Looking at the graphs we can see that there are similar trends for Harry throughout the book, but with VADER the sentiment becomes more consistent, in that it appears to be in the neutral region at some points, and then go out of it. On the other hand, LabMT1.0 seems to indicate that Harry is well above 5.1 throughout the book, which we would consider to be above neutral. Based on this preliminary test we believe that we will get a clearer picture from VADER.

## Functions for calculating sentiment
>The following section contains various functions we have defined to find and plot sentiment for various cases.

### ```sent_chars_book_list(char_list, book_list, sentences)```
>The function takes a list of character names, a list of paths to books, and the number of sentences of context to use around each mention. It computes the average sentiment of each character in the list throughout the books on a chapter basis. This can be used to find out how the sentiment of a single character, or of a group of characters such as a house, changes throughout the books.

In [None]:
"""
Input: A list of character names to look for as tokens
        and a path to the chapters of the book
Output: A list of tuples, with the character name and a list
        of the sentiments for each chapter for that character
"""
def sent_chars_book_list(char_list, book_list, sentences):
    """Compute the per-chapter average sentiment of each character.

    Input: char_list - character names to look for as tokens
           book_list - paths to the book directories (each with a trailing
                       separator); only files with "replaced" in their name
                       are read
           sentences - number of sentences of context; multiplied by
                       average_length_of_sentence to get the concordance width
    Output: a list of tuples (character name, sentiments), where sentiments
            holds one entry per chapter over all books: the mean
            con_sentiment of the character's concordance lines, or None when
            the character does not occur in the chapter.

    NOTE(review): os.listdir returns names in arbitrary order, so the entries
    are not guaranteed to follow chapter order. nltk's Text.concordance_list
    defaults to lines=25, so at most 25 occurrences per chapter appear to be
    scored — confirm both are intended.
    """
    # One (name, sentiments) tuple per character, in char_list order
    sentiment_of_character = [(n, []) for n in char_list]
    width = sentences * average_length_of_sentence

    for book in book_list:
        for chapter in os.listdir(book):
            if "replaced" not in chapter:
                continue

            # Read in the chapter and tokenize it once for all characters
            with open(book + chapter) as f:
                raw = f.read()
            text = nltk.Text(nltk.word_tokenize(raw))

            for character, (_, chapter_sentiments) in zip(char_list, sentiment_of_character):
                # Sentiment of every concordance line of this character
                line_scores = [
                    con_sentiment(con)
                    for con in text.concordance_list(character, width = width)
                ]
                # No lines means the character did not appear in the chapter
                chapter_sentiments.append(
                    sum(line_scores) / len(line_scores) if line_scores else None
                )

    return sentiment_of_character

### ```sent_book(path_to_book)```
>Calculates the sentiment for a book on chapter basis. In this function each chapter of a book is read in and the sentiment for the text is calculated. This allows us to see how the sentiment for a book changes as it progresses.

In [None]:
"""
Input: The path to a book.
Output: A list of sentiments for each chapter of the book.
"""
def sent_book(path_to_book):
    """Compute the VADER sentiment of every chapter of a book.

    Input: the path to a book directory (with a trailing separator); only
           files with "replaced" in their name are read.
    Output: a list with the compound sentiment of each chapter's full text,
            in the order os.listdir returns the files.
    """
    sentiments_by_chapter = []

    for chapter in os.listdir(path_to_book):
        if "replaced" not in chapter:
            continue
        with open(path_to_book + chapter) as f:
            chapter_text = f.read()
        # Score the whole chapter as one text
        scores = analyzer.polarity_scores(chapter_text)
        sentiments_by_chapter.append(scores["compound"])

    return sentiments_by_chapter

### ```avgsent_group(group, label, book_list, sentences)```
>This function calculates the average sentiment for all of the names given in ```group``` and returns it as the sentiment for the name given in ```label```. The sentiments are calculated from the books given in ```book_list```, using ```sentences``` sentences of context around each mention. E.g. given the list of names of Gryffindor students with the label "Gryffindor", it returns a tuple ("Gryffindor", sentiment list), where the sentiment list contains the average sentiment for each chapter for those students.

In [None]:
def avgsent_group(group, label, book_list, sentences):
    """Compute the per-chapter average sentiment of a group of characters.

    Input: group     - names of the group's members
           label     - name to return the result under
           book_list - list of paths to the book directories (each with a
                       trailing separator); only files with "replaced" in
                       their name are read
           sentences - number of sentences of context; multiplied by
                       average_length_of_sentence to get the concordance width
    Output: a tuple (label, sentiments) with one entry per chapter over all
            books: the mean con_sentiment over the concordance lines of all
            members, counting only lines whose sentiment is not exactly 0,
            or None when no line was counted.
    """
    width = sentences * average_length_of_sentence
    chapter_averages = []

    for book in book_list:
        for chapter in os.listdir(book):
            if "replaced" not in chapter:
                continue

            # Read in the chapter and prepare the nltk text
            with open(book + chapter) as f:
                raw = f.read()
            text = nltk.Text(nltk.word_tokenize(raw))

            # Sum and count of the sentiments of all members in this chapter
            total = 0
            occurences = 0
            for member in group:
                for con in text.concordance_list(member, width = width):
                    sent = con_sentiment(con)
                    # Lines with a sentiment of exactly 0 are left out
                    if sent != 0:
                        total += sent
                        occurences += 1

            chapter_averages.append(total / occurences if occurences else None)

    return (label, chapter_averages)

### ```plot_sentiments(sentiment_by_character, figure_text, xs_vertical_lines, show_legend)```
>The function takes a list of tuples, where each tuple contains a name and a list of sentiments, a figure text, a list of tuples for placing vertical lines with labels, and a ```True```/```False``` flag for showing the legend. This function may take the result of ```sent_chars_book_list``` as input for the list of tuples with names and sentiments. This allows for fast and simple plotting.

In [None]:
"""
Input: A list of tuples containing a name and a sentiment list, a figure text, 
       a list of tuples with labels and coordinates for vertical lines, and a
       true/false value for showing labels.
Output: void, shows a plot
"""
def plot_sentiments(sentiment_by_character, figure_text, xs_vertical_lines, show_legend):
    """
    Plot every sentiment list as one curve and show the figure.

    Input: sentiment_by_character - list of (name, sentiments) tuples. Names
               containing a house name or "Average" get a fixed colour, all
               other names get the next rainbow colour and are drawn
               half-transparent.
           figure_text - text placed centred below the figure.
           xs_vertical_lines - list of (book, label, x) tuples. A black
               vertical line is drawn at each x and the x-axis gets a tick
               at x with label as tick text. An empty list draws none.
           show_legend - true/false value for showing the legend.
    Output: void, shows a plot
    """
    # Fallback colours for names that have no fixed colour
    color = iter(cm.rainbow(np.linspace(0, 3, len(sentiment_by_character)*3)))

    # The x-axis spans the first curve. Skipped when there are no curves at
    # all, because indexing [0] would raise an IndexError.
    if sentiment_by_character:
        plt.xlim(0, len(sentiment_by_character[0][1]))

    for name, sentiments in sentiment_by_character:
        # Try to give colors according the name associated with the sentiments
        if "Gryffindor" in name:
            c = "red"
            a = 1
        elif "Slytherin" in name and name != "Salazar_Slytherin":
            # The character Salazar_Slytherin is not the house curve
            c = "green"
            a = 1
        elif "Hufflepuff" in name:
            c = "yellow"
            a = 1
        elif "Ravenclaw" in name:
            c = "blue"
            a = 1
        elif "Average" in name:
            c = "black"
            a = 1
        else:
            c = next(color)
            a = 0.5
        plt.plot(sentiments, c=c, alpha=a, label=name)

    # Make horizontal lines to indicate the neutral region
    plt.axhline(y=0.05, color="purple", linestyle='--', label="Neutral")
    plt.axhline(y=-0.05, color="purple", linestyle='--')

    # If vertical lines for the books have been specified, draw them and
    # label the x-axis at the same positions
    if xs_vertical_lines:
        tick_positions = []
        tick_labels = []
        for _book, label, x_coordinate in xs_vertical_lines:
            plt.axvline(x=x_coordinate, color='black')
            tick_positions.append(x_coordinate)
            tick_labels.append(label)
        plt.xticks(tick_positions, tick_labels, rotation=45)

    # If legend has been requested
    if show_legend:
        plt.legend()

    plt.ylabel('Average sentiment')
    # Negative y places the caption below the axes
    plt.figtext(.5, -0.2, figure_text, ha="center")

    plt.show()

## Sentiment for books
>First we want to explore how the sentiment is throughout the books. We have two ideas for measuring this: Take each chapter as a text and have VADER analyze the sentiment of that text for us. Or for each character from our list of characters, make concordance for each chapter for that character, and divide it by the total number of concordance lines for that chapter. We are going to test these two methods out to see which is more expressive:

>We start by defining some lists needed for plotting and reading in all the books:

In [None]:
# One (folder, title, first_chapter) tuple per book. first_chapter is the
# position of the book's first chapter when the chapters of all seven books are
# numbered consecutively from 0, e.g. ("B2", "Chamber of Secrets", 18) means
# that all chapters up to 18 excluded are from book 1.
book_list_wchapter = [
    ("B1", "Philosopher's Stone", 0),
    ("B2", "Chamber of Secrets", 18),
    ("B3", "Prisoner of Azkaban", 35),
    ("B4", "Goblet of Fire", 57),
    ("B5", "Order of the Phoenix", 94),
    ("B6", "Half-Blood Prince", 134),
    ("B7", "Deathly Hallows", 162),
]
# Paths to the folders holding the text from the chapters, one folder per book
book_list = [f"{folder}/" for folder, _title, _first_chapter in book_list_wchapter]

In [None]:
# Sentiment values of the seven books, concatenated in reading order
series_sent = [
    chapter_sent
    for book_folder in ("B1/", "B2/", "B3/", "B4/", "B5/", "B6/", "B7/")
    for chapter_sent in sent_book(book_folder)
]

In [None]:
# Caption placed below the figure
sent_by_chapter = "Sentiment by chapter for entire series analyzed one chapter at a time"
# Use a wider figure so that the chapters of all books fit next to each other
plt.rcParams['figure.figsize'] = [15, 5]
# A single curve: the sentiment values VADER produced when each chapter was
# analyzed as a whole text
plot_sentiments(
    [("Sentiment for series", series_sent)],
    sent_by_chapter,
    xs_vertical_lines=book_list_wchapter,
    show_legend=False,
)

>The above figure does not convey changes in the book very well, at most it seems that we can get an idea of the overall tone of the chapter, but not how it relates to the other chapters or how the story evolves and changes. 

>For the next part we are going to try the approach of making a concordance for all of the characters for each chapter and taking the average. However, ```concordance_list``` uses a default width of 79 characters around the string that it is making a concordance for. First we have to investigate whether this default value is good. To investigate this we first determine the average sentence length, in characters, for the books:

In [None]:
# What is the average length, in characters, of a sentence in Harry Potter?
# Read in all of the chapters, count the sentences and their characters, and
# divide the number of characters by the number of sentences. The average is
# used to determine how many sentences a concordance of a given width covers.
number_of_sentences = 0
number_of_characters = 0
for book in book_list:
    for chapter in os.listdir(book):
        # Only the files with "replaced" in their name are read
        if "replaced" not in chapter:
            continue
        with open(book + chapter) as f:
            raw = f.read()
        sentences = nltk.sent_tokenize(raw)
        number_of_sentences += len(sentences)
        number_of_characters += sum(len(sentence) for sentence in sentences)
average_characters_in_sent = number_of_characters / number_of_sentences
print(average_characters_in_sent)

>The average sentence in the books is 114 characters long. Our default width for a concordance should then be 114 characters, assuming the character name is in the middle of the sentence and has an even length. So now we can test out how many sentences should be used. We are going to test with a smaller size of a half sentence, default, one sentence, two, three, and four to start with:

In [None]:
# average_length_of_sentence is set before the calls that should use it:
# first 79 characters with a single "sentence" as the default width ...
average_length_of_sentence = 79
con_sentences_test = [
    avgsent_group(["Harry_Potter"], "Harry Potter Default", ["B1/"], 1),
]
# ... then 114 characters per sentence for one to four sentences
average_length_of_sentence = 114
con_sentences_test += [
    avgsent_group(["Harry_Potter"], curve_name, ["B1/"], sentence_count)
    for curve_name, sentence_count in (
        ("Harry Potter 1", 1),
        ("Harry Potter 2", 2),
        ("Harry Potter 3", 3),
        ("Harry Potter 4", 4),
    )
]
plot_sentiments(
    con_sentences_test,
    "Average sentiment for Harry Potter in book one, with 0.5 sentences used for concordance",
    xs_vertical_lines=[],
    show_legend=True,
)

In [None]:
# Same test for Harry Potter in book one, with much wider concordances
con_sentences_test = [
    avgsent_group(["Harry_Potter"], curve_name, ["B1/"], sentence_count)
    for curve_name, sentence_count in (
        ("Harry Potter 8", 8),
        ("Harry Potter 16", 16),
        ("Harry Potter 32", 32),
        ("Harry Potter 64", 64),
    )
]
plot_sentiments(
    con_sentences_test,
    "Average sentiment for Harry Potter in book one, with differening number of sentences used for concordance",
    xs_vertical_lines=[],
    show_legend=True,
)

In [None]:
# Keep only the name, the first of the four fields stored for each character
character_names = [name for name, _, _, _ in characters]

In [None]:
# Average over all characters in book one for several concordance sizes
con_test_all = [
    avgsent_group(character_names, curve_name, ["B1/"], sentence_count)
    for curve_name, sentence_count in (
        ("All characters, 2 sentences con", 2),
        ("All characters, 3 sentences con", 3),
        ("All characters, 4 sentences con", 4),
        ("All characters, 32 sentences con", 32),
    )
]
# Add the sentiment values from sent_book for book one as the last curve
con_test_all.append(("book 1 sent", sent_book("B1/")))

In [None]:
# Plot all curves of the test in one figure, with legend and without book lines
plot_sentiments(con_test_all, sent_by_chapter, xs_vertical_lines=[], show_legend=True)

In [None]:
# Display the (name, sentiments) tuples collected in con_test_all
con_test_all

In [None]:
# Average sentiment per chapter, taken over every character name and all books
avgsent_all = avgsent_group(
    character_names,
    "Average sentiment of all characters",
    book_list,
    sentences_for_con,
)

In [None]:
# Caption for the figure of the average over all characters
sentiment_books = "Average sentiment from concordance of character names throughout the books."
plot_sentiments(
    [avgsent_all],
    sentiment_books,
    xs_vertical_lines=book_list_wchapter,
    show_legend=True,
)

## Sentiment for houses

In [None]:
# Names of the characters of each house; the third field of a character tuple
# is compared against the house name
gryffindors = [name for name, _, house, _ in characters if house == "Gryffindor"]
hufflepuffs = [name for name, _, house, _ in characters if house == "Hufflepuff"]
ravenclaws = [name for name, _, house, _ in characters if house == "Ravenclaw"]
slytherins = [name for name, _, house, _ in characters if house == "Slytherin"]

# Report how many characters each house has
for house_label, house_members in (
    ("Gryffindor", gryffindors),
    ("Hufflepuff", hufflepuffs),
    ("Ravenclaw", ravenclaws),
    ("slytherin", slytherins),
):
    print(f"Number of characters from {house_label}: {len(house_members)}")

In [None]:
# Average sentiment per chapter for each of the four houses.
# plot_sentiments chooses the fixed house colour by looking for the house name
# ("Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin") inside the label, so
# every label has to contain the correctly spelled house name.
avgsent_gryffindor = avgsent_group(gryffindors, "Gryffindors", book_list, sentences_for_con)
avgsent_hufflepuff = avgsent_group(hufflepuffs, "Hufflepuffs", book_list, sentences_for_con)
avgsent_ravenclaw = avgsent_group(ravenclaws, "Ravenclaws", book_list, sentences_for_con)
avgsent_slytherin = avgsent_group(slytherins, "Slytherins", book_list, sentences_for_con)

In [None]:
# The house averages, in the order in which they are plotted
avg_sent_list = [avgsent_gryffindor, avgsent_slytherin, avgsent_hufflepuff, avgsent_ravenclaw]
plot_sentiments(
    avg_sent_list,
    "Average sentiment for the four houses throughout the books",
    xs_vertical_lines=book_list_wchapter,
    show_legend=True,
)

In [None]:
label = "Gryffindors"
# Sentiments of the individual Gryffindors, with the house average as last entry
gryff_sents = [
    *sent_chars_book_list(gryffindors, book_list, sentences_for_con),
    avgsent_gryffindor,
]

text_gryff_sent = "Average sentiments for Gryffindor, and sentimens for all Gryffindors"
# No legend is requested for this figure
plot_sentiments(
    gryff_sents,
    text_gryff_sent,
    xs_vertical_lines=book_list_wchapter,
    show_legend=False,
)

In [None]:
label = "Slytherins"
# Sentiments of the individual Slytherins, with the house average as last entry
slyth_sents = [
    *sent_chars_book_list(slytherins, book_list, sentences_for_con),
    avgsent_slytherin,
]

text_slyth_sent = "Average sentiments for Slytherin, and sentimens for all Slytherins"
# No legend is requested for this figure
plot_sentiments(
    slyth_sents,
    text_slyth_sent,
    xs_vertical_lines=book_list_wchapter,
    show_legend=False,
)

In [None]:
label = "Hufflepuffs"
# Sentiments of the individual Hufflepuffs, with the house average as last entry
huff_sents = [
    *sent_chars_book_list(hufflepuffs, book_list, sentences_for_con),
    avgsent_hufflepuff,
]

# This plot has some None values, so there will be holes in the plot
text_huff_sent = "Average sentiments for Hufflepuff, and sentimens for all Hufflepuffs"
plot_sentiments(
    huff_sents,
    text_huff_sent,
    xs_vertical_lines=book_list_wchapter,
    show_legend=False,
)

In [None]:
label = "Ravenclaws"
# Sentiments of the individual Ravenclaws, with the house average as last entry
rave_sents = [
    *sent_chars_book_list(ravenclaws, book_list, sentences_for_con),
    avgsent_ravenclaw,
]

# This plot has some None values, so there will be holes in the plot
text_rave_sent = "Average sentiments for Ravenclaw, and sentimens for all Ravenclaws"
plot_sentiments(
    rave_sents,
    text_rave_sent,
    xs_vertical_lines=book_list_wchapter,
    show_legend=False,
)

## Highest and lowest sentiments
>Investigating which characters have the highest and lowest sentiments by summing up their average sentiment values for each chapter they appeared in, and dividing by the number of chapters they appeared in into a sentiment score for that character.

In [None]:
# Per-chapter sentiment lists for every character, as (name, sent_list) tuples
avgsent_all_individual = sent_chars_book_list(character_names, book_list, sentences_for_con)

# Sentiment score of a character: the sum of its per-chapter sentiments divided
# by the number of chapters that have a value. None entries are skipped
# (presumably chapters in which the character does not appear).
character_sent_scores = []
for name, sent_list in avgsent_all_individual:
    chapter_sents = [sent for sent in sent_list if sent is not None]
    chapter_occurences = len(chapter_sents)
    if chapter_occurences:
        sent_score = sum(chapter_sents) / chapter_occurences
    else:
        # No chapter with a value: keep the neutral score 0 instead of
        # dividing by zero
        sent_score = 0
    character_sent_scores.append((name, sent_score))

In [None]:
# https://bobbyhadz.com/blog/python-sort-list-of-tuples-by-second-element
# Ascending by score: the lowest scores come first, the highest last
sorted_list = sorted(character_sent_scores, key=lambda name_and_score: name_and_score[1])

# Names of the five highest and the five lowest scoring characters
top_names = [name for name, _ in sorted_list[-5:]]
bottom_names = [name for name, _ in sorted_list[:5]]

# Their sentiment lists, in the order of avgsent_all_individual
top = [(name, sents) for name, sents in avgsent_all_individual if name in top_names]
bottom = [(name, sents) for name, sents in avgsent_all_individual if name in bottom_names]

In [None]:
# The total average is plotted first, followed by the five top characters
top_text = "Sentiment values of 5 highest sentiment, with total average for comparison."
plot_sentiments(
    [avgsent_all, *top],
    top_text,
    xs_vertical_lines=book_list_wchapter,
    show_legend=True,
)

In [None]:
# Pick out the sentiment lists of the two Weasley twins
fred_george = [
    (name, sents)
    for name, sents in avgsent_all_individual
    if name in ("Fred_Weasley", "George_Weasley")
]
fred_george_text = "Sentiment for Fred and George Weasley"
plot_sentiments(
    fred_george,
    fred_george_text,
    xs_vertical_lines=book_list_wchapter,
    show_legend=True,
)

In [None]:
# The total average is plotted first, followed by the five bottom characters
bottom_text = "Sentiment values of 5 lowest sentiment, with total average for comparison."
plot_sentiments(
    [avgsent_all, *bottom],
    bottom_text,
    xs_vertical_lines=book_list_wchapter,
    show_legend=True,
)

In [None]:
# Pick out the sentiment list of Sirius Black
sirius_black = [
    (name, sents)
    for name, sents in avgsent_all_individual
    if name == "Sirius_Black"
]
sirius_black_text = "Sentiment for Sirius Black"
plot_sentiments(
    sirius_black,
    sirius_black_text,
    xs_vertical_lines=book_list_wchapter,
    show_legend=True,
)

## Sentiment for main characters
>Sentiment for the 3 main characters 

In [None]:
# The three main characters and their per-chapter sentiment lists
main_character_list = ["Harry_Potter", "Ronald_Weasley", "Hermione_Granger"]
main_characters = [(n, l) for n, l in avgsent_all_individual if n in main_character_list]
# This figure gets its own caption; it previously reused (and overwrote)
# bottom_text, the caption of the "5 lowest sentiment" figure
main_characters_text = "Sentiment values of the 3 main characters, with total average for comparison."
plot_sentiments([avgsent_all] + main_characters, main_characters_text, book_list_wchapter, True)