In [None]:
import os
import nltk
import csv
from matplotlib.pyplot import cm
import numpy as np
from nltk.corpus import stopwords
import math

import matplotlib.pyplot as plt


from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from wordcloud import WordCloud

from bs4 import BeautifulSoup
from community import community_louvain
from fa2 import ForceAtlas2

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Sentiment Analysis
>We want to be able to analyse the sentiment of a character throughout the books. Our idea for achieving this is to use concordance from nltk with the character name, this way we will get all of the context surrounding a character. For each of these occurrences we can compute the sentiment for the context, and use that sentiment as a representative for the character. 

In [None]:
# Side exploration, not part of the sentiment analysis itself: shows where in the
# B7 chapter files the selected characters are mentioned. May be useful when
# arguing for normalizing the Harry Potter weights in the graphs.
# NOTE: os.listdir returns entries in arbitrary order, so the chapters are not
# guaranteed to be concatenated in reading order.
tokens = []
for chapter in os.listdir("B7/"):
    if "replaced" in chapter:
        # Context manager so every chapter file is closed again after reading
        with open("B7/" + chapter) as f:
            raw = f.read()
        tokens += nltk.wordpunct_tokenize(raw)
text = nltk.Text(tokens)

text.dispersion_plot(["Harry_Potter", "Tom_Riddle", "Severus_Snape", "Luna_Lovegood", "Neville_Longbottom", "Nagini"])

## Sentiment calculations from LabMT1.0 vs VADER-Sentiment
>We have considered two options for calculating our sentiment. Either we could use the LabMT1.0 data set to find the sentiment of a portion of text by assigning each word in that text a value based on LabMT1.0, and then taking the average of those words. Or we could use <a href="https://github.com/cjhutto/vaderSentiment/blob/master/README.rst">VADER-Sentiment</a>. We wanted to experiment with the VADER solution, since our initial findings for sentiment using LabMT1.0 had very similar values around 5.5. To experiment we have made a graph of sentiments for each chapter of Book 7, for Harry Potter, Voldemort, and Snape for both methods:

###  Sentiment for concordance of character, LabMT1.0
> First we create a list of tuples containing each word and the average happiness for that word. This allows us to go through a portion of text and look up the average happiness for each word.

In [None]:
# Lookup data for the LabMT-based score: one (word, happiness_average) tuple
# per row of the file. The score is kept as the raw string read from the file.
sent_list_labmt10 = []

with open("LabMT1.0.tsv", "r", encoding="utf8") as sent_file:
    # Tab-separated file; the header row supplies the column names used below
    tsv_reader = csv.DictReader(sent_file, delimiter="\t")
    sent_list_labmt10.extend(
        (row["word"], row["happiness_average"]) for row in tsv_reader
    )

>We now define a function to calculate the average sentiment for a set of tokens:

In [None]:
def sentiment_labmt10(tokens):
    """Occurrence-weighted average LabMT happiness of a bag of tokens.

    Input:  tokens - a mapping from token to number of occurrences
            (for example an nltk.FreqDist).
    Output: the sum of (happiness * occurrences) over all tokens found in
            sent_list_labmt10, divided by the total occurrences of those
            tokens; None if none of the tokens has a rating.
    """
    # Total sentiment score and total number of occurrences of rated words
    sent_sum_labmt10 = 0.0
    occ_sum_labmt10 = 0

    # One pass over the rating list with a direct lookup in the token mapping,
    # instead of scanning the whole rating list once per token
    for word, score in sent_list_labmt10:
        occ = tokens.get(word)
        if occ:
            sent_sum_labmt10 += float(score) * occ
            occ_sum_labmt10 += occ

    # No rated word at all: there is nothing to average
    if occ_sum_labmt10 == 0:
        return None
    return sent_sum_labmt10 / occ_sum_labmt10


>And a function to calculate the sentiment for a concordance list found by nltk, and define a list of stopwords to be filtered out:

In [None]:
# English stopwords, filtered out of the context before scoring with LabMT
stopwords_labmt10 = stopwords.words('english')

In [None]:
def con_sentiment_labmt10(con_list):
    """Average LabMT sentiment over a list of concordance lines.

    Input:  con_list - concordance lines, each exposing the token lists
            `left` and `right` around the matched word.
    Output: the mean of the per-line sentiments, or None when there is no
            line that can be scored (for example because the character does
            not appear in the chapter).
    """
    line_scores = []
    for item in con_list:
        # Lower-cased alphabetic tokens from both sides of the hit, without stopwords
        context = [
            tok.lower()
            for side in (item.left, item.right)
            for tok in side
            if tok.isalpha()
        ]
        combined = [w for w in context if w not in stopwords_labmt10]

        try:
            score = sentiment_labmt10(nltk.FreqDist(combined))
        except ZeroDivisionError:
            # Raised when the line holds no word with a rating
            score = None
        # A line without any rated word cannot be scored; skip it instead of
        # letting it abort the whole chapter
        if score is not None:
            line_scores.append(score)

    if not line_scores:
        return None
    return sum(line_scores) / len(line_scores)

>We then run our test as described previously:

In [None]:
# The three characters compared in this experiment
char_list_labmt10 = ["Harry_Potter", "Severus_Snape", "Tom_Riddle"]

# Files in the book 7 folder (os.listdir gives no ordering guarantee)
chapters_labmt10 = os.listdir("B7")

# Read and tokenize every "replaced" chapter once and reuse the result for all
# characters, instead of re-reading each file once per character
chapter_texts_labmt10 = []
for chapter in chapters_labmt10:
    if "replaced" in chapter:
        with open("B7/" + chapter) as f:
            raw = f.read()
        chapter_texts_labmt10.append(nltk.Text(nltk.wordpunct_tokenize(raw)))

# List of tuples (character, [sentiment for each chapter for that character]);
# an entry is None when no sentiment could be computed for that chapter
sentiment_by_character_labmt10 = []

for character in char_list_labmt10:
    sentiments = []
    for chapter_text in chapter_texts_labmt10:
        # concordance_list returns at most 25 lines by default; ask for as many
        # lines as there are tokens so that every mention is included
        con = chapter_text.concordance_list(character, lines=len(chapter_text.tokens))
        sentiments.append(con_sentiment_labmt10(con))
    sentiment_by_character_labmt10.append((character, sentiments))

# https://stackoverflow.com/questions/4971269/how-to-pick-a-new-color-for-each-plotted-line-within-a-figure-in-matplotlib
color = iter(cm.rainbow(np.linspace(0, 1, len(char_list_labmt10))))

for name, sentiments in sentiment_by_character_labmt10:
    c = next(color)
    plt.plot(sentiments, c=c, label=name)

plt.legend()
plt.xlabel('Chapter')
plt.ylabel('Average sentiment')
plt.title('Average sentiment by chapter for selected characters')
plt.figtext(.5, -0.1, "Plot of the average sentiment for Harry, Snape, and Voldemort in book 7 when calculating sentiment from LabMT1.0.", ha="center")

# NOTE: this binds the pyplot module itself, not a copy of the figure drawn above
plt_labmt = plt

### Sentiment for concordance of character, vaderSentiment
>For VADER we use the same approach and code, but we have to redefine how we calculate sentiment from concordance, since we just have to pass a string to the analyzer:

In [None]:
def con_sentiment(con):
    """VADER compound score for the context around one concordance hit.

    Input:  con - a single concordance line exposing the token lists `left`
            and `right` around the matched word.
    Output: the "compound" value that analyzer.polarity_scores reports for
            the left and right context joined with single spaces.
    """
    # polarity_scores takes a plain string, so join the tokens back together
    context = " ".join(con.left + con.right)
    return analyzer.polarity_scores(context)["compound"]

In [None]:
# The three characters compared in this experiment
char_list_vader = ["Harry_Potter", "Severus_Snape", "Tom_Riddle"]

# Files in the book 7 folder (os.listdir gives no ordering guarantee)
chapters_vader = os.listdir("B7")

# Read and tokenize every "replaced" chapter once and reuse the result for all
# characters, instead of re-reading each file once per character
chapter_texts_vader = []
for chapter in chapters_vader:
    if "replaced" in chapter:
        with open("B7/" + chapter) as f:
            raw = f.read()
        chapter_texts_vader.append(nltk.Text(nltk.wordpunct_tokenize(raw)))

# List of tuples (character, [sentiment for each chapter for that character])
sentiment_by_character_vader = []

for character in char_list_vader:
    sentiments = []
    for chapter_text in chapter_texts_vader:
        # concordance_list returns at most 25 lines by default; ask for as many
        # lines as there are tokens so that every mention is included
        cons = chapter_text.concordance_list(character, lines=len(chapter_text.tokens))
        sent_sum = 0
        lines = 0
        for con in cons:
            sent_sum += con_sentiment(con)
            lines += 1
        if lines == 0:
            # No concordance line: the character does not appear in this chapter
            sentiments.append(None)
        else:
            sentiments.append(sent_sum / lines)
    sentiment_by_character_vader.append((character, sentiments))

# https://stackoverflow.com/questions/4971269/how-to-pick-a-new-color-for-each-plotted-line-within-a-figure-in-matplotlib
color_vader = iter(cm.rainbow(np.linspace(0, 1, len(char_list_vader))))

for name, sentiments in sentiment_by_character_vader:
    c = next(color_vader)
    plt.plot(sentiments, c=c, label=name)

# Dashed lines at +/-0.05 mark the band labelled "Neutral region" in the legend
plt.axhline(y = 0.05, color ="purple", linestyle = '--', label="Neutral region")
plt.axhline(y = -0.05, color ="purple", linestyle = '--')

plt.legend()
plt.xlabel('Chapter')
plt.ylabel('Average sentiment')
plt.title('Average sentiment by chapter for selected characters')
plt.figtext(.5, -0.1, "Plot of the average sentiment for Harry, Snape, and Voldemort in book 7 when calculating sentiment with VADER.", ha="center")

# NOTE: this binds the pyplot module itself, not a copy of the figure drawn above
plt_vader = plt

### Conclusion: LabMT1.0 vs vaderSentiment
>Both results are similar, but we see an advantage in using VADER when we consider the sentiment for Harry Potter. Looking at the graphs we can see that there are similar trends for Harry throughout the book, but with VADER the sentiment becomes more consistent, in that it appears to be in the neutral region at some points, and then go out of it. On the other hand, LabMT1.0 seems to indicate that Harry is well above 5.1 throughout the book, which we would consider to be above neutral. Based on this preliminary test we believe that we will get a clearer picture from VADER.

## Functions for calculating sentiment
>The following section contains various functions we have defined to find and plot sentiment for various cases.

### ```sent_chars_book_list(char_list, book_list)```
>The function takes a list of character names and a list of paths to book folders. It computes the average sentiment of each character in the list on a chapter basis, across all of the given books. This can be used to find out how the sentiment of a single character, or of a group of characters such as a house, changes throughout the books.

In [None]:
"""
Input: A list of character names to look for as tokens
        and a list of paths to the folders holding the chapters of each book
Output: A list of tuples, with the character name and a list
        of the sentiments for each chapter for that character
"""
def sent_chars_book_list(char_list, book_list):
    """Per-chapter average sentiment for each character across several books.

    Input:  char_list - character names to look for as tokens.
            book_list - folder paths, each ending in a path separator because
            the file name is appended directly; every file in a folder is
            read as one chapter.
    Output: a list of tuples (character name, [sentiment per chapter]) in the
            order of char_list. A chapter entry is the mean of con_sentiment
            over that character's concordance lines, or None when the
            character has no concordance line in that chapter.
    """
    # One (name, sentiments) tuple per character; the lists are filled in place
    sentiment_of_character = [(n, []) for n in char_list]

    for book in book_list:
        for chapter in os.listdir(book):
            # Read in the chapter and tokenize
            with open(book + chapter) as f:
                raw = f.read()
            tokens = nltk.wordpunct_tokenize(raw)
            text = nltk.Text(tokens)

            for character, (_, chapter_sentiments) in zip(char_list, sentiment_of_character):
                # concordance_list returns at most 25 lines by default; ask for
                # as many lines as there are tokens so every mention is included
                cons = text.concordance_list(character, lines=len(tokens))
                scores = [con_sentiment(con) for con in cons]
                if scores:
                    chapter_sentiments.append(sum(scores) / len(scores))
                else:
                    # No concordance lines: the character did not appear
                    chapter_sentiments.append(None)
    return sentiment_of_character

### ```sent_book(path_to_book)```
>Calculates the sentiment for a book on chapter basis. In this function each chapter of a book is read in and the sentiment for the text is calculated. This allows us to see how the sentiment for a book changes as it progresses.

In [None]:
"""
Input: The path to a book.
Output: A list of sentiments for each chapter of the book.
"""
def sent_book(path_to_book):
    """Whole-chapter sentiment for every "replaced" chapter file of one book.

    Input:  path_to_book - folder path ending in a path separator, because
            the file name is appended to it directly.
    Output: a list with one "compound" value from analyzer.polarity_scores
            per file whose name contains "replaced", in os.listdir order.
    """
    sentiments_by_chapter = []

    for file_name in os.listdir(path_to_book):
        # Other files in the folder are ignored
        if "replaced" not in file_name:
            continue
        with open(path_to_book + file_name) as chapter_file:
            chapter_text = chapter_file.read()
        # The whole chapter is scored as a single text
        scores = analyzer.polarity_scores(chapter_text)
        sentiments_by_chapter.append(scores["compound"])

    return sentiments_by_chapter

### ```avgsent_group(group, label, book_list)```
>This function calculates the average sentiment for all of the names given in ```group``` and returns it as the sentiment for the name given in ```label```. The sentiments are calculated from the books given in ```book_list```. E.g. given the list of names of Gryffindor students and the label "Gryffindor", it returns a tuple ("Gryffindor", sentiment list), where the sentiment list contains the average sentiment for each chapter for those students.

In [None]:
def avgsent_group(group, label, book_list):
    """Per-chapter average sentiment of a whole group of characters.

    Input:  group - character names to look for as tokens.
            label - name under which the combined result is returned.
            book_list - folder paths, each ending in a path separator because
            the file name is appended directly; every file in a folder is
            read as one chapter.
    Output: a tuple (label, [sentiment per chapter]). A chapter entry is the
            mean of con_sentiment over the concordance lines of all group
            members, leaving out lines whose sentiment is exactly 0, or None
            when no line was counted for that chapter.
    """
    sent_chapters = []

    for book in book_list:
        # List each folder once and append per chapter, so the result cannot
        # get out of step with a separately pre-counted number of chapters
        for chapter in os.listdir(book):
            with open(book + chapter) as f:
                raw = f.read()
            tokens = nltk.wordpunct_tokenize(raw)
            text = nltk.Text(tokens)

            sent_sum = 0
            occurences = 0
            for member in group:
                # concordance_list returns at most 25 lines by default; ask for
                # as many lines as there are tokens so every mention is included
                for con in text.concordance_list(member, lines=len(tokens)):
                    sent = con_sentiment(con)
                    # Lines scoring exactly 0 are not part of the average
                    if sent != 0:
                        sent_sum += sent
                        occurences += 1

            if occurences == 0:
                sent_chapters.append(None)
            else:
                sent_chapters.append(sent_sum / occurences)
    return (label, sent_chapters)

### ```plot_sentiments(sentiment_by_character, figure_text, xs_vertical_lines, show_legend)```
>The function takes a list of tuples, where each tuple contains a name and a list of sentiments, a figure text, a list of tuples for placing vertical lines with labels, and a ```True```/```False``` flag for showing the legend. The function can take the result of ```sent_chars_book_list``` as input for the list of tuples with names and sentiments. This allows for fast and simple plotting.

In [None]:
"""
Input: A list of tuples containing a name and a sentiment list, a figure text,
       a list of tuples with labels and coordinates for vertical lines, and a
       true/false value for showing the legend.
Output: None, shows a plot
"""
def plot_sentiments(sentiment_by_character, figure_text, xs_vertical_lines, show_legend):
    """Plot one sentiment line per (name, sentiments) tuple and show the figure.

    Input:  sentiment_by_character - list of tuples (name, [sentiment values]).
            figure_text - caption placed below the plot.
            xs_vertical_lines - list of tuples (label, x); a vertical line is
            drawn at every x, the label itself is not drawn.
            show_legend - truthy to draw the legend.
    Output: None, shows a plot.
    """
    # Names containing one of these markers get a fixed colour, checked in this order
    fixed_colours = (
        ("Gryffindor", "red"),
        ("Slytherin", "green"),
        ("Hufflepuff", "yellow"),
        ("Ravenclaw", "blue"),
        ("Average", "black"),
    )
    # Three times as many rainbow colours as there are lines to plot
    palette = iter(cm.rainbow(np.linspace(0, 1, len(sentiment_by_character) * 3)))
    # The x-axis spans the length of the first sentiment list
    plt.xlim(0, len(sentiment_by_character[0][1]))

    for name, sentiments in sentiment_by_character:
        for marker, colour in fixed_colours:
            # "Salazar_Slytherin" is a character name, not the house line
            if marker == "Slytherin" and name == "Salazar_Slytherin":
                continue
            if marker in name:
                line_colour, opacity = colour, 1
                break
        else:
            # Every other line takes the next rainbow colour, half transparent
            line_colour, opacity = next(palette), 0.5
        plt.plot(sentiments, c=line_colour, alpha=opacity, label=name)

    # Dashed horizontal lines marking the neutral region
    for level in (0.05, -0.05):
        plt.axhline(y = level, color ="purple", linestyle = '--')

    # Vertical separators at the given x positions
    for _, x_position in xs_vertical_lines:
        plt.axvline(x = x_position, color = 'black')

    if show_legend:
        plt.legend()

    plt.xlabel('Chapter')
    plt.ylabel('Average sentiment')
    plt.figtext(.5, -0.1, figure_text, ha="center")

    plt.show()

## Sentiment for books
>First we want to explore how the sentiment is throughout the books. We have two ideas for measuring this: Take each chapter as a text and have VADER analyze the sentiment of that text for us. Or for each character from our list of characters, make concordance for each chapter for that character, and divide it by the total number of concordance lines for that chapter. We are going to test these two methods out to see which is more expressive:

>We start by defining the lists needed for plotting and for reading in all the books:

In [None]:
# (book, x) tuples: x is the 0-based plot position of the first chapter of the
# NEXT book, e.g. ("B1", 17) means positions 0-16 belong to book 1.
# The first boundary was 18, which did not fit the other five values: those are
# the running totals of 17, 18, 22, 37, 38 and 30 chapters per book.
# NOTE(review): assumes folder B1 holds 17 chapter files - confirm against the data.
book_list_wchapter = [("B1", 17), ("B2", 35), ("B3", 57), ("B4", 94), ("B5", 132), ("B6", 162)]
# Paths to the folders holding the chapter files; the trailing slash is needed
# because file names are appended to these paths directly
book_list = ["B1/", "B2/", "B3/", "B4/", "B5/", "B6/", "B7/"]

In [None]:
# Whole-chapter sentiment values for the seven books, concatenated in book order
series_sent = []
for book_path in ("B1/", "B2/", "B3/", "B4/", "B5/", "B6/", "B7/"):
    series_sent += sent_book(book_path)

In [None]:
# Use a wider figure from here on
plt.rcParams['figure.figsize'] = [15, 5]

# Caption and the single line to draw: each chapter scored by VADER as one text
sent_by_chapter = "Sentiment by chapter for entire series analyzed one chapter at a time"
plot_sentiments(
    sentiment_by_character=[("Sentiment for series", series_sent)],
    figure_text=sent_by_chapter,
    xs_vertical_lines=book_list_wchapter,
    show_legend=False,
)

>The above figure does not convey changes in the book very well, at most it seems that we can get an idea of the overall tone of the chapter, but not how it relates to the other chapters or how the story evolves and changes. 

>For the next part we are going to try the approach with making concordance for all of the characters for each chapter and taking the average:

In [None]:
# One (name, parentage, house, occupation) tuple per row of the character file
characters = []

with open("HP_characters.csv", "r", encoding="utf8") as sent_file:
    csv_reader_characters = csv.reader(sent_file, delimiter=",")
    for row in csv_reader_characters:
        # Spaces in the name become underscores; the first four columns are used
        name, parentage, house, occupation = row[0].replace(' ', '_'), row[1], row[2], row[3]
        characters.append((name, parentage, house, occupation))

In [None]:
# Only the names (first field) of the character tuples
character_names = [name for name, _, _, _ in characters]

In [None]:
# Per-chapter average over the concordance lines of every character in the list
avgsent_all = avgsent_group(
    group=character_names,
    label="Average sentiment of all characters",
    book_list=book_list,
)

In [None]:
# Caption for the plot of the all-character average
sentiment_books = "Average sentiment from concordance of character names throughout the books."
plot_sentiments(
    sentiment_by_character=[avgsent_all],
    figure_text=sentiment_books,
    xs_vertical_lines=book_list_wchapter,
    show_legend=True,
)

## Sentiment for houses

In [None]:
# Character names per house, selected on the house field of each character tuple
gryffindors = [name for name, _, house, _ in characters if house == "Gryffindor"]
hufflepuffs = [name for name, _, house, _ in characters if house == "Hufflepuff"]
ravenclaws = [name for name, _, house, _ in characters if house == "Ravenclaw"]
slytherins = [name for name, _, house, _ in characters if house == "Slytherin"]

In [None]:
# Size of each house list, one line per house
print(
    f"Number of characters from Gryffindor: {len(gryffindors)}",
    f"Number of characters from Hufflepuff: {len(hufflepuffs)}",
    f"Number of characters from Ravenclaw: {len(ravenclaws)}",
    f"Number of characters from slytherin: {len(slytherins)}",
    sep="\n",
)

In [None]:
# Per-chapter average sentiment of the Gryffindor group over all books in book_list.
# Reads and tokenizes every chapter file, so this cell is slow.
avgsent_gryffindor = avgsent_group(gryffindors, "Gryffindors", book_list)

In [None]:
# Per-chapter average sentiment for each of the four houses.
# The labels matter: plot_sentiments picks the house colour by looking for the
# house name inside the label, so a misspelt label loses its colour.
avgsent_gryffindor = avgsent_group(gryffindors, "Gryffindors", book_list)
avgsent_hufflepuff = avgsent_group(hufflepuffs, "Hufflepuffs", book_list)
avgsent_ravenclaw = avgsent_group(ravenclaws, "Ravenclaws", book_list)
avgsent_slytherin = avgsent_group(slytherins, "Slytherins", book_list)

In [None]:
# The four house averages in the order they are drawn
avg_sent_list = [avgsent_gryffindor, avgsent_slytherin, avgsent_hufflepuff, avgsent_ravenclaw]
plot_sentiments(
    sentiment_by_character=avg_sent_list,
    figure_text="Average sentiment for the four houses throughout the books",
    xs_vertical_lines=book_list_wchapter,
    show_legend=True,
)

In [None]:
label = "Gryffindors"
# One line per Gryffindor, followed by the house average as the last entry
gryff_sents = [*sent_chars_book_list(gryffindors, book_list), avgsent_gryffindor]

In [None]:
# Caption for the Gryffindor plot; the legend is left out
text_gryff_sent = "Average sentiments for Gryffindor, and sentimens for all Gryffindors"
plot_sentiments(
    sentiment_by_character=gryff_sents,
    figure_text=text_gryff_sent,
    xs_vertical_lines=book_list_wchapter,
    show_legend=False,
)

In [None]:
# The purple line in the plot above does not seem to follow the trend and may be
# Albus Dumbledore, so draw the house again without his individual line.
tup_dict = dict(gryff_sents)
# Default of None so a name that is not in the list does not raise KeyError
tup_dict.pop('Albus_Dumbledore', None)
tup_dict.pop('Gryffindors', None)
gryff_sent = list(tup_dict.items())

# Put the house average back as the last entry. It is the already computed
# average of all Gryffindors (Dumbledore included), so no second pass over the
# books is needed.
gryff_sent = gryff_sent + [avgsent_gryffindor]
# Plot the filtered list; plotting gryff_sents here would show Dumbledore again
plot_sentiments(gryff_sent, text_gryff_sent, book_list_wchapter, False)

In [None]:
label = "Slytherins"
# One line per Slytherin, followed by the house average as the last entry
slyth_sents = [*sent_chars_book_list(slytherins, book_list), avgsent_slytherin]

In [None]:
# Caption for the Slytherin plot; the legend is left out
text_slyth_sent = "Average sentiments for Slytherin, and sentimens for all Slytherins"
plot_sentiments(
    sentiment_by_character=slyth_sents,
    figure_text=text_slyth_sent,
    xs_vertical_lines=book_list_wchapter,
    show_legend=False,
)

In [None]:
label = "Hufflepuffs"
# One line per Hufflepuff, followed by the house average as the last entry
huff_sents = [*sent_chars_book_list(hufflepuffs, book_list), avgsent_hufflepuff]

In [None]:
# Unexpected plot: why are not all chapters from the first book included?
# Compare with the vertical line for the first book above.
text_huff_sent = "Average sentiments for Hufflepuff, and sentimens for all Hufflepuffs"
plot_sentiments(
    sentiment_by_character=huff_sents,
    figure_text=text_huff_sent,
    xs_vertical_lines=book_list_wchapter,
    show_legend=False,
)

In [None]:
label = "Ravenclaws"
# One line per Ravenclaw, followed by the house average as the last entry
rave_sents = [*sent_chars_book_list(ravenclaws, book_list), avgsent_ravenclaw]

In [None]:
# Unexpected plot: why is nothing plotted from 0 onwards?
text_rave_sent = "Average sentiments for Ravenclaw, and sentimens for all Ravenclaws"
plot_sentiments(
    sentiment_by_character=rave_sents,
    figure_text=text_rave_sent,
    xs_vertical_lines=book_list_wchapter,
    show_legend=False,
)

## Highest and lowest sentiments
>Investigating which characters have the highest and lowest sentiments by summing up their average sentiment values for each chapter they appeared in, and dividing by the number of chapters they appeared in into a sentiment score for that character.

In [None]:
# Per-chapter sentiment for every character individually
avgsent_all_individual = sent_chars_book_list(character_names, book_list)

# (name, score) tuples, where the score is the character's mean sentiment over
# the chapters they appear in
character_sent_scores = []
for name, sent_list in avgsent_all_individual:
    # None marks a chapter in which the character does not appear
    chapter_scores = [value for value in sent_list if value is not None]
    if not chapter_scores:
        # Never appears: there is no average to rank, so leave the character out
        continue
    # Divide by the number of chapters; a plain sum would favour characters
    # that simply appear in many chapters
    sent_score = sum(chapter_scores) / len(chapter_scores)
    character_sent_scores.append((name, sent_score))

In [None]:
# https://bobbyhadz.com/blog/python-sort-list-of-tuples-by-second-element
sorted_list = sorted(
    character_sent_scores,
    key=lambda t: t[1]
)
top_names = [n for n, s in sorted_list[-5:]]
top = [(n, l) for n, l in avgsent_all_individual if n in top_names]
bottom_names = [n for n, s in sorted_list[0:5]]
bottom = [(n, l) for n, l in avgsent_all_individual if n in bottom_names]

In [None]:
# Overall average first, then the five highest scoring characters
top_text = "Sentiment values of 5 highest sentiment, with total average for comparison."
plot_sentiments(
    sentiment_by_character=[avgsent_all] + top,
    figure_text=top_text,
    xs_vertical_lines=book_list_wchapter,
    show_legend=True,
)

In [None]:
# Overall average first, then the five lowest scoring characters
bottom_text = "Sentiment values of 5 lowest sentiment, with total average for comparison."
plot_sentiments(
    sentiment_by_character=[avgsent_all] + bottom,
    figure_text=bottom_text,
    xs_vertical_lines=book_list_wchapter,
    show_legend=True,
)

## Sentiment for main characters
>Who are the main characters? Definitely Harry, Ron, and Hermione.

In [None]:
# Per-chapter sentiment of the three main characters next to the overall average
main_character_list = ["Harry_Potter", "Ronald_Weasley", "Hermione_Granger"]
main_characters = [(n, l) for n, l in avgsent_all_individual if n in main_character_list]
# Own caption for this figure, so that it no longer reuses (and overwrites) the
# caption of the "5 lowest sentiment" plot
main_text = "Sentiment values of the main characters, with total average for comparison."
plot_sentiments([avgsent_all] + main_characters, main_text, book_list_wchapter, True)