In [None]:
import os
import nltk
import csv
import math
import numpy as np
import matplotlib.pyplot as plt

import networkx as nx
from networkx.algorithms import community

from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from wordcloud import WordCloud

from bs4 import BeautifulSoup
from community import community_louvain
from fa2 import ForceAtlas2

In [None]:
# Load every character as a (name, parentage, house, occupation, loyalty) tuple

characters = []

with open("HP_characters.csv", "r", encoding="utf8") as sent_file:
    for row in csv.reader(sent_file, delimiter=","):
        characters.append((
            row[0].replace(' ', '_'),  # names use underscores instead of spaces
            # lower-cased so that the same value is always spelled the same way
            row[1].lower(),
            row[2].lower(),
            row[3].lower(),
            row[4],  # loyalty is kept exactly as written in the file
        ))

# Graph with weights
Making a graph for each book. The nodes are the characters in the book, and edges connect characters that appear within the same window of consecutive sentences of a chapter. Edges have a weight corresponding to the number of windows in which those two characters appear together. Nodes have the attributes parentage, house, occupation and loyalty.

In [None]:
def weighted_temporal_graphs(character_list, path, sentence_no):
    """Build a weighted co-occurrence graph of the characters in one book.

    Only files in ``path`` whose name contains "replaced" are read (the files
    where aliases have been replaced with character names). Each text is split
    on ". " and walked in windows of ``sentence_no`` sentences. Two characters
    get an edge when both names occur (substring match) in the same window;
    the edge weight is the number of windows in which they occur together.

    Input: character_list = tuples (name, parentage, house, occupation, loyalty),
           path = the directory of the book,
           sentence_no = how many sentences to look at at a time (>= 1)
    Output: A weighted graph; nodes without edges are printed and removed
    Raises: ValueError if sentence_no is smaller than 1
    """
    if sentence_no < 1:
        # A window of zero sentences would never advance through the text
        raise ValueError("sentence_no must be at least 1")

    G = nx.Graph()

    # Go through each chapter in the book
    for chapter in os.listdir(path):
        # Only look at the files where aliases have been replaced with character names
        if "replaced" not in chapter:
            continue

        # NOTE(review): the character CSV is read as utf8 but the chapters use the
        # platform default encoding - confirm that this is intended.
        with open(os.path.join(path, chapter)) as f:
            text = f.read()

        # Put all characters from the chapter in the graph if they are not already there
        for character in character_list:
            if character[0] in text and character[0] not in G:
                G.add_node(character[0], parentage=character[1],
                           house=character[2], occupation=character[3], loyalty=character[4])

        # Split the text in sentences and look at the specified amount at a time
        sentences = text.split(". ")
        for count_start in range(0, len(sentences), sentence_no):
            current = " ".join(sentences[count_start:count_start + sentence_no])

            # Characters (from this or an earlier chapter) mentioned in this text piece
            present = [node for node in G.nodes if node in current]

            # Visit every unordered pair exactly once, so that one shared text piece
            # adds exactly 1 to the weight of the edge between the two characters
            for position, character_source in enumerate(present):
                for character_target in present[position + 1:]:
                    if G.has_edge(character_source, character_target):
                        G[character_source][character_target]['weight'] += 1
                    else:
                        G.add_edge(character_source, character_target, weight=1)

    # Remove nodes without edges
    isolated = list(nx.isolates(G))
    print(isolated)
    G.remove_nodes_from(isolated)
    return G

In [None]:
#https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.operators.binary.compose.html#networkx.algorithms.operators.binary.compose

def combine_graphs(g1, g2):
    """Merge two weighted graphs into one.

    Input: two graphs to combine
    Output: the combined graph; an edge present in both graphs gets the
            sum of its two weights
    """
    # Edges that exist in both graphs need their weights added together
    summed_weights = {}
    for shared_edge in g1.edges & g2.edges:
        summed_weights[shared_edge] = (
            g1.edges[shared_edge]['weight'] + g2.edges[shared_edge]['weight']
        )

    merged = nx.compose(g1, g2)
    nx.set_edge_attributes(merged, summed_weights, 'weight')
    return merged


In [None]:
def get_weight_sums(graph):
    """Sum the weights of the edges attached to every node.

    Input: a graph whose edges all carry a "weight" attribute
    Output: A list of the summed edge weights for each node,
            this list is ordered as the list of nodes returned from graph.nodes
    """
    totals = {node: 0 for node in graph.nodes}

    # One pass over the edges instead of scanning all edges for every node
    for source, target in graph.edges:
        weight = graph[source][target]["weight"]
        totals[source] += weight
        # A self-loop counts once for its node
        if target != source:
            totals[target] += weight

    return [totals[node] for node in graph.nodes]

In [None]:
# https://stackoverflow.com/questions/5294955/how-to-scale-down-a-range-of-numbers-with-a-known-min-and-max-value

def scaled_weights(a, b, G):
    """Scale the summed edge weight of every node into the range [a, b].

    Input: a = minimum value for scaled weights
           b = maximum value for scaled weights
           G = graph
    Output: A list with the scaled weights, ordered like G.nodes.
            An empty graph gives an empty list. When every node has the same
            summed weight there is no range to scale, so every node gets a.
    """
    weights = get_weight_sums(G)
    if not weights:
        return []

    min_weight = min(weights)
    span = max(weights) - min_weight
    if span == 0:
        # Avoid dividing by zero when all the weights are equal
        return [a for _ in weights]

    # Floor division keeps the result a whole number when the inputs are integers
    return [((b - a) * (w - min_weight) // span) + a for w in weights]

In [None]:
# https://stackoverflow.com/questions/13070461/get-indices-of-the-top-n-values-of-a-list

def get_nodes_extreme(G, n, f):
    """Pick the n nodes with the largest or smallest summed edge weight.

    Input: G = graph
           n = number of nodes to pick; n <= 0 picks nothing
           f = flag, True = top (largest sums), False = bottom (smallest sums)
    Output: A tuple of three lists, all in ascending order of summed weight:
            the names of the picked nodes,
            their node sizes (summed weights scaled to 50-800),
            their indices in list(G.nodes())
    """
    node_names = list(G.nodes())
    order = np.argsort(get_weight_sums(G))

    if n <= 0:
        # order[-0:] would select every node, so handle "nothing" explicitly
        n_indices = order[:0]
    elif f:
        n_indices = order[-n:]
    else:
        n_indices = order[:n]

    sc_weights = scaled_weights(50, 800, G)
    names = [node_names[i] for i in n_indices]
    weights = [sc_weights[i] for i in n_indices]
    indices = [int(i) for i in n_indices]
    return (names, weights, indices)

In [None]:
# Colors to use when drawing the communities
# (eight hex colors; not referenced by any other cell visible in this notebook)
community_colors = ["#B6F20D", "#0DF2BC", "#490DF2", "#F20D43", "#13EC33", "#1360EC", "#EC13CC", "#EC9F13"]

In [None]:
def draw_network(graph, title="", save_path='networkCombined.png'):
    """Plot a graph with a ForceAtlas2 layout and save the figure.

    The size of each node is its summed edge weight scaled to 50-800. The
    10 nodes with the largest sums, the 10 with the smallest sums and the
    remaining nodes are drawn in three different colors.

    Input: a graph,
           title for the plot, default is empty string,
           save_path = file the PNG is written to; pass None to skip saving
    Output: plots the graph using forceAtlas
    """
    # Adjusting figure size
    plt.rcParams['figure.figsize'] = [10, 10]

    forceatlas2 = ForceAtlas2(
                            # Behavior alternatives
                            outboundAttractionDistribution=True,  # Dissuade hubs
                            linLogMode=False,  # NOT IMPLEMENTED
                            adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
                            edgeWeightInfluence=1.0,

                            # Performance
                            jitterTolerance=1.0,  # Tolerance
                            barnesHutOptimize=True,
                            barnesHutTheta=1.2, # original 1.2
                            multiThreaded=False,  # NOT IMPLEMENTED

                            # Tuning
                            scalingRatio=2.0,
                            strongGravityMode=True,
                            gravity=0.1, # original 0.5

                            # Log
                            verbose=True)

    positions = forceatlas2.forceatlas2_networkx_layout(graph, pos=None, iterations=2000)
    nx.draw_networkx_edges(graph, positions, edge_color="black", alpha=0.1)

    # Making 3 lists: top n max_weights, bottom n min_weights, rest
    max_nodes, max_sizes, _ = get_nodes_extreme(graph, 10, True)
    min_nodes, min_sizes, _ = get_nodes_extreme(graph, 10, False)

    # Collect the remaining nodes together with their own size, so that every
    # node keeps the size that belongs to it
    extreme_nodes = set(max_nodes) | set(min_nodes)
    rest_nodes = []
    rest_sizes = []
    for node, size in zip(graph.nodes(), scaled_weights(50, 800, graph)):
        if node not in extreme_nodes:
            rest_nodes.append(node)
            rest_sizes.append(size)

    nx.draw_networkx_nodes(graph, positions, nodelist=rest_nodes, node_color='#efbc2f', node_size=rest_sizes, edgecolors='black', alpha=1)
    nx.draw_networkx_nodes(graph, positions, nodelist=min_nodes, node_color='#366447', node_size=min_sizes, edgecolors='black', alpha=1)
    nx.draw_networkx_nodes(graph, positions, nodelist=max_nodes, node_color='#a6332e', node_size=max_sizes, edgecolors='black', alpha=1)

    plt.axis('off')
    plt.title(title)
    plt.figtext(.5, -0.05, "The size of a node indicates the scaled sum of its weights.", ha="center")
    # Used to save the fig for the paper
    if save_path:
        plt.savefig(save_path, format='png', transparent=True)
    plt.show()

In [None]:
# One network of book 1 for every sentence-window size that is tried out

graph_list = [
    weighted_temporal_graphs(characters, "B1/", window_size)
    for window_size in (5, 10, 20, 30, 40, 50)
]



In [None]:
# Draw the book 1 networks, one per sentence-window size
sentence_len = [5, 10, 20, 30, 40, 50]

for i, graph in enumerate(graph_list):
    draw_network(graph, f"Book one network with interval of {sentence_len[i]} sentences")

In [None]:
# One network per book (directories B1/ to B7/), each with windows of 5 sentences

book_graphs = [
    weighted_temporal_graphs(characters, f"B{book_no}/", 5)
    for book_no in range(1, 8)
]

In [None]:
# Used to create a figure for the paper
# (draw_network saves the figure to 'networkCombined.png' by default)
draw_network(book_graphs[0], "Network of book 1")

In [None]:
# Draw the separate network of every book
for i, graph in enumerate(book_graphs):
    draw_network(graph, f"Network of book {i+1}")

In [None]:
# Cumulative networks: entry k combines book 1 up to and including book k+1
combined_nx = [book_graphs[0]]
for book_idx in range(1, 7):
    combined_nx.append(combine_graphs(combined_nx[book_idx - 1], book_graphs[book_idx]))

In [None]:
# Used to create a figure for the paper
# (combined_nx[6] is the network of all seven books combined)
draw_network(combined_nx[6], "Combined network of all books")

In [None]:
# Draw every cumulative network (book 1, books 1-2, ..., books 1-7)
for i, graph in enumerate(combined_nx):
    title = "Network of book 1" if i == 0 else f"Network of book 1-{i+1}"
    draw_network(graph, title)

In [None]:
# Report how many nodes and edges every cumulative network has

for i, graph in enumerate(combined_nx):
    title = " edges in the network of book 1" if i == 0 else f" edges in the network of book 1-{i+1}"
    print(f"There are {graph.number_of_nodes()} nodes and {graph.number_of_edges()}{title}")

In [None]:
# Finding the character with biggest increase in percentage of its edges:
# edge count in the first book where the character has edges vs. all books combined
edge_increase = []
startG = book_graphs[0]
endG = combined_nx[6]
for char in characters:
    # Walk through the books in order until the character has at least one edge
    edgeStart = 0
    for first_graph in [startG] + book_graphs[1:7]:
        edgeStart = len(first_graph.edges(char[0]))
        if edgeStart != 0:
            break

    if edgeStart == 0:
        # No edges in any book: print the name and leave the character out
        print(char[0])
        continue

    edgeEnd = len(endG.edges(char[0]))
    percentage = (edgeEnd - edgeStart) * 100 / edgeStart
    edge_increase.append((char[0], edgeStart, edgeEnd, percentage))

print(max(edge_increase, key=lambda item: item[3]))

In [None]:
# Used to find info for the paper:
# for each character, the edge count in book 1 and then in all books combined
for name in ('Harry_Potter', 'Ronald_Weasley', 'Hermione_Granger'):
    print(len(book_graphs[0].edges(name)))
    print(len(combined_nx[6].edges(name)))

In [None]:
# Nodes and edges of the cumulative networks in one plot with two y-axes
# (left axis: nodes, right axis: edges).
# Not used in paper as it was difficult to read
networks = [1, 2, 3, 4, 5, 6, 7]
no_nodes = [graph.number_of_nodes() for graph in combined_nx]
no_edges = [graph.number_of_edges() for graph in combined_nx]

ax1 = plt.subplot()
l1 = ax1.plot(networks, no_nodes, color='red')[0]
ax2 = ax1.twinx()
l2 = ax2.plot(networks, no_edges, color='blue')[0]

ax1.set_xlabel('No. of books combined')
ax1.set_ylabel("No. of nodes")
ax2.set_ylabel("No. of edges")
for axis, axis_color in ((ax1, 'red'), (ax2, 'blue')):
    axis.tick_params(axis="y", labelcolor=axis_color)

plt.legend([l1, l2], ["No. of nodes", "No. of edges"])

plt.show()

In [None]:
# Number of edges of ten characters in every cumulative network
char = ["Harry_Potter", "Ronald_Weasley", "Hermione_Granger", "Albus_Dumbledore", "Severus_Snape", "Tom_Riddle", 
        "Rubeus_Hagrid", "Draco_Malfoy", "Ginevra_Weasley", "Neville_Longbottom"]
colors = ['#a6332e', '#efbc2f', '#3c4e91', '#366447', '#aaaaaa', '#946b2d', 'orchid', '#d3a625', 'orangered', 'green']
font = 15
networks = [1, 2, 3, 4, 5, 6, 7]
plt.rcParams["figure.figsize"] = (10,11)

# One list per character: its edge count in each of the cumulative networks
no_edges = [[len(list(graph.edges(c))) for graph in combined_nx] for c in char]

for character_name, line_color, e_list in zip(char, colors, no_edges):
    plt.plot(networks, e_list, label=character_name.replace('_', ' '), color=line_color)

plt.title("Evolution of the network of ten main characters")
plt.xlabel("No. of books combined", fontsize=font)
plt.ylabel("No. of edges", fontsize=font)
plt.xticks(fontsize=font)
plt.yticks(fontsize=font)
plt.legend(fontsize=font)
plt.savefig('plotMainCharacters.png', format='png', transparent=True)
plt.show()

In [None]:
# Nodes per book (bars) and nodes in the cumulative networks (line)
font = 20
networks = [1, 2, 3, 4, 5, 6, 7]
no_combinedNodes = [graph.number_of_nodes() for graph in combined_nx]
no_bookNodes = [graph.number_of_nodes() for graph in book_graphs]

# NOTE(review): counts and bins are not used by the plot below
counts, bins = np.histogram(no_combinedNodes, bins=7)

plt.plot(networks, no_combinedNodes, label='Combined books', color='#efbc2f', alpha=1)
plt.bar(networks, no_bookNodes, color='#3c4e91', edgecolor='black', alpha=0.8)
plt.xlabel('Book no.', fontsize=font)
plt.ylabel('Number of nodes', fontsize=font)
plt.xticks(fontsize=font)
plt.yticks(fontsize=font)
plt.legend(fontsize=font)
plt.savefig('plotNodes.png', format='png', transparent=True)
plt.show()

In [None]:
# Plot showing how many edges there are in each book (bars) and for the books combined (line)
networks = [1, 2, 3, 4, 5, 6, 7]
font = 15

no_combinedEdges = [graph.number_of_edges() for graph in combined_nx]
no_bookEdges = [graph.number_of_edges() for graph in book_graphs]

plt.plot(networks, no_combinedEdges, color = '#366447', alpha = 1, label = 'Combined books')
plt.bar(networks, no_bookEdges, color = '#a6332e', edgecolor = 'black', alpha = 0.8)
plt.xlabel('Book no.', fontsize=font)
# This figure counts edges, so the axis is labelled accordingly
plt.ylabel('Number of edges', fontsize=font)
plt.legend(fontsize = font)
plt.xticks(fontsize=font)
plt.yticks(fontsize=font)
plt.savefig('plotEdges.png', format='png', transparent=True)
plt.show()

## Communities

In [None]:
def communities(graph):
    """Split a graph into communities with the Louvain method.

    Input: a graph to divide in communities
    Output: A list with the different communities (each a list of node names),
            largest community first
    """
    partition = community_louvain.best_partition(graph)

    # Group the nodes by the community id they were assigned
    members = {}
    for node, com in partition.items():
        members.setdefault(com, []).append(node)

    partition_list = [members[com] for com in set(partition.values())]
    partition_list.sort(key=len, reverse=True)
    return partition_list
    

In [None]:
# Make dictionary for wordclouds with the character names and their weights;
# equal attributes are summarized.
# Was used to try and make wordclouds with the names of
# the characters in the communities and their corresponding attributes.
# But this did not end up as we wanted so not used in the paper

def wordcloud_dict(community, graph):
    """Build the word frequencies for the word cloud of one community.

    Input: a community and the graph it is extracted from
    Output: A dictionary with all the characters in the community and the
            parentages, houses and occupations belonging to the characters.
            Each character (underscores replaced by spaces) gets its summed
            edge weight inside the community as value.
            Each parentage, house and occupation counts the number of
            characters it belongs to; the value 'other' is left out.
    """
    subG = graph.subgraph(community)
    weight_of = dict(zip(subG.nodes, get_weight_sums(subG)))
    parentages = nx.get_node_attributes(subG, 'parentage')
    houses = nx.get_node_attributes(subG, 'house')
    occupations = nx.get_node_attributes(subG, 'occupation')

    cloud_freq = {}
    for character in community:
        attributes = (parentages[character], houses[character], occupations[character])

        cloud_freq[character.replace('_', ' ')] = weight_of[character]

        for attribute in attributes:
            if attribute != 'other':
                cloud_freq[attribute] = cloud_freq.get(attribute, 0) + 1

    return cloud_freq

In [None]:
# Was used to try and make wordclouds with the names of
# the characters in the communities and their corresponding attributes.
# But this did not end up as we wanted so not used in the paper

def make_com_dicts(com_list, graph):
    """Build one word cloud dictionary per community.

    Input: a list of communities and the graph they're extracted from
    Output: a list with a dictionary for each community
    """
    return [wordcloud_dict(com, graph) for com in com_list]
    

In [None]:
# Was used to try and make wordclouds with the names of
# the characters in the communities and their corresponding attributes.
# But this did not end up as we wanted so not used in the paper

def draw_word_cloud(dicts):
    """Plot one word cloud per community dictionary in a grid of two columns.

    Input: a list of dictionaries (word -> frequency), one per community
    Output: Wordclouds plotted for the community dictionaries given
    """
    # The figure size has to be set before the figure is created to take effect
    plt.rcParams['figure.figsize'] = [15, 20]
    fig = plt.figure()

    # At least the 5 x 2 grid, with more rows when there are more than 10 clouds
    rows = max(5, (len(dicts) + 1) // 2)

    # One color step per community plus a margin of 30. The highest community
    # id is len(dicts) - 1, so this needs no partition from outside the function.
    colormap = plt.get_cmap('hsv', len(dicts) + 29)

    for i in range(len(dicts)):
        ax = fig.add_subplot(rows, 2, i+1)
        wordcloud = WordCloud(background_color='black', width=2200,
                          height=1800, collocations=False,
                              # Different colormaps https://matplotlib.org/stable/tutorials/colors/colormaps.html
                          colormap = colormap
                        ).generate_from_frequencies(dicts[i])

        ax.imshow(wordcloud)
        ax.axis('off')
    plt.show()

In [None]:
# Communities found in the network of all seven books combined
all_communities = communities(combined_nx[6])

In [None]:
# Communities of every separate book network, in book order

book_communities = [communities(graph) for graph in book_graphs]

In [None]:
# For every book: print the book number, its communities and the size of each community

for i, com in enumerate(book_communities):
    print(f'Book no.: {i+1}')
    print(com)
    for part in com:
        print(len(part))

In [None]:
# None of the name/attribute word clouds in these cells are used in the paper.
# They were made to see how it would turn out with using the names and attributes.

# Wordclouds for the communities of all books combined
dictionary = make_com_dicts(all_communities, combined_nx[6])
draw_word_cloud(dictionary)

In [None]:
# Wordclouds (character names and attributes) for the communities of book 1
dictionary1 = make_com_dicts(book_communities[0], book_graphs[0])
draw_word_cloud(dictionary1)

In [None]:
# Wordclouds (character names and attributes) for the communities of book 2
dictionary2 = make_com_dicts(book_communities[1], book_graphs[1])
draw_word_cloud(dictionary2)

In [None]:
# Wordclouds (character names and attributes) for the communities of book 3
dictionary3 = make_com_dicts(book_communities[2], book_graphs[2])
draw_word_cloud(dictionary3)

In [None]:
# Wordclouds (character names and attributes) for the communities of book 4
dictionary4 = make_com_dicts(book_communities[3], book_graphs[3])
draw_word_cloud(dictionary4)

In [None]:
# Wordclouds (character names and attributes) for the communities of book 5
dictionary5 = make_com_dicts(book_communities[4], book_graphs[4])
draw_word_cloud(dictionary5)

In [None]:
# Wordclouds (character names and attributes) for the communities of book 6
dictionary6 = make_com_dicts(book_communities[5], book_graphs[5])
draw_word_cloud(dictionary6)

In [None]:
# Wordclouds (character names and attributes) for the communities of book 7
dictionary7 = make_com_dicts(book_communities[6], book_graphs[6])
draw_word_cloud(dictionary7)

### Extracting texts from books belonging to communities

In [None]:
def extract_com_texts(com_list, book_dir, sentence_no):
    """Collect the book text that mentions the characters of each community.

    Only the files in ``book_dir`` whose name contains "replaced" are read,
    the same files the networks are built from. Each text is split on ". " and
    cut into pieces of ``sentence_no`` sentences. A piece belongs to a
    community when it contains the name of at least one of its characters;
    every piece is added at most once per community. The pieces are tokenized
    and English stopwords and tokens of a single character are dropped.

    Input: A list of communities (at most the first 10 are used),
           the directory of the book,
           how many sentences to read at a time (>= 1)
    Output: A list with the list of words belonging to each community
    Raises: ValueError if sentence_no is smaller than 1
    """
    if sentence_no < 1:
        # A piece of zero sentences would never advance through the text
        raise ValueError("sentence_no must be at least 1")

    # Read and cut every chapter once instead of once per community
    text_pieces = []
    for chap in os.listdir(book_dir):
        if "replaced" not in chap:
            continue
        with open(os.path.join(book_dir, chap)) as f:
            sentences = f.read().split(". ")
        for count_start in range(0, len(sentences), sentence_no):
            text_pieces.append(" ".join(sentences[count_start:count_start + sentence_no]))

    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Filtered tokens per piece, filled on first use so that a piece shared by
    # several communities is only tokenized once
    filtered_tokens = {}

    community_strings = []
    for community in com_list[:10]:
        com_words = []
        for piece_no, piece in enumerate(text_pieces):
            if any(char in piece for char in community):
                if piece_no not in filtered_tokens:
                    filtered_tokens[piece_no] = [
                        w for w in word_tokenize(piece)
                        if w not in stop_words and len(w) > 1
                    ]
                com_words.extend(filtered_tokens[piece_no])
        community_strings.append(com_words)

    return community_strings
    

In [None]:
# Words belonging to the communities of each book (books 1 to 7, pieces of 5 sentences)
(com_strings1, com_strings2, com_strings3, com_strings4,
 com_strings5, com_strings6, com_strings7) = (
    extract_com_texts(book_communities[book_idx], f'./B{book_idx + 1}/', 5)
    for book_idx in range(7)
)


### Extract wikitext for each community

In [None]:
# Function used to find the wikitexts belonging to each character in a community
# Not used as we went with the text from the books instead
def extract_com_wikitexts(com_list, directory):
    """Collect the wiki text of the characters of each community.

    For every character the file directory + 'clean_' + name + '.txt' is read,
    stripped of HTML markup and tokenized. English stopwords (compared in
    lower case) and tokens of at most two characters are dropped.

    Input: A list of communities (at most the first 10 are used),
           the directory with the character files
    Output: A list with the list of words belonging to each community
    """
    # Slicing already copes with fewer than 10 communities
    community_texts = []
    for community in com_list[:10]:
        com_txt = []
        for char in community:
            with open(directory + 'clean_' + char + '.txt') as f:
                text = f.read()
            com_txt.extend(nltk.word_tokenize(BeautifulSoup(text, 'html.parser').get_text()))
        community_texts.append(com_txt)

    stop_words = frozenset(nltk.corpus.stopwords.words('english'))

    return [
        [w for w in txt if w.lower() not in stop_words and len(w) > 2]
        for txt in community_texts
    ]

In [None]:
# Wiki-text words for the communities of book 1, read from ./characters/
com_wikistrings1 = extract_com_wikitexts(book_communities[0], './characters/')

In [None]:
def unique(com_str):
    """Reduce the words of every community to its distinct terms.

    Input: List of community strings (one list of words per community)
    Output: A list with the unique terms for each community
            (the order of the terms within a community is not defined)
    """
    return [list(set(community_words)) for community_words in com_str]

In [None]:
# Unique terms per community for each of the seven books
(unique_terms1, unique_terms2, unique_terms3, unique_terms4,
 unique_terms5, unique_terms6, unique_terms7) = map(
    unique,
    (com_strings1, com_strings2, com_strings3, com_strings4,
     com_strings5, com_strings6, com_strings7),
)

In [None]:
# Unique wiki-text terms per community of book 1
unique_wikiterms1 = unique(com_wikistrings1)

# TF-IDF

In [None]:
def idf(word, unique_list):
    """Compute the smoothed inverse document frequency of a word.

    Input: A word,
           a list with the unique terms for each community
    Output: The IDF value found for the word: log(N / (1 + n)) + 1, where N is
            the number of communities and n the number of communities that
            contain the word
    """
    term_appears = sum(1 for sublist in unique_list if word in sublist)
    return math.log(len(unique_list) / (1 + term_appears)) + 1

In [None]:
def tfidf(community_str, unique_words):
    """Compute the TF-IDF value of every word of every community.

    TF is the count of the word divided by the total number of words of the
    community; IDF comes from idf() over the unique terms of all communities.

    Input: A list with the community strings (one list of words per community),
           a list with the unique terms for each community
    Output: A list containing, for each community, a list of (word, TF-IDF value)
            tuples
    """
    # Sets make the membership test inside idf() constant time, and the IDF of
    # a word is the same for every community, so it is computed only once
    unique_sets = [set(terms) for terms in unique_words]
    idf_cache = {}

    tfidf_list = []
    for community_words in community_str:
        fdist = FreqDist(community_words)
        total_terms = len(community_words)
        community_scores = []
        for word in fdist:
            if word not in idf_cache:
                idf_cache[word] = idf(word, unique_sets)
            tf_val = fdist[word] / total_terms
            community_scores.append((word, tf_val * idf_cache[word]))
        tfidf_list.append(community_scores)

    return tfidf_list

In [None]:
# TF-IDF values per community for each of the seven books
(tfidf_list1, tfidf_list2, tfidf_list3, tfidf_list4,
 tfidf_list5, tfidf_list6, tfidf_list7) = (
    tfidf(book_strings, book_terms)
    for book_strings, book_terms in (
        (com_strings1, unique_terms1),
        (com_strings2, unique_terms2),
        (com_strings3, unique_terms3),
        (com_strings4, unique_terms4),
        (com_strings5, unique_terms5),
        (com_strings6, unique_terms6),
        (com_strings7, unique_terms7),
    )
)


In [None]:
# TF-IDF values of the wiki-text words per community of book 1
tfidf_wikilist1 = tfidf(com_wikistrings1, unique_wikiterms1)

## Wordclouds for communities with book text

In [None]:
def wordCloud(tfidf_list):
    """Plot one word cloud per community in a grid of two columns.

    Input: A list with the words and their TF-IDF values for each community
    Output: A wordcloud plot for each community with their corresponding words
    """
    # The figure size has to be set before the figure is created to take effect
    plt.rcParams['figure.figsize'] = [15, 20]
    fig = plt.figure()

    # At least the 5 x 2 grid, with more rows when there are more than 10 clouds
    rows = max(5, (len(tfidf_list) + 1) // 2)

    for i in range(len(tfidf_list)):
        ax = fig.add_subplot(rows, 2, i+1)
        wordcloud = WordCloud(background_color='white', width=2200,
                          height=1800, collocations=False).generate_from_frequencies(dict(tfidf_list[i]))

        ax.imshow(wordcloud)
        ax.axis('off')
    plt.show()

In [None]:
# Word clouds of the TF-IDF weighted book text for the communities of book 1
wordCloud(tfidf_list1)

In [None]:
# Word clouds of the TF-IDF weighted book text for the communities of book 2
wordCloud(tfidf_list2)

In [None]:
# Word clouds of the TF-IDF weighted book text for the communities of book 3
wordCloud(tfidf_list3)

In [None]:
# Word clouds of the TF-IDF weighted book text for the communities of book 4
wordCloud(tfidf_list4)

In [None]:
# Word clouds of the TF-IDF weighted book text for the communities of book 5
wordCloud(tfidf_list5)

In [None]:
# Word clouds of the TF-IDF weighted book text for the communities of book 6
wordCloud(tfidf_list6)

In [None]:
# Word clouds of the TF-IDF weighted book text for the communities of book 7
wordCloud(tfidf_list7)

## Wordclouds for communities with wiki text

In [None]:
# Word clouds of the TF-IDF weighted wiki text for the communities of book 1
wordCloud(tfidf_wikilist1)