In [1]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from string import punctuation
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Grab the UTF-8 plain text for The Tempest from the Gutenberg Project
html_doc = requests.get("https://www.gutenberg.org/files/23042/23042-h/23042-h.htm").text

# Feed into BeautifuLSoup to get a clean ASCII string
soup = BeautifulSoup(html_doc, 'html.parser')
plain_text = soup.get_text()

# Tokenize into lowercase words
words = word_tokenize(plain_text.lower())

# Create a list of stop words
stop_words = stopwords.words("english") + list(punctuation)

# Remove all unwanted words
completewords = [w for w in words if w not in stop_words]

# Remove all lowercase words
#completeupper = [w for w in completewords if w[0].isupper()]
#completeupper = [w for w in words if w[0].isupper()]

# Find the frequency of all words
freq = FreqDist(completewords)

# Turn this list into a pandas dataframe
wordlist = []
wordfreq = []
for i in sorted(freq, key=freq.get, reverse=True):
    wordlist.append(i)
    wordfreq.append(freq[i])
    
df = pd.DataFrame({'wordlist':wordlist, 'wordfreq':wordfreq})

In [3]:
for i in range(50):
    print(df["wordlist"][i])

thou
pope
pros
thee
f4
thy
f1
f3
f2
project
...
sir
conj
seb
1
ff
ant
shall
iâll
ste
work
would
mir
gutenberg-tm
ari
2
gon
cal
good
come
ariel
rowe
upon
like
scene
enter
one
mine
capell
make
trin
hanmer
note
ii
theobald
alon
us
works
must
monster


In [4]:
words = ["thou", "pope", "thee", "thy", "sir", "shall", "work", "would", "good", "come", "upon", "like", "one", "mine", "make", "us", 
         "works", "must", "monster", ]

In [5]:
import networkx as nx
G = nx.Graph()
G.add_nodes_from(words)
sentences = sent_tokenize(plain_text.lower())
for sentence in sentences:
    word_dict = {}
    for word in words:
        word_dict[word] = 0
    frag = word_tokenize(sentence)
    for word in words:
        if word in frag:
            word_dict[word] = word_dict[word] + 1
    for w1 in words:
        if word_dict[w1] > 0:
            for w2 in words:
                if word_dict[w2] > 0 and w1 != w2:
                    G.add_edge(w1, w2)

In [6]:
G.nodes()

NodeView(('thou', 'pope', 'thee', 'thy', 'sir', 'shall', 'work', 'would', 'good', 'come', 'upon', 'like', 'one', 'mine', 'make', 'us', 'works', 'must', 'monster'))

In [7]:
G.edges()

EdgeView([('thou', 'thy'), ('thou', 'good'), ('thou', 'thee'), ('thou', 'one'), ('thou', 'mine'), ('thou', 'must'), ('thou', 'upon'), ('thou', 'would'), ('thou', 'make'), ('thou', 'come'), ('thou', 'shall'), ('thou', 'work'), ('thou', 'like'), ('thou', 'works'), ('thou', 'monster'), ('thou', 'us'), ('thou', 'sir'), ('thou', 'pope'), ('pope', 'upon'), ('pope', 'mine'), ('pope', 'sir'), ('pope', 'one'), ('pope', 'thee'), ('pope', 'would'), ('pope', 'make'), ('pope', 'shall'), ('thee', 'thy'), ('thee', 'one'), ('thee', 'mine'), ('thee', 'come'), ('thee', 'upon'), ('thee', 'make'), ('thee', 'shall'), ('thee', 'work'), ('thee', 'like'), ('thee', 'must'), ('thee', 'works'), ('thee', 'would'), ('thee', 'monster'), ('thee', 'good'), ('thee', 'sir'), ('thy', 'one'), ('thy', 'mine'), ('thy', 'come'), ('thy', 'work'), ('thy', 'upon'), ('thy', 'make'), ('thy', 'shall'), ('thy', 'like'), ('thy', 'good'), ('thy', 'works'), ('thy', 'monster'), ('thy', 'must'), ('thy', 'sir'), ('thy', 'would'), ('thy'

In [8]:
from bokeh.io import output_notebook, show, save
from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine, LabelSet
from bokeh.plotting import figure
from bokeh.plotting import from_networkx
output_notebook()

In [9]:
plot = figure()

# layouts: https://networkx.org/documentation/stable/reference/drawing.html#module-networkx.drawing.layout
network_graph = from_networkx(G, nx.spring_layout)

plot.renderers.append(network_graph)

show(plot)

# Meaning of the graph
First I calculated the word frequency of each word in The Tempest by Shakespeare. Then I took the first 19 words (not including weird words or words that I are meant solely for plays such as abbreviated character names, notes, etc) and I looked through each sentence. Every sentence that a word was used in conjunction with another word, I added a edge between the two nodes. That's how I created this graph. I hope that this graph illuminates what kinds of words are used on their own the most and hopefully it shows how significant those words are to the body of work.

In [10]:
nx.density(G)

0.7719298245614035

In [11]:
nx.degree_centrality(G)

{'thou': 1.0,
 'pope': 0.5,
 'thee': 0.9444444444444444,
 'thy': 0.9444444444444444,
 'sir': 0.7777777777777777,
 'shall': 0.7777777777777777,
 'work': 0.611111111111111,
 'would': 0.8888888888888888,
 'good': 0.7222222222222222,
 'come': 0.5,
 'upon': 1.0,
 'like': 0.7777777777777777,
 'one': 0.8888888888888888,
 'mine': 0.8333333333333333,
 'make': 0.9444444444444444,
 'us': 0.7222222222222222,
 'works': 0.4444444444444444,
 'must': 0.7777777777777777,
 'monster': 0.611111111111111}

# Network density and most important nodes
The network density is about 0.772 and that shows that many of the words in this network graph have been used in sentences together. I think the most important nodes are the nodes with the least degree centrality. These words would be pope, monster, works, or come. Similar to how the use of "the" in Macbeth make is scary, I feel like if we analyzed the way these words are used in The Tempest, it could say a lot about the play itself. Here are the values of degree centrality for these nodes:  
'pope': 0.5, 'come': 0.5, 'monster': 0.611111111111111,  'works': 0.4444444444444444

In [19]:
# Making the graph more aesthetically pleasing
dg = dict(nx.degree(G))
nx.set_node_attributes(G, name='degree', values=dg)

# Trying to make the important words stand out more
importance = {'thou': 'black',
 'pope': 'red',
 'thee': 'black',
 'thy': 'black',
 'sir': 'black',
 'shall': 'black',
 'work': 'black',
 'would': 'black',
 'good': 'black',
 'come': 'red',
 'upon': 'black',
 'like': 'black',
 'one': 'black',
 'mine': 'black',
 'make': 'black',
 'us': 'black',
 'works': 'red',
 'must': 'black',
 'monster': 'red'}
nx.set_node_attributes(G, name='importance',values=importance)

# Add tooltips
HOVER_TOOLTIPS = [
        ("Name", "@index"),
        ("Degree", "@degree"),
]

plot = figure(tooltips = HOVER_TOOLTIPS,
              x_range=Range1d(-11, 11),
              y_range=Range1d(-11, 11))

network_graph = from_networkx(G, nx.spring_layout, scale=10, center=(0, 0))

# Give each node a circle
network_graph.node_renderer.glyph = Circle(size=15,
                                           fill_color='importance')

plot.renderers.append(network_graph)

x, y = zip(*network_graph.layout_provider.graph_layout.values())
node_labels = list(G.nodes())
source = ColumnDataSource({'x': x, 'y': y, 'name': [str(node_labels[i]) for i in range(len(x))]})
labels = LabelSet(x='x', y='y', text='name', source=source, background_fill_color='white', text_font_size='14px', background_fill_alpha=1, x_offset=-2.0, y_offset=-2.0)
plot.renderers.append(labels)

show(plot)
title = 'TheTempest'
save(plot, filename=f"{title}.html")

'/home/jovyan/dh140/TheTempest.html'