In [11]:
# Save the books
#
# Downloads each Project Gutenberg plain-text ebook listed in `books` and
# stores it as a UTF-8 .txt file. Books that are already on disk are skipped,
# so re-running this cell does not hit the network again.

from pathlib import Path

import requests

# Project Gutenberg ebook id -> book title
books = {
    "863": "The Mysterious Affair at Styles",
    "1155": "The Secret Adversary",
    "58866": "The Murder on the Links",
    "61168": "The Man in the Brown Suit",
    "61262": "Poirot Investigates",
    "65238": "The Secret of Chimneys",
    "66446": "The Plymouth Express Affair",
    "67160": "The Hunter's Lodge Case",
    "67173": "The Missing Will",
    "69087": "The Murder of Roger Ackroyd",
    "70114": "The Big Four",
    "72824": "The Mystery of the Blue Train"
}

# The analysis cells open the novels from Data/books/, so save them there.
BOOKS_DIR = Path("Data/books")
BOOKS_DIR.mkdir(parents=True, exist_ok=True)

# Go through the dict (`book_id` rather than `id`, which would shadow the builtin)
for book_id, title in books.items():
    filename = BOOKS_DIR / (title.replace(" ", "-") + ".txt")
    if filename.exists():
        # Already downloaded on a previous run
        continue

    res = requests.get(
        f"https://www.gutenberg.org/ebooks/{book_id}.txt.utf-8",
        timeout=30,
    )
    # Fail loudly on 4xx/5xx instead of saving an error page as a book
    res.raise_for_status()
    # The URL explicitly requests the UTF-8 rendition, so decode it as such
    res.encoding = "utf-8"

    # errors="replace" substitutes anything that cannot be encoded at write time;
    # newline="" writes the text exactly as received, without newline translation.
    with open(filename, "w", encoding="utf-8", errors="replace", newline="") as file:
        file.write(res.text)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\willw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\willw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\willw\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
import nltk
from nltk.tokenize import sent_tokenize

# NLTK resources used by this notebook:
#   punkt_tab - pre-trained model for tokenization (sent_tokenize)
#   stopwords - common filler words like "the", "and", "is"
#   wordnet   - database that groups words into sets of synonyms, used for lemmatization
# Original author's note: these downloads can also be run from a separate file
# outside of the Jupyter notebook.
for resource in ("punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)


# Test sentence tokenization on a short two-sentence sample
text = "This is a test sentence. Let's see if it works."
print(sent_tokenize(text))

['This is a test sentence.', "Let's see if it works."]


In [7]:
# Python program to generate word vectors using Word2Vec
#
# Trains a CBOW and a Skip Gram model on one novel and prints the cosine
# similarity of 'murder' against two probe words for each model.

# importing all necessary modules
from gensim.models import Word2Vec
import gensim
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

warnings.filterwarnings(action='ignore')


def _print_similarities(model, label):
    """Print the cosine similarity of 'murder' to each probe word for `model`.

    `label` is the model name shown in the printed line (e.g. "CBOW").
    """
    for other in ('investigate', 'said'):
        print(f"Cosine similarity between 'murder' and '{other}' - {label} : ",
              model.wv.similarity('murder', other))


# Reads file; the context manager closes the handle once the text is loaded
with open("Data/books/The-Mysterious-Affair-at-Styles.txt", encoding="utf-8") as sample:
    s = sample.read()

# Replaces newline characters with spaces
f = s.replace("\n", " ")

# One list of lowercased word tokens per sentence in the file
data = [
    [word.lower() for word in word_tokenize(sentence)]
    for sentence in sent_tokenize(f)
]

# Create CBOW model (sg is left at its default)
model1 = gensim.models.Word2Vec(data, min_count=1,
                                vector_size=100, window=5)

# Print results
_print_similarities(model1, "CBOW")

# Create Skip Gram model (sg=1)
model2 = gensim.models.Word2Vec(data, min_count=1, vector_size=100,
                                window=5, sg=1)

# Print results
_print_similarities(model2, "Skip Gram")


Cosine similarity between 'murder' and 'investigate' - CBOW :  0.9276937
Cosine similarity between 'murder' and 'said' - CBOW :  0.9763936
Cosine similarity between 'murder' and 'investigate' - Skip Gram :  0.93102574
Cosine similarity between 'murder' and 'said' - Skip Gram :  0.8678242


In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, MWETokenizer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import re
from pathlib import Path
from gensim.models import Word2Vec


# Reads file; the context manager closes the handle once the text is loaded
with open("Data/books/The-Mysterious-Affair-at-Styles.txt", encoding="utf-8") as Styles_file:
    Styles = Styles_file.read()

# Clean Text - Normalization - Lowercase
# (punctuation is dropped later by the isalnum() token filter)
Styles = Styles.lower()

# Tokenize: sentences first, then words within each sentence
sentences = sent_tokenize(Styles)
tokens = [word_tokenize(sentence) for sentence in sentences]

# Remove Stopwords and non alphanumeric tokens
stop_words = set(stopwords.words('english'))
tokens = [[word for word in sentence if word.isalnum() and word not in stop_words] for sentence in tokens]

#  Lemmatization (Optional)
#lemmatizer = WordNetLemmatizer()
# Part of Speech = Verb
#tokens = [[lemmatizer.lemmatize(word, pos='v') for word in sentence] for sentence in tokens]


# CBOW model. No corpus is passed to the constructor: passing one already
# builds the vocabulary and trains, so the explicit build_vocab/train calls
# below would re-initialise the weights and train a second time.
model = Word2Vec(vector_size=100, window=5, min_count=1, sg=0)

# Training
model.build_vocab(tokens)
model.train(tokens, total_examples=model.corpus_count, epochs=5)


# Take the vectors AFTER training so the exported embedding is the same one
# that the similarity queries below are run against.
words = list(model.wv.index_to_key) # Gets words from the model's vocabulary
w_vec = np.array([model.wv[word] for word in words]) # Word Vectors


# Save Word Embedding (create the output folder first so savetxt cannot fail on it)
Path('./projector_files').mkdir(parents=True, exist_ok=True)
np.savetxt('./projector_files/trained0_embedding.tsv', w_vec, delimiter='\t')

# Optional: save all words in the novel as projector metadata
#with open('./projector_files/init_words_metadata.tsv', 'w') as f:
#    for word in words:
#        f.write(word + "\n")


# NOTE(review): in the recorded run every neighbour scored ~0.999, which
# suggests the vectors are barely differentiated after 5 epochs on a single
# novel - consider more epochs or a larger corpus before reading much into these.
similar_words = model.wv.most_similar('murder', topn=5)
similar_words1 = model.wv.most_similar('poison', topn=5)
poison = model.wv.most_similar('strychnine',topn=5)

print(f"Words similar to murder: {similar_words}")
print(f"Words similar to poison: {similar_words1}")
print(f"Words similar to strychnine: {poison}")



## Steps for visualizing
# 1. Install tensorflow - pip install tensorflow
# 2. Run Tensorboard - tensorboard --logdir=./projector_files/







[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\willw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\willw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\willw\AppData\Roaming\nltk_data...


Words similar to murder: [('last', 0.9990029335021973), ('strychnine', 0.9990007281303406), ('would', 0.9989446997642517), ('cynthia', 0.9989364743232727), ('little', 0.9989124536514282)]
Words similar to poison: [('eyes', 0.9992889761924744), ('john', 0.9992706775665283), ('take', 0.9992212653160095), ('one', 0.9992113709449768), ('time', 0.9991839528083801)]
Words similar to styrchnine: [('inglethorp', 0.9997853636741638), ('cavendish', 0.9997647404670715), ('project', 0.999763011932373), ('poirot', 0.9997619390487671), ('quite', 0.9997591972351074)]
