# In [7]:
### MYSTERIOUS AFFAIR AT STYLES

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, MWETokenizer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import re
from gensim.models import Word2Vec
import os


# Function for preprocessing: Normalization & Tokenization, Stopword removal, OPTIONALLY investigate Lemmatization 
def preprocess(file_path, *, lemmatize=False):
    """Read a text file and return it as a list of tokenized sentences.

    The text is split into sentences while it still has its original casing,
    because the Punkt sentence tokenizer uses capitalization as one of its
    cues for sentence boundaries. Each sentence is then lowercased and
    word-tokenized, and every token that is not purely alphanumeric or that
    is an English stop word is dropped.

    Args:
        file_path: Path to a UTF-8 encoded text file.
        lemmatize: When True, replace every kept token with its WordNet
            lemma, treating the token as a verb (pos='v'). Defaults to False,
            which leaves the tokens untouched.

    Returns:
        A list of sentences, each one a list of lowercase word tokens.
        Sentences whose tokens were all filtered out stay in the result as
        empty lists.
    """
    with open(file_path, encoding='utf-8') as f:
        text = f.read()

    # NLTK's stop word list is lowercase, so tokens must be lowercased before
    # they are compared against it.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer() if lemmatize else None

    tokens = []
    for sentence in sent_tokenize(text):
        words = [
            word
            for word in word_tokenize(sentence.lower())
            if word.isalnum() and word not in stop_words
        ]
        if lemmatizer is not None:
            words = [lemmatizer.lemmatize(word, pos='v') for word in words]
        tokens.append(words)

    return tokens


# Normalize and tokenize every novel (.txt file) in the books folder, then
# combine all of their sentences into one training corpus. The Styles
# sentences are also kept on their own so that this novel's vocabulary can be
# separated from the rest after training.
BOOKS_DIR = "./Data/books"
STYLES_FILENAME = "The-Mysterious-Affair-at-Styles.txt"

styles_tokens = preprocess(os.path.join(BOOKS_DIR, STYLES_FILENAME))

novel_tokens = []
# os.listdir() returns entries in arbitrary order; sorting keeps the corpus
# order the same from run to run and from machine to machine.
for filename in sorted(os.listdir(BOOKS_DIR)):
    if not filename.endswith(".txt"):  # Process only .txt files
        continue
    if filename == STYLES_FILENAME:
        # Already tokenized above; reuse it instead of reading the file again.
        novel_tokens.extend(styles_tokens)
    else:
        novel_tokens.extend(preprocess(os.path.join(BOOKS_DIR, filename)))


# Train a CBOW (sg=0) Word2Vec model on the combined corpus. Passing the
# sentences to the constructor builds the vocabulary and trains the model, so
# no further build_vocab()/train() calls are needed; repeating them on the
# already trained model only redoes the work.
model = Word2Vec(novel_tokens, vector_size=100, window=5, min_count=1, sg=0, epochs=5)


# Read the vocabulary and its vectors only after training has finished, so
# that they describe the final model.
words = list(model.wv.index_to_key)  # Words in the model's vocabulary
w_vec = np.array([model.wv[word] for word in words])  # Word vectors


# Then separate our Styles word vectors from the rest. The words are sorted
# because set iteration order changes between runs, and the rows of
# filtered_vectors have to line up with filtered_words.
styles_words = {word for sentence in styles_tokens for word in sentence}
filtered_words = sorted(word for word in styles_words if word in model.wv)
filtered_vectors = np.array([model.wv[word] for word in filtered_words])


# Save Word Embedding
#np.savetxt('projector_files/all_12_styles_wv.tsv', filtered_vectors, delimiter='\t')

# All Words in the novel saved
# NOTE: a metadata file must list its words in the same order as the rows of
# the vector file it is paired with (words for w_vec, filtered_words for
# filtered_vectors).
#with open('./projector_files/init_words_metadata.tsv', 'w') as f:
 #   for word in words:
  #      f.write(word + "\n")

