In [None]:
# Function for cleaning raw text, applied here to a single corpus file

import re

# Path of the raw corpus file that is read and cleaned at the end of this cell.
text_infile = 'text_corpus.txt'

def clean_text(s):
    """Strip bracketed spans and disallowed characters, then normalise case and spacing.

    Args:
        s: Text to clean.

    Returns:
        ``s`` lower-cased, with every ``[...]`` span and every character other
        than whitespace, ASCII letters and ``- ! . ? / ' ; : ,`` removed, and
        each run of whitespace collapsed to a single space.
    """
    # Raw strings keep the regex escapes (\[, \s, ...) out of Python's own
    # string-escape processing, which warns on unrecognised sequences.
    revals = re.compile(r'\[[^\]]*\]|[^\sA-Za-z\-\!\.\?/\';:,]')
    rewhitespace = re.compile(r'\s+')
    stripped = revals.sub('', s)
    # Collapse whitespace runs to one space before lower-casing.
    return rewhitespace.sub(' ', stripped).lower()

# Read the whole corpus file. The `with` block closes the handle on exit, so
# no explicit close() is needed, and the cleaning runs after the file is closed.
with open(text_infile) as infile:
    data = infile.read()
data = clean_text(data)

In [None]:
# Read in multiple files and consolidate text into a single list
import os

root_dir = 'Gutenberg/txt/' # root directory holding the Gutenberg .txt files

def get_size(path):
    """Return the size of the file at ``path`` in megabytes (bytes / 1,000,000)."""
    bytes_per_megabyte = 1000000
    return os.path.getsize(path) / bytes_per_megabyte


def find_files(rootdir, max_size):
    """Collect the paths of all ``.txt`` files in ``rootdir`` smaller than ``max_size``.

    Args:
        rootdir: Directory to scan (not recursive); a trailing separator is optional.
        max_size: Exclusive upper bound on file size, in the unit returned by ``get_size``.

    Returns:
        List of paths (``rootdir`` joined with each file name), in ``os.listdir`` order.
    """
    filesInTxt = []
    for filename in os.listdir(rootdir):
        # Check the cheap extension test first so other entries are never stat'ed.
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(rootdir, filename)
        if get_size(path) < max_size:
            # Build the result from rootdir rather than a hard-coded directory,
            # so the returned paths are valid for whatever directory was scanned.
            filesInTxt.append(path)
    return filesInTxt

# Scan the directory configured in root_dir, keeping only files below 0.1 MB.
file_list = find_files(root_dir, 0.1)
print(f"Number of files processed to create corpus: {len(file_list)}")

In [None]:
# Implement text preprocessing functions with regex and spacy

import spacy

# Load the spaCy pipeline once at cell level; preprocessing_vectors reuses it on every call.
nlp = spacy.load('en_core_web_sm')

def preprocessing_vectors(corpus):
    """Drop stop words and punctuation from ``corpus`` and return the rest as one string.

    The text is run through the cell-level spaCy pipeline ``nlp``; the text of
    every surviving token is joined with single spaces.
    """
    kept_tokens = []
    for token in nlp(corpus):
        if token.is_stop or token.is_punct:
            continue
        kept_tokens.append(token.text)
    return " ".join(kept_tokens)

def write_out_corpus(file_list, file_name):
    """Clean every file in ``file_list`` and write the results to ``file_name``.

    Each input file is stripped of ``[...]`` spans and of characters outside
    the allowed set, whitespace-collapsed, lower-cased, passed through
    ``preprocessing_vectors``, and written as one line of the plain-text
    output file.

    Args:
        file_list: Iterable of paths to the input text files.
        file_name: Path of the output file; it is opened in 'w' mode, so any
            existing content is replaced.
    """
    # Imported locally so the function does not depend on a cell-level
    # `import logging` having been run.
    import logging

    # Raw strings keep regex escapes away from Python's string-escape handling.
    pattern = re.compile(r'\[[^\]]*\]|[^\w\s\.\!\?\,\'\(\)\"\/\;\_]')
    whitespace = re.compile(r'\s+')

    # `with` guarantees the output handle is closed even if a file fails midway.
    with open(file_name, 'w') as outFile:
        for i, file in enumerate(file_list):
            if i % 50 == 0:
                logging.info("read {0} files".format(i))
            with open(file) as infile:
                data = infile.read()
            data = pattern.sub('', data)
            data = whitespace.sub(' ', data).lower()
            data = preprocessing_vectors(data) # remove stopwords & punctuation
            # Terminate each document with a newline so the last word of one
            # file is not fused with the first word of the next.
            outFile.write(data + '\n')

# Clean the selected files and save the combined result as 'outText.txt'.
write_out_corpus(file_list, 'outText.txt')

In [None]:
# Implement SkipGram Model with Python Class

import numpy as np
import string

def softmax(x):
    """Return the softmax of ``x``: exp(x) normalised by the sum over all entries.

    The maximum of ``x`` is subtracted before exponentiating, which leaves the
    result unchanged mathematically but keeps the exponentials from growing large.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    return exponentials / exponentials.sum()

class SkipGram(object):
    """Minimal skip-gram word-embedding model implemented with NumPy.

    A one-hot centre word is projected through ``W`` (V x N) to a hidden
    vector, then through ``W1`` (N x V) to one score per vocabulary word,
    which ``softmax`` turns into probabilities. ``X_train`` and ``y_train``
    start empty and must be populated by the caller before ``train``.
    """

    def __init__(self, N=20, window=2, alpha=0.001):
        """Create an untrained model.

        Args:
            N: Number of neurons in the hidden layer.
            window: Number of words considered in front of and behind the centre word.
            alpha: Initial learning rate.
        """
        self.N = N
        self.X_train = []  # one training input vector per centre word
        self.y_train = []  # one context vector per centre word
        self.window = window
        self.alpha = alpha
        self.words = []  # vocabulary; a word's position is its index
        self.word_index = {}  # word -> index

    def initialize_model(self, V, data):
        """Store the vocabulary and randomly initialise both weight matrices.

        Args:
            V: Vocabulary size.
            data: Sequence of vocabulary words; a word's position is its index.
        """
        self.V = V
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N))
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V))

        self.words = data
        # Rebuild the lookup from scratch so entries from an earlier
        # initialisation cannot survive with stale indices.
        self.word_index = {word: i for i, word in enumerate(data)}

    def feed_forward(self, X):
        """Run the forward pass for input vector ``X`` (length V).

        Stores the hidden vector ``h`` (N x 1), the scores ``u`` (V x 1) and
        the probabilities ``y`` (V x 1) on the instance and returns ``y``.
        """
        self.h = np.dot(self.W.T, X).reshape(self.N, 1)
        self.u = np.dot(self.W1.T, self.h)
        self.y = softmax(self.u)
        return self.y

    def backpropagate(self, x, t):
        """Apply one gradient step using the state left by ``feed_forward``.

        Args:
            x: Input vector (length V) that was fed forward.
            t: Target context vector (length V).
        """
        e = self.y - np.asarray(t).reshape(self.V, 1)
        dLdW1 = np.dot(self.h, e.T)  # N x V
        X = np.array(x).reshape(self.V, 1)
        # Computed before W1 is updated, so it uses the pre-update weights.
        dLdW = np.dot(X, np.dot(self.W1, e).T)  # V x N
        self.W1 = self.W1 - self.alpha*dLdW1
        self.W = self.W - self.alpha*dLdW

    def train(self, epochs):
        """Run ``epochs`` passes of per-sample gradient descent over the training data.

        After each epoch the learning rate is decayed; the summed loss of the
        epoch is kept in ``self.loss`` and printed every 100th epoch.
        """
        # Epochs are numbered from 1 (the decay and the progress report use the
        # number), so the range must end at epochs + 1 to run all of them.
        for x in range(1, epochs + 1):
            self.loss = 0
            for j in range(len(self.X_train)):
                self.feed_forward(self.X_train[j])
                self.backpropagate(self.X_train[j], self.y_train[j])

                # self.u still holds the scores from the forward pass above.
                scores = self.u[:, 0]
                in_context = np.asarray(self.y_train[j]) != 0
                C = int(in_context.sum())
                self.loss += -scores[in_context].sum()
                # log(sum(exp(u))) with the maximum factored out, so large
                # scores cannot overflow np.exp.
                peak = np.max(scores)
                self.loss += C * (peak + np.log(np.sum(np.exp(scores - peak))))
            if x % 100 == 0:
                print("epoch {} loss = {}".format(x, self.loss))
            self.alpha *= 1/((1+self.alpha*x))

    def predict(self, word, num_predictions):
        """Return the ``num_predictions`` most probable words for ``word``.

        Returns:
            List of vocabulary words ordered from highest to lowest predicted
            probability (ties keep vocabulary order), or ``None`` after
            printing a message when ``word`` is not in the vocabulary.
        """
        if word not in self.word_index:
            print("Word not found in dictionary.")
            return None
        X = [0 for i in range(self.V)]
        X[self.word_index[word]] = 1
        prediction = self.feed_forward(X)[:, 0]
        # Rank indices instead of keying a dict by probability: words with
        # equal probability would otherwise overwrite each other.
        ranked = np.argsort(-prediction, kind="stable")
        return [self.words[i] for i in ranked[:max(num_predictions, 0)]]
            
    
def buildDictionary(sentences, skip_gram):
    """Build training vectors from tokenised sentences and initialise the model.

    Args:
        sentences: Sequence of sentences (it is iterated twice), each a
            sequence of word tokens.
        skip_gram: Model object; its ``window`` is read, its ``X_train`` and
            ``y_train`` lists are appended to, and its ``initialize_model`` is
            called with the vocabulary size and the sorted vocabulary.

    Returns:
        The model's ``(X_train, y_train)`` lists.
    """
    # Sorted distinct words; a word's position is its one-hot index.
    data = sorted({word for sentence in sentences for word in sentence})
    V = len(data)
    vocabulary = {word: i for i, word in enumerate(data)}

    for sentence in sentences:
        for i in range(len(sentence)):
            center_word = [0] * V
            center_word[vocabulary[sentence[i]]] = 1
            context = [0] * V

            # The +1 makes the window symmetric: range() excludes its end, so
            # without it the word `window` positions to the right is skipped.
            first = max(i - skip_gram.window, 0)
            last = min(i + skip_gram.window + 1, len(sentence))
            for j in range(first, last):
                if j != i:
                    context[vocabulary[sentence[j]]] += 1
            skip_gram.X_train.append(center_word)
            skip_gram.y_train.append(context)
    skip_gram.initialize_model(V, data)

    return skip_gram.X_train, skip_gram.y_train


epochs = 1000
SkipModel = SkipGram()
# NOTE(review): test_text is not defined in any cell visible here; it must be
# assigned (a raw text string) before this cell is run.
pre_processed_text = preprocessing_vectors(test_text)
# preprocessing_vectors returns one space-joined string, while buildDictionary
# iterates over sentences of word tokens, so pass a single tokenised sentence.
# (The original call named prepare_data, which is not defined in this notebook.)
buildDictionary([pre_processed_text.split()], SkipModel)
SkipModel.train(epochs)
print(SkipModel.predict('computing', 3))

In [None]:
# Try gensim implementation of Word2Vec using separate corpus

import gensim
import gzip

# Gzip-compressed text file that read_input (below) opens with gzip.open.
data_file="text.txt.gz"

def read_input(input_file):
    """Lazily yield one preprocessed item per line of the gzip file ``input_file``.

    Each raw line (bytes, since the file is opened in binary mode) is handed
    to ``gensim.utils.simple_preprocess`` and its result is yielded unchanged.
    """
    with gzip.open(input_file, 'rb') as handle:
        for raw_line in handle:
            yield gensim.utils.simple_preprocess(raw_line)

# Materialise the generator into a list so the documents can be iterated more than once.
documents = list(read_input(data_file))

In [None]:
# Build the Word2Vec model from the tokenised documents.
model = gensim.models.Word2Vec(
    documents,
    size=150,
    window=10,
    min_count=2,
    workers=10,
)
# NOTE(review): per the gensim documentation, passing the corpus to the
# constructor already trains the model, so this call trains it further —
# confirm that the extra 10 epochs are intended.
model.train(
    documents,
    total_examples=len(documents),
    epochs=10,
)

In [None]:
# Single quotes inside the f-string: reusing the enclosing double quotes is a
# syntax error on Python versions before 3.12.
print(f"Synonyms for dirty: \n {model.wv.most_similar(positive='dirty')}")

In [None]:
# Single quotes inside the f-string: reusing the enclosing double quotes is a
# syntax error on Python versions before 3.12.
print(f"Synonyms for happy: \n {model.wv.most_similar(positive='happy')}")

In [None]:
# Word to query the trained model with.
w3 = "hate"
# Last expression of the cell, so the notebook displays the most_similar result for w3.
model.wv.most_similar(positive=w3)

In [None]:
# Evaluate above model on wordsim353 dataset
# For more on wordsim353 see: http://alfonseca.org/eng/research/wordsim353.html
import pandas as pd

# The CSV location can be overridden with the WORDSIM353_CSV environment
# variable; the fallback is the original absolute path, which only exists on
# the original author's machine. Relies on `import os` from the earlier cell.
wordsim_csv = os.environ.get(
    'WORDSIM353_CSV',
    '/Users/ezramacdonald/Downloads/wordsim353/combined.csv',
)
df = pd.read_csv(wordsim_csv)
df.head()

In [None]:
# Normalize Human mean score
# Min-max scaling: the lowest rating maps to 0 and the highest to 1.
df['Human (mean)'] = df['Human (mean)'].pipe(
    lambda ratings: (ratings - ratings.min()) / (ratings.max() - ratings.min())
)

df.head()

In [None]:
# Compute score of similarity between two words based on our model

def similarity_score(x, y):
    """Return the model's similarity for the word pair, or 0 if either word is unknown.

    Both arguments are converted with ``str`` before being looked up in
    ``model.wv.vocab`` and passed to ``model.wv.similarity``.
    """
    first, second = str(x), str(y)
    known_words = model.wv.vocab
    if first in known_words and second in known_words:
        return model.wv.similarity(first, second)
    return 0

# Score every word pair; axis=1 hands each row to the function in turn.
df['Score'] = df.apply(
    lambda row: similarity_score(row['Word 1'], row['Word 2']),
    axis=1,
)

In [None]:
# Preview the first rows of the frame, which now has the 'Score' column.
df.head()

In [None]:
# Number of pairs that both the model and the human raters place at or below 0.5.
true_false = int(
    ((df['Score'] <= 0.5) & (df['Human (mean)'] <= 0.5)).sum()
)

In [None]:
# Strictly above 0.5 on both scales: values of exactly 0.5 already belong to
# the "at or below 0.5" group counted in true_false, so `>=` here would count
# such a row in both groups.
true_true = len((df[(df['Score'] > 0.5) & (df['Human (mean)'] > 0.5)]))

In [None]:
# Results of model based on training for ~30 minutes on Amazon Fine Food Reviews dataset.
# Share of word pairs where the model and the human raters agree.
print(
    "Model effectiveness based on wordsim353 evaluation: "
    f"{(true_true+true_false)/len(df)}"
)