# Import data

In [None]:
# Read the corpus as a list of raw lines (each still ends with its newline).
# The context manager closes the file even if reading raises.
with open('royalData.txt', 'r') as file:
    royal_data = file.readlines()
print(royal_data)

# Clean and pre-process data

### Tokenize
Comment out the noted line to include stop words the second time around

In [None]:
# Normalise each line in place: lowercase it and strip newline characters.
for idx, line in enumerate(royal_data):
    royal_data[idx] = line.lower().replace('\n', '')

# Split every line on whitespace and drop the stop words listed here.
stopwords = ['the', 'is', 'will', 'be', 'a', 'only', 'can', 'their', 'now', 'and', 'at', 'it','in']
filtered_data = [
    [
        token
        for token in sentence.split()
        if token not in stopwords  #COMMENT OUT THIS LINE TO INCLUDE STOP WORDS
    ]
    for sentence in royal_data
]
print(filtered_data)

### Create unique word dictionary

In [None]:
# Flatten all tokenized sentences, de-duplicate through a set, and sort alphabetically.
all_words = sorted({token for sentence in filtered_data for token in sentence})

# Map each unique word to its position in the sorted list.
words_dict = {token: index for index, token in enumerate(all_words)}

print(words_dict)

# Creating our co-occurrence "matrix"
Create a list of every pair of words that appear together in a sentence, adding each pair in both orders (word A → word B and word B → word A).

In [None]:
# Pair every word with each word that follows it in the same sentence,
# recording the pair in both directions.
bigrams = []
for sentence in filtered_data:
    for pos, left in enumerate(sentence):
        for right in sentence[pos + 1:]:
            bigrams.extend(([left, right], [right, left]))

print(bigrams)

**DON'T RUN THIS NEXT CELL THE FIRST TIME THROUGH**

Alternative method that creates our co-occurrence matrix with a context window of 1 (one word before and one word after)

In [None]:
# Only directly adjacent words are paired here (window of one word on each side),
# and each adjacent pair is recorded in both directions.
bigrams = []
for sentence in filtered_data:
    for current, following in zip(sentence, sentence[1:]):
        bigrams.extend(([current, following], [following, current]))

print(bigrams)

# Performing one-hot encoding
Create an identity matrix whose rows and columns are both indexed by the unique words, with a '1' on the diagonal; row *k* is the one-hot vector of the *k*-th word

In [None]:
import numpy as np

# Identity matrix: row k is the one-hot vector for the k-th entry of all_words.
onehot_data = np.eye(len(all_words))

# Associate each word with its row of the identity matrix.
onehot_dict = dict(zip(all_words, onehot_data))

for token, vector in onehot_dict.items():
    print(token, ":", vector)


Match one-hot encoded input vectors and output labels for training the model

Uses the co-occurrence matrix and the one-hot vectors

In [None]:
# Inputs are the one-hot vectors of each bigram's first word,
# targets are the one-hot vectors of its second word.
X = np.array([onehot_dict[pair[0]] for pair in bigrams])
Y = np.array([onehot_dict[pair[1]] for pair in bigrams])

print(X)
print(Y)

# Model

### Define model
We will demonstrate our first model using a hidden layer of size 2

This means that the resulting word vector embedding will be of length 2

This will make visualization easier

In [None]:
import setuptools
from keras.models import Sequential
from keras.layers import Dense, Input


vocab_size = len(onehot_data[0])  # length of a single one-hot vector
embed_size = 2  # units in the hidden layer, i.e. the length of each word embedding

# one-hot input -> hidden layer with no activation -> softmax over the vocabulary
kmodel = Sequential([
    Input(shape=(vocab_size,)),
    Dense(embed_size),
    Dense(vocab_size, activation='softmax'),  # last layer must be softmax activation
])

# categorical cross-entropy is used because the output layer is a softmax
kmodel.compile(loss='categorical_crossentropy', optimizer='adam')


### Run model

In [None]:
# DONT RUN ME IN LECTURE

# Train with X as the input and Y as the target; progress output is switched off.
kmodel.fit(
    X,
    Y,
    epochs=1000,
    batch_size=256,
    verbose=False,
)

### Extract weights from the model

In [None]:
# First weight array reported by the model; it is indexed by word id below.
# NOTE(review): presumably the input->hidden kernel (one row per vocabulary word) - confirm.
weights = kmodel.get_weights()[0]

# Look up each word's row through its integer id in words_dict.
word_embeddings = {token: weights[words_dict[token]] for token in all_words}

for token, vector in word_embeddings.items():
    print(token, ":", vector)

# **STOP HERE AND GO BACK TO SLIDES**

# Plot Embeddings

In [None]:
import matplotlib.pyplot as plt

# Draw one labelled point per vocabulary word, using the first two
# components of its embedding as the x and y coordinates.
for token in words_dict:
    point = word_embeddings.get(token)
    x_val, y_val = point[0], point[1]
    plt.scatter(x_val, y_val)
    plt.annotate(token, (x_val, y_val))

plt.savefig('img.jpg')

### Rerun using +/- 1 word only

### Rerun with stop words