In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
import copy as cp
import sys
from tqdm import tqdm


%load_ext blackcellmagic

Using TensorFlow backend.


In [2]:
def load_word_list(path):
    """
    Read the text file at <path> and return its words as a list of lowercase,
    purely alphanumeric strings.

    The file contents are split on whitespace. Every character for which
    str.isalnum() is false is dropped from each token, and tokens that end up
    empty are discarded. The order of the remaining words is preserved.
    """
    with open(path) as handle:
        tokens = handle.read().strip().split()

    words = []
    for token in tokens:
        # Keep only the alphanumeric characters of this token
        cleaned = "".join(char for char in token if char.isalnum())
        # Tokens made up solely of non-alphanumeric characters collapse to ""
        if cleaned:
            words.append(cleaned.lower())
    return words

In [3]:
def generate_onehot_word_dict(word_list):
    """
    Build a one-hot encoding for the distinct words of <word_list>.

    Arguments:
        word_list: Sequence of words (strings), possibly with repeats.

    Returns:
        Dictionary mapping every distinct word to a 1-D numpy array whose
        length is the number of distinct words. The array is all zeros except
        for a single 1 at the word's index, where indices follow the order in
        which words first appear in <word_list>. An empty <word_list> yields
        an empty dictionary.
    """
    # dict.fromkeys keeps the first occurrence of each word, in order, and
    # de-duplicates in linear time (a list membership test would be quadratic).
    vocabulary = list(dict.fromkeys(word_list))
    vect_length = len(vocabulary)

    one_hot_dict = {}
    for position, word in enumerate(vocabulary):
        vector = np.zeros(vect_length)
        vector[position] = 1
        one_hot_dict[word] = vector

    return one_hot_dict

In [4]:
word_list = load_word_list("shakespeare.txt")

# Drop the first occurrence of each of the tokens "1" through "154" from the
# corpus (presumably the sonnet numbers in shakespeare.txt -- TODO confirm).
# list.remove raises ValueError if any of these tokens is absent.
for number in range(1, 155):
    word_list.remove(str(number))

In [5]:
def generate_onehot_char_dict(word_list, padding = 2):
    """
    Build a character-level one-hot matrix for every word in <word_list>.

    Every row of a matrix corresponds to a character position and every
    column to a letter "a".."z" in alphabetical order (26 columns). All
    matrices share the shape (longest_word_length + 2 * padding, 26):
    <padding> all-zero rows come before the first character, and every row
    after the last character is left at zero.

    Characters outside "a".."z" (e.g. digits) have no column, so the row for
    such a character stays all-zero.

    Arguments:
        word_list: Sequence of words (strings), possibly with repeats. It is
                   used both to size the matrices and as the set of words to
                   encode.
        padding:   Number of zero rows reserved at each end of the matrix.

    Returns:
        Dictionary mapping each distinct word to its one-hot matrix. An empty
        <word_list> yields an empty dictionary.
    """
    # Ordered alphabet and a letter -> column lookup (avoids scanning the
    # alphabet for every character of every word)
    alphabet = [chr(ord("a") + offset) for offset in range(26)]
    column_of = {letter: column for column, letter in enumerate(alphabet)}

    # Length of the longest word fixes the number of rows
    largest = max((len(word) for word in word_list), default=0)
    shape = (largest + (padding * 2), len(alphabet))

    one_hot_dict = {}

    for word in word_list:
        # A repeated word always encodes to the same matrix; build it once
        if word in one_hot_dict:
            continue

        matrix = np.zeros(shape)
        for position, char in enumerate(word):
            column = column_of.get(char)
            if column is not None:
                # Start writing <padding> rows in
                matrix[padding + position, column] = 1

        one_hot_dict[word] = matrix

    return one_hot_dict

In [21]:
def one_hot_decoder(word, word_list, char_based = False):
    """
    Convert a one-hot encoded word back into a string.

    Arguments:
        word:       The encoded word. With char_based=True, a 2-D numpy array
                    with one row per character position and 26 columns
                    ("a".."z"). Otherwise a 1-D one-hot vector over the
                    distinct words of <word_list>.
        word_list:  Sequence of words the word-level encoding was built from;
                    indices follow the order of first appearance. It is not
                    used when char_based=True.
        char_based: Selects character-level (True) or word-level (False)
                    decoding.

    Returns:
        The decoded string. If the encoding contains no entry equal to 1, the
        empty string is returned in both modes.
    """
    if char_based:
        # First remove the padding: every row that is entirely zero
        rows = word[~np.all(word == 0, axis=1)]

        # Ordered alphabetic list; column index -> letter
        alphabet = [chr(ord("a") + offset) for offset in range(26)]

        return "".join(
            alphabet[pos] for row in rows for pos in np.flatnonzero(row == 1)
        )

    # Distinct words in order of first appearance (linear-time de-duplication)
    vocabulary = list(dict.fromkeys(word_list))

    for position, value in enumerate(word):
        if value == 1:
            return vocabulary[position]

    # No hot entry: return "" instead of failing on an unassigned variable
    return ""

In [7]:
# Character-level one-hot matrices for every word in the corpus, built with
# generate_onehot_char_dict's default padding.
one_hot_dict = generate_onehot_char_dict(word_list)

In [8]:
# Adapted from HW5-3

def generate_traindata(word_list, window_size=4, char_based = False):
    """
    Generates training data for Skipgram model.

    Arguments:
        word_list:   Sequential list of words (strings).
        window_size: Size of Skipgram window: each word is paired with up to
                     <window_size> neighbours on either side.
        char_based:  If True, words are encoded as character-level one-hot
                     matrices (generate_onehot_char_dict with padding=0);
                     otherwise as word-level one-hot vectors
                     (generate_onehot_word_dict).

    Returns:
        ((trainX, trainY), one_hot_dict):
            trainX, trainY: numpy arrays of encoded words. For each index i,
                            trainX[i] is the encoding of a word in <word_list>
                            and trainY[i] is the encoding of one of the words
                            within <window_size> positions of it. Pairs are
                            emitted in corpus order, neighbours from left to
                            right.
            one_hot_dict:   The word -> encoding dictionary that was used.
    """
    # Both branches must bind the same name; it is used below and returned
    if char_based:
        one_hot_forward = generate_onehot_char_dict(word_list, padding = 0)
    else:
        one_hot_forward = generate_onehot_word_dict(word_list)

    trainX = []
    trainY = []
    n_words = len(word_list)

    for index, word in enumerate(word_list):
        encoded = one_hot_forward[word]

        # Clip the window to the bounds of the corpus
        first = max(0, index - window_size)
        last = min(n_words, index + window_size + 1)

        for neighbour in range(first, last):
            # A word is never paired with itself
            if neighbour == index:
                continue

            trainX.append(encoded)
            trainY.append(one_hot_forward[word_list[neighbour]])

    return (np.array(trainX), np.array(trainY)), one_hot_forward

In [35]:
def train_embedding(word_list, num_latent_factors=10, char_based = False):
    """
    Train a word2vec type embedding of the training data.

    Uses a dense, two layer shallow neural net (Dense -> Dense -> softmax)
    trained on the Skipgram pairs produced by generate_traindata. The network
    is sized from the training data, so it adapts to the vocabulary of
    <word_list>. After training, the first 50 training inputs are run through
    the model and printed next to the model's most likely output as a sanity
    check.

    Arguments:
        word_list:          List of words (strings); constitutes the training
                            data. Must yield at least one Skipgram pair.
        num_latent_factors: Number of latent factors (units in the first
                            Dense layer).
        char_based:         Passed on to generate_traindata; selects
                            character-level instead of word-level encodings.

    Returns:
        (weights_layer_1, weights_layer_2): the kernel (weight matrix) of the
        first and of the second Dense layer.

    Raises:
        ValueError: if <word_list> produces no training pairs.
    """
    (trainX, trainY), one_hot_dict = generate_traindata(
        word_list, char_based=char_based
    )

    if len(trainX) == 0:
        raise ValueError("word_list is too short to generate any training pairs")

    # Size the network from the data rather than a hard-coded vocabulary size.
    # Word-level: inputs are (vocab,) vectors. Char-level: (positions, 26)
    # matrices, in which case Dense acts on the last axis.
    input_shape = trainX.shape[1:]
    output_dimension = trainX.shape[-1]

    model = Sequential()
    model.add(Dense(num_latent_factors, input_shape=input_shape))
    model.add(Dense(output_dimension))
    model.add(Activation("softmax"))

    model.compile(
        loss="categorical_crossentropy", optimizer="Adam", metrics=["accuracy"]
    )

    # summary() prints the table itself and returns None, so it is not wrapped
    # in print()
    model.summary()
    print("\n")

    model.fit(trainX, trainY, batch_size=30, epochs=5)

    # get_weights() is a flat list [kernel_1, bias_1, kernel_2, bias_2];
    # the second layer's kernel is therefore at index 2, not 1.
    weights = model.get_weights()
    weights_layer_1 = weights[0]
    weights_layer_2 = weights[2]

    print("\n")

    for weight in weights:
        print("Weight Shape:", weight.shape)

    # Sanity check, predict words from the first 50 inputs of the training set
    sample_inputs = trainX[:50]
    model_output = model.predict(sample_inputs)

    input_word_list = []
    output_word_list = []

    if char_based:
        for probabilities in model_output:
            # Turn each row of probabilities into a one-hot row at its argmax
            hard = np.zeros_like(probabilities)
            hard[np.arange(len(probabilities)), probabilities.argmax(axis=1)] = 1
            output_word_list.append(one_hot_decoder(hard, word_list, char_based = True))

        for matrix in sample_inputs:
            input_word_list.append(one_hot_decoder(matrix, word_list, char_based = True))

    else:
        # Invert the encoding once instead of rebuilding the vocabulary for
        # every decoded word
        index_to_word = {
            int(np.argmax(vector)): word for word, vector in one_hot_dict.items()
        }

        for probabilities in model_output:
            output_word_list.append(index_to_word[int(np.argmax(probabilities))])

        for vector in sample_inputs:
            input_word_list.append(index_to_word[int(np.argmax(vector))])

    print("\n")
    print("Most likely next word: ")

    for k in range(len(input_word_list)):
        print("\n")
        print(input_word_list[k], ", ", output_word_list[k])

    return weights_layer_1, weights_layer_2

In [None]:
# Train the word-level embedding; `test` holds the pair of weight arrays
# returned by train_embedding.
test = train_embedding(word_list, num_latent_factors = 10, char_based = False)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 10)                31770     
_________________________________________________________________
dense_14 (Dense)             (None, 3176)              34936     
_________________________________________________________________
activation_7 (Activation)    (None, 3176)              0         
Total params: 66,706
Trainable params: 66,706
Non-trainable params: 0
_________________________________________________________________
None


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5