## Project aimed at creating a character language model based on Goethe's Wilhelm Meister

The model takes variable-length input, on the assumption that seeing contexts of different lengths helps it learn dependencies over different ranges. Each input sequence contains between 5 and 30 characters. The input tensor itself always has 30 time steps: sequences shorter than 30 characters are zero-padded, and a TensorFlow dynamic RNN is used to make the variable lengths work.

In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import latex
from sklearn.utils import shuffle
np.random.seed(10)
from random import randint

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Tally of every distinct character in the novel -> how many times it occurs
char_dict = dict()

In [3]:
# Count how often each character occurs in the novel (newlines included).
# The text contains non-ASCII characters (curly quotes, dashes, ligatures), so the
# encoding is stated explicitly rather than left to the platform default
# (file is assumed to be UTF-8).
with open("../../../data/goethe/wilhelm_meister.txt", encoding="utf-8") as file:
    for line in file:  # stream line by line instead of materialising every line first
        for char in line:  # accessing all chars, including \n
            char_dict[char] = char_dict.get(char, 0) + 1

In [4]:
# Show every distinct character found in the text, in order of first appearance
char_dict.keys()

dict_keys(['B', 'o', 'k', ' ', 'I', 'C', 'h', 'a', 'p', 't', 'e', 'r', '\n', 'T', 'H', 'E', 'P', 'L', 'A', 'Y', 'w', 's', 'l', 'i', 'n', 'b', 'g', 'u', ':', 'd', 'm', 'c', ',', 'f', '.', 'S', 'M', 'y', '’', 'N', 'v', ';', '-', 'x', 'O', 'q', '!', '“', 'W', '?', '”', 'j', 'z', 'V', 'J', 'G', 'D', 'F', '‘', 'K', '—', 'U', 'Q', 'R', 'X', '̈', '́', '(', ')', '6', 'æ', '7', '̂', '̀', '8', 'Z', '/', 'œ', '"'])

In [5]:
# Vocabulary used by the model: each kept character gets an integer code.
# Codes start at 1, so index 0 is never assigned to a character.
keys = ['B', 'o', 'k', ' ', 'I', 'C', 'h', 'a', 'p', 't', 'e', 'r', '\n', 
        'T', 'H', 'E', 'P', 'L', 'A', 'Y', 'w', 's', 'l', 'i', 'n', 'b', 
        'g', 'u', ':', 'd', 'm', 'c', ',', 'f', '.', 'S', 'M', 'y', '’', 
        'N', 'v', ';', '-', 'x', 'O', 'q', '!', '“', 'W', '?', '”', 'j', 
        'z', 'V', 'J', 'G', 'D', 'F', '‘', 'K', '—', 'U', 'Q', 'R', 'X', 'Z']

key_codes = range(1, len(keys) + 1)
encoder = dict(zip(keys, key_codes))  # character -> integer code
decoder = dict(zip(key_codes, keys))  # integer code -> character

print("Number of unqiue chars in the novel:",len(keys))

Number of unqiue chars in the novel: 66


In [6]:
# Collect the novel's characters as one flat list, keeping only those that have a
# code in `encoder`. This is a second pass over the file because not every
# character found in the first pass is kept in the vocabulary (chosen by frequency).
# The encoding is given explicitly because the text contains non-ASCII characters
# (file is assumed to be UTF-8).
with open("../../../data/goethe/wilhelm_meister.txt", encoding="utf-8") as file:
    # iterates every char of every line, including \n; chars without a code are dropped
    char_list = [char for line in file for char in line if char in encoder]

print("Total number of chars:",len(char_list))

Total number of chars: 1202452


In [7]:
def get_one_hot_encoding(char):
    """Return the one-hot row vector for ``char``.

    The vector width is ``len(encoder) + 1``: character codes in ``encoder``
    start at 1, so one extra slot is needed and index 0 is never set. With the
    66-character vocabulary this yields shape ``(1, 67)``.

    Args:
        char: a single character that is a key of the module-level ``encoder``.

    Returns:
        numpy array of shape ``(1, len(encoder) + 1)`` holding zeros and a single 1
        at the position given by ``encoder[char]``.

    Raises:
        KeyError: if ``char`` has no entry in ``encoder``.
    """
    # Derive the width from the vocabulary instead of hard-coding 67, so the
    # vector stays consistent if the list of kept characters changes.
    hot_vec = np.zeros((1, len(encoder) + 1))
    hot_vec[0, encoder[char]] = 1
    return hot_vec

In [8]:
def create_data(char_list, min_len=5, max_len=30, stride=2):
    """Slice ``char_list`` into zero-padded one-hot inputs and next-character labels.

    Starting at position 0 and moving ``stride`` characters forward per example, a
    random window length between ``min_len`` and ``max_len`` (inclusive) is drawn.
    The window ``char_list[i : i + length]`` is one-hot encoded into the first
    ``length`` rows of a ``(max_len, width)`` matrix (the remaining rows stay zero),
    and the label is the one-hot encoding of the character that immediately
    follows the window.

    Generation stops as soon as a drawn window plus its label would run past the
    end of ``char_list``.

    Window lengths come from ``random.randint``, so the ``random`` module (not
    ``numpy.random``) has to be seeded to get reproducible data.

    Args:
        char_list: sequence of characters accepted by ``get_one_hot_encoding``.
        min_len: smallest window length (default 5).
        max_len: largest window length and row count of every input (default 30).
        stride: number of characters the window start advances per example
            (default 2).

    Returns:
        ``(X_data, y_data)``: two equally long lists of numpy arrays with shapes
        ``(1, max_len, width)`` and ``(1, width)``, where ``width`` is the width of
        the vectors returned by ``get_one_hot_encoding``.

    Raises:
        ValueError: if ``stride`` is smaller than 1 (the loop would never advance).
    """
    if stride < 1:
        raise ValueError("stride must be at least 1")

    X_data = []
    y_data = []
    lis_len = len(char_list)
    char_i = 0  # index in char_list where the current window starts
    while True:
        input_len = randint(min_len, max_len)
        if char_i + input_len + 1 > lis_len:  # window + label would pass the end
            break

        # The label is the character right after the window, so the index must be
        # relative to the window start (char_i), not to the start of the text.
        ay = np.reshape(get_one_hot_encoding(char_list[char_i + input_len]), (1, -1))

        # Rows beyond input_len are left as zeros (padding).
        ax = np.zeros((1, max_len, ay.shape[-1]))
        for offset in range(input_len):
            ax[0, offset] = np.ravel(get_one_hot_encoding(char_list[char_i + offset]))

        X_data.append(ax)
        y_data.append(ay)
        char_i += stride

    return X_data, y_data

In [9]:
# Build the zero-padded one-hot inputs and their labels from the filtered character list
X_data,y_data = create_data(char_list)

In [10]:
print(len(X_data)) # number of (input, label) pairs returned by create_data

601213


In [11]:
# Turn the lists of per-example arrays into two ndarrays and report their shapes
X_arr, y_arr = np.array(X_data), np.array(y_data)
print(X_arr.shape, y_arr.shape, sep="\n")

(601213, 1, 30, 67)
(601213, 1, 67)


In [12]:
# Persist both arrays as .npy files in the same directory as the source text
for npy_path, array in (
    ("../../../data/goethe/X_arr.npy", X_arr),
    ("../../../data/goethe/y_arr.npy", y_arr),
):
    np.save(npy_path, array)