In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, LSTM, TimeDistributed  
from time import sleep
import csv
import numpy as np

Using Theano backend.


In [2]:
# Toggle: True trains on the comments file in which tagged Facebook names were
# replaced by the CUSTOM_NAME token; False trains on the name-filtered file.
custom_names = True

comments_filepath = (
    "csv_data/CUSTOM_NAMES/custom_name_token_comments.csv"
    if custom_names
    else "csv_data/FILTERED_NAMES/filtered_names_comments.csv"
)
status_filepath = "csv_data/beaverconfessions_facebook_statuses.csv"

# appended to every message to mark where it ends
stop_symbol = "`"

def load_dataset(filepath, dictionary, datatype):
    """Read every message from a CSV file and collect the characters it uses.

    Args:
        filepath: path of the CSV file; it is parsed with csv.DictReader, so
            the first row is taken as the header.
        dictionary: set of characters seen so far. It is updated in place and
            also returned.
        datatype: column prefix; messages are read from the column named
            "<datatype>_message".

    Returns:
        (dictionary, sentences): the updated character set and the list of
        messages, each with the module-level `stop_symbol` appended. The stop
        symbol itself is not added to `dictionary` here.
    """
    import sys  # local import: only needed to pick a file mode below

    dt = "{}_message".format(datatype)
    sentences = []

    def add_message(msg):
        # record the characters first, then store the terminated message
        dictionary.update(msg)
        sentences.append(msg + stop_symbol)

    # "rU" (universal newlines) is what the csv module needs on Python 2 but
    # is rejected by recent Python 3 releases, where newline="" is the
    # documented way to open a file for csv.
    if sys.version_info[0] >= 3:
        csvfile = open(filepath, "r", newline="")
    else:
        csvfile = open(filepath, "rU")

    with csvfile:
        for status in csv.DictReader(csvfile):
            if dt in status:
                add_message(status[dt])
            else:
                # Fallback kept from the original "monkey patch": when the
                # expected column is missing, the second (key, value) pair of
                # the row is treated as two messages.
                # NOTE(review): the key is a header cell, so this looks like a
                # CSV without a header row and would re-add the same first-row
                # text for every row -- verify against the statuses file.
                # list(...) keeps this working where items() is a view.
                msg1, msg2 = list(status.items())[1]
                add_message(msg1)
                add_message(msg2)

    return dictionary, sentences
    
# Load comments first, then statuses; both calls share one character set.
dictionary, comments = load_dataset(comments_filepath, set([]), "comment")
dictionary, statuses = load_dataset(status_filepath, dictionary, "status")
sentences = []
sentences.extend(comments)
sentences.extend(statuses)
sentences_arr = np.array(sentences)

# shuffle statuses and comments
# (only `sentences_arr` is shuffled; `sentences` keeps the file order)
random_permutation = np.random.permutation(len(sentences))
sentences_arr = sentences_arr[random_permutation]

# Single-argument print(...) produces the same output on Python 2 and 3, and
# slicing avoids an IndexError when fewer than two sentences were loaded.
print('Sample sentences:')
for sample in sentences_arr[:2]:
    print('- {}'.format(sample))
print('')

# create vocabulary of characters found in data
# Index 0 is reserved for padding and index 1 for the stop symbol. Both are
# filtered out of the data characters first: if either one also occurred in
# the data it would otherwise be listed twice, and the char->index and
# index->char tables would disagree about the number of classes.
padding_symbol = "{"
reserved_symbols = [padding_symbol, stop_symbol]
chars = reserved_symbols + sorted(c for c in dictionary if c not in reserved_symbols)

char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
nb_classes = len(chars)
print(char_indices)


# TODO separate model_related values and preprocessing code
maxlen = 280
window_size = 10 # TODO try smaller and bigger window sizes
step = 1 # char by char

# Slide a window over every sentence: the input is window_size characters and
# the target is the same window shifted one character to the right.
sub_sentences = []
next_chars = []
for sentence in sentences:
    # Window starts are capped at maxlen - window_size as before, and also at
    # len(sentence) - 2 so that both the input and the target window contain
    # at least one character. Without the second cap every sentence shorter
    # than maxlen contributed empty windows, i.e. all-padding samples.
    last_start = min(maxlen - window_size + 1, len(sentence) - 1)
    for i in range(0, last_start, step):
        sub_sentences.append(sentence[i : i+window_size])
        next_chars.append(sentence[i+1 : i+1+window_size])

nb_samples = len(sub_sentences)

# Integer-encode every window into a fixed-width row.
# any values not filled in represent padding (index 0)
# extra 1 represents space for the stop symbol
X_labels = np.zeros((nb_samples, maxlen+1))
y_labels = np.zeros((nb_samples, maxlen+1))

for sample_nb in range(nb_samples):
    # list comprehensions instead of map(): len() is taken of the result,
    # which only works when it is a real list
    x_label = [char_indices[c] for c in sub_sentences[sample_nb]]
    y_label = [char_indices[c] for c in next_chars[sample_nb]]

    X_labels[sample_nb, :len(x_label)] = x_label
    y_labels[sample_nb, :len(y_label)] = y_label

# formatted into one string so the line reads the same on Python 2, where
# print('a', b) would display a tuple
print('# training samples: {}'.format(nb_samples))

Sample sentences:
- #1012 The only reason I go to class is to show off my cute outfits.`
- #9157 Best things about the Brass Rat: craftswoman, drake, athena, biotech bridge, ivy leaves and pinecone, and HASS Worst things: ???????? I guess the mobius is kinda lame?? Everything else is literally perfect !!!11!!1!!11!!!1!!!!1`

{' ': 2, '$': 6, '(': 10, ',': 14, '0': 18, '4': 22, '8': 26, '<': 30, '@': 34, 'D': 38, 'H': 42, 'L': 46, 'P': 50, 'T': 54, 'X': 58, '`': 1, 'd': 68, 'h': 72, 'l': 76, 'p': 80, 't': 84, 'x': 88, '|': 91, '#': 5, "'": 9, '+': 13, '/': 17, '3': 21, '7': 25, ';': 29, '?': 33, 'C': 37, 'G': 41, 'K': 45, 'O': 49, 'S': 53, 'W': 57, '[': 61, '_': 64, 'c': 67, 'g': 71, 'k': 75, 'o': 79, 's': 83, 'w': 87, '{': 0, '"': 4, '&': 8, '*': 12, '.': 16, '2': 20, '6': 24, ':': 28, '>': 32, 'B': 36, 'F': 40, 'J': 44, 'N': 48, 'R': 52, 'V': 56, 'Z': 60, '^': 63, 'b': 66, 'f': 70, 'j': 74, 'n': 78, 'r': 82, 'v': 86, 'z': 90, '~': 93, '!': 3, '%': 7, ')': 11, '-': 15, '1': 19, '5': 23

In [3]:
from keras.utils import to_categorical

# make use of subsentences, next_chars, which are our words (semantic, not encoded yet)

def validate_data(X, y):
    """Assert that X and y have a positive entry in every slice along axis 1.

    X is checked first, then y; an AssertionError is raised for the first
    array that contains a slice with no value greater than zero.
    """
    for encoded in (X, y):
        has_positive = np.any(encoded > 0, axis=1)
        assert has_positive.all()

def sampling_generator(epoch_size, batch_size, validation=False, sample_weights=False):
    """
    Endlessly yield one-hot encoded batches built from the integer label rows.

    Takes in labels of int values, e.g. [1. 2. 5. 9. 1. 0. 0. ...]
    These labels are already padded with 0s to a predetermined maxlen.
    Each row is one-hot encoded with `to_categorical` (nb_classes columns) and
    the rows of a batch are stacked along a new first axis.
    Batch generation is necessary to avoid MemoryError.

    Args:
        epoch_size: number of leading rows to cycle through; clamped to the
            number of rows available.
        batch_size: maximum number of rows per batch; the last batch of a pass
            is smaller when epoch_size is not a multiple of batch_size.
        validation: read X_val_labels / y_val_labels instead of
            X_train_labels / y_train_labels.
        sample_weights: when true, yield (X, Y, weights) instead of (X, Y).
            The weights are uniform placeholders for now.

    Raises:
        ValueError: on the first next() if there is nothing to iterate over
            (the old code looped forever without yielding in that case).
    """
    if validation:
        X_labels, y_labels = X_val_labels, y_val_labels
    else:
        X_labels, y_labels = X_train_labels, y_train_labels

    # never slice past the data that actually exists
    epoch_size = min(epoch_size, len(X_labels))
    if epoch_size <= 0 or batch_size <= 0:
        raise ValueError("epoch_size and batch_size must be positive and data must not be empty")

    # ceiling division so the trailing partial batch is served as well
    nb_batches = (epoch_size + batch_size - 1) // batch_size

    while True:
        # hand off data in batches
        for batch_nb in range(nb_batches):
            start = batch_nb * batch_size
            end = min(start + batch_size, epoch_size)

            # One-hot encode rows start..end-1 of the data. The previous inner
            # loop indexed from 0 on every batch, so each batch repeated the
            # first rows. Stacking a list once also avoids re-copying the
            # whole batch for every added sample.
            sample_X = np.stack([to_categorical(row, num_classes=nb_classes)
                                 for row in X_labels[start:end]])
            sample_Y = np.stack([to_categorical(row, num_classes=nb_classes)
                                 for row in y_labels[start:end]])

            # ensure no 0 zeros in data - otherwise training NaNs out
            # validate_data(ohe_x, ohe_y) can be applied per row when debugging

            if sample_weights:
                # TODO fill in with reactions if so desired :)
                # uniform weights until then; one weight per sample
                yield (sample_X, sample_Y, np.ones(len(sample_X)))
            else:
                yield (sample_X, sample_Y)

# print next(sampling_generator())          

In [4]:
# build the model: 2 stacked LSTM
# Both LSTM layers return the full sequence, so the time-distributed dense
# layer and the softmax produce one distribution over nb_classes per time step.
print('Building model...')
model = Sequential([
    LSTM(512, return_sequences=True, input_shape=(maxlen+1, nb_classes)),
    LSTM(512, return_sequences=True),
    Dropout(0.2),
    TimeDistributed(Dense(nb_classes)),
    Activation('softmax'),
])

# TODO switch to Adam for more stability?
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
print('Model is made!')

Building model...
Model is made!


In [5]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 281, 512)          1243136   
_________________________________________________________________
lstm_2 (LSTM)                (None, 281, 512)          2099200   
_________________________________________________________________
dropout_1 (Dropout)          (None, 281, 512)          0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 281, 94)           48222     
_________________________________________________________________
activation_1 (Activation)    (None, 281, 94)           0         
Total params: 3,390,558
Trainable params: 3,390,558
Non-trainable params: 0
_________________________________________________________________


In [7]:
from keras.callbacks import Callback, EarlyStopping, CSVLogger, ReduceLROnPlateau, ModelCheckpoint
from sklearn.model_selection import train_test_split
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt

# split into train and val data
# A fixed random_state makes the 70/30 split identical on every run, so losses
# from different runs are comparable.
SPLIT_SEED = 42
X_train_labels, X_val_labels, y_train_labels, y_val_labels = train_test_split(
    X_labels, y_labels, test_size=0.3, random_state=SPLIT_SEED)

# Small debugging subset (use the full lengths for a real run). Clamped so the
# counts can never exceed the rows that exist in each split.
nb_train_samples = min(300, len(X_train_labels))
nb_val_samples = min(300, len(X_val_labels))
batch_size = 100 # 128

class PlotHistory(Callback):
    """Keras callback that re-plots every logged quantity after each epoch.

    The values found in `logs` at the end of each epoch are accumulated per
    key and drawn as one line per key on a symlog y axis. The figure is saved
    to "<run_name>_plot.jpg", overwriting the previous epoch's file.
    """

    def __init__(self, run_name):
        # let the base class set up its own attributes
        super(PlotHistory, self).__init__()
        self.run_name = run_name
        # initialised here too so on_epoch_end works even if on_train_begin
        # was never invoked
        self.epoch = []
        self.history = {}

    def on_train_begin(self, logs=None):
        """Forget anything recorded during a previous training run."""
        self.epoch = []
        self.history = {}

    def on_epoch_end(self, epoch, logs=None):
        """Record this epoch's logs and rewrite the plot file."""
        logs = logs or {}
        self.epoch.append(epoch)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        # one line per logged quantity; sorted so legend order and line
        # colours stay the same from epoch to epoch
        loss_handles = []
        for key in sorted(self.history):
            l, = plt.plot(self.history[key], label=key)
            loss_handles.append(l)

        plt.title('losses and metrics for {}'.format(self.run_name))
        plt.ylabel('loss')
        plt.yscale('symlog')
        plt.legend(handles=loss_handles, fontsize=6)

        # write the figure, then clear it so the next epoch starts empty
        plt.savefig('{}_plot.jpg'.format(self.run_name))
        plt.clf()

# callbacks
model_name = 'lstm_512_simple'
checkpointer = ModelCheckpoint(filepath='{}_weights.hdf5'.format(model_name), verbose=1, save_best_only=True)
csv_logger = CSVLogger('{}.log'.format(model_name))
# NOTE(review): early stopping and LR reduction both wait 5 epochs on val_loss,
# so a reduced learning rate gets little or no chance to help before training
# stops -- confirm whether the LR patience should be shorter.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
plot_history = PlotHistory(model_name)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5,
                                              verbose=1, epsilon=1e-4, min_lr=1E-6) 

# train
# Step counts use floor division so they are integers regardless of how `/`
# behaves for ints in the running Python version.
history = model.fit_generator(
    sampling_generator(nb_train_samples, batch_size), 
    steps_per_epoch=nb_train_samples // batch_size,
    epochs=2, #10,
    verbose=1,
    validation_data=sampling_generator(nb_val_samples, batch_size, validation=True),
    validation_steps=nb_val_samples // batch_size,
    callbacks=[early_stopping, reduce_lr, csv_logger, checkpointer, plot_history])
    
sleep(0.1) # https://github.com/fchollet/keras/issues/2110 

# report the first epoch's losses; single-argument print(...) gives the same
# output on Python 2 and 3
hist = history.history
print('Loss and val_loss is {} {}'.format(hist['loss'][0], hist['val_loss'][0]))
print(history)

Epoch 1/2

KeyboardInterrupt: 

In [None]:
# Generate text greedily, one character at a time, starting from a seed.
seed_string = "#9 I"
original_len = len(seed_string)  # number of real (non-padding) characters

# The model was built with a fixed input length of maxlen+1, so the seed is
# right-padded to that length with the padding symbol and no stop symbol.
# NOTE(review): this replaces a call to pad_sentence(seed_string, stop=False),
# a helper that is not defined anywhere in the visible notebook; the padding
# behaviour is inferred from the model input shape -- confirm.
seq_len = maxlen + 1
seed_string = seed_string + padding_symbol * (seq_len - original_len)
print('seed string --> {} {}'.format(seed_string, len(seed_string)))
print('The generated text is')

generated_from = original_len
# Stop once the fixed-length buffer is full; growing it further would no
# longer match the model's input shape.
while original_len < seq_len:
    x = np.zeros((1, seq_len, len(chars)))
    for t, char in enumerate(seed_string):
        x[0, t, char_indices[char]] = 1.
    preds = model.predict(x, verbose=0)[0]

    # The target sequence is the input shifted by one character, so the
    # output at the last real position is the prediction for the next
    # character. Reading the last padded position would only tell us what
    # follows padding.
    next_index = np.argmax(preds[original_len - 1])
    next_char = indices_char[next_index]

    # overwrite the first padding slot with the predicted character
    seed_string = seed_string[:original_len] + next_char + seed_string[original_len+1:]
    original_len += 1

    # the message is complete once the model emits the stop (or padding) symbol
    if next_char in (stop_symbol, padding_symbol):
        break

prediction = seed_string[generated_from:original_len]
print(seed_string[:original_len])