In [16]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, TimeDistributed, Masking, LSTM, GRU
from keras.callbacks import Callback, EarlyStopping, CSVLogger, ReduceLROnPlateau, ModelCheckpoint
from keras.utils import to_categorical
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from time import sleep
import csv
import numpy as np

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import sys 
# reload(sys)

# Run configuration read by the cells below.
model_name = 'gru_512_charchar_baseline'  # prefix of the checkpoint, log and plot file names
MAX_LEN, WINDOW_SIZE = 280, 10  # padded sample length (a stop-symbol slot is added on top), sub-sentence length
LSTM_MODEL = False  # True builds the LSTM network, False the GRU one

# Create dataset

In [4]:
# Set to False to train on the dataset that does not use the CUSTOM_NAME
# token for tagged Facebook names.
custom_names = True

comments_filepath = (
    "csv_data/CUSTOM_NAMES/custom_name_token_comments.csv"
    if custom_names
    else "csv_data/FILTERED_NAMES/filtered_names_comments.csv"
)
status_filepath = "csv_data/beaverconfessions_facebook_statuses.csv"

# Appended to every message so the model can learn where a message ends.
stop_symbol = "`"

def load_dataset(filepath, dictionary, datatype):
    """
    Read the messages of one CSV file.

    filepath   -- CSV file to read (parsed with csv.DictReader).
    dictionary -- set of characters; updated in place with every character
                  found in the messages, and also returned.
    datatype   -- column prefix; the message column is "<datatype>_message".

    Returns (dictionary, sentences) where each sentence is a message with
    the stop symbol appended.
    """
    column = "{}_message".format(datatype)
    sentences = []

    with open(filepath, "rU") as csvfile:
        for row in csv.DictReader(csvfile):
            if column in row:
                messages = (row[column],)
            else:
                # NOTE(review): the original comment here said two rows were
                # being read at a time.  What this expression actually yields
                # is ONE (field name, value) pair of the row, so the first
                # "message" is a column key and the second its value.
                # Confirm the header of the statuses CSV and replace this
                # fallback with a lookup of the real message column.
                messages = row.items()[1]

            for message in messages:
                # record the characters before the stop symbol is attached
                dictionary.update(message)
                sentences.append(message + stop_symbol)

    return dictionary, sentences
    
# Read both CSVs into one corpus; `dictionary` accumulates every character seen.
dictionary, comments = load_dataset(comments_filepath, set([]), "comment")
dictionary, statuses = load_dataset(status_filepath, dictionary, "status")
sentences = comments + statuses
sentences_arr = np.array(sentences)

# shuffle statuses and comments
random_permutation = np.random.permutation(len(sentences))
sentences_arr = sentences_arr[random_permutation]

print('Sample sentences:')
for sample_sentence in sentences_arr[:2]:
    print('- {}'.format(sample_sentence))
print('')

# create vocabulary of characters found in data
# Index 0 is reserved for padding and index 1 for the stop symbol.  The
# reserved symbols are removed from the data characters before the list is
# built: if either one also occurred in a message it would otherwise appear
# twice in `chars`, its dict entry would point at the later position, and
# the largest index would be >= nb_classes (breaking one-hot encoding).
# Consequence: a literal "{" or "`" in the data is encoded as padding / stop.
padding_symbol = "{"
reserved_symbols = [padding_symbol, stop_symbol]
chars = reserved_symbols + sorted(dictionary.difference(reserved_symbols))

char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
nb_classes = len(chars)
print(char_indices)


# TODO separate model_related values and preprocessing code

def convert_sentences_to_subsentences(sentences, step_size=1, window_size=None):
    """
    Cut every sentence into fixed-length windows to create multiple samples.

    X = sub_sentences, Y = next_sub_sentences.
    Y[k] is the window that starts exactly one character after X[k]
    (always one character, independent of step_size).

    sentences   -- iterable of strings.
    step_size   -- distance between the starts of consecutive X windows.
    window_size -- length of each window; defaults to the module-level
                   WINDOW_SIZE when omitted.

    Sentences with no more than window_size characters yield no samples,
    because a full X window plus one following character is required.

    Returns (sub_sentences, next_sub_sentences), two lists of equal length.
    """
    if window_size is None:
        window_size = WINDOW_SIZE

    sub_sentences = []
    next_sub_sentences = []

    for sentence in sentences:
        # stop early enough that the shifted window is still full length
        for start in range(0, len(sentence) - window_size, step_size):
            sub_sentences.append(sentence[start:start + window_size])
            next_sub_sentences.append(sentence[start + 1:start + 1 + window_size])

    return sub_sentences, next_sub_sentences

sub_sentences, next_sub_sentences = convert_sentences_to_subsentences(sentences)
nb_samples = len(sub_sentences)

# any values not filled in later represent padding
# extra 1 represents space for the stop symbol
X_labels = np.zeros((nb_samples, MAX_LEN + 1))
y_labels = np.zeros((nb_samples, MAX_LEN + 1))

# Tokenize each sample (vocabulary character -> vocabulary index) and write
# the tokens into the zero-filled label matrices, so the end result is the
# tokenized, padded samples.
for sample_nb, (x_text, y_text) in enumerate(zip(sub_sentences, next_sub_sentences)):
    x_label = [char_indices[char] for char in x_text]
    y_label = [char_indices[char] for char in y_text]

    X_labels[sample_nb, :len(x_label)] = x_label
    y_labels[sample_nb, :len(y_label)] = y_label

# single formatted string: a two-argument print(...) shows a tuple on Python 2
print('# training samples: {}'.format(nb_samples))

In [13]:
# Batch helpers: validate the integer-encoded samples and one-hot encode them batch by batch for training.

def validate_data(X, y):
    """Assert that every sample (axis-0 entry) of X, then of y, has a value above zero along axis 1."""
    for encoded in (X, y):
        has_positive_entry = np.any(encoded > 0, axis=1)
        assert has_positive_entry.all()

def sampling_generator(epoch_size, batch_size, validation=False, sample_weights=False):
    """
    Endlessly yield one-hot encoded batches for Keras generator training.

    Takes in labels of int values, e.g. [1. 2. 5. 9. 1. 0. 0. ...]
    These labels are already padded with 0s to a predetermined maxlen.
    Each batch of X and y is converted to its one-hot version here;
    batch generation is necessary to avoid MemoryError.

    epoch_size     -- number of leading samples to cycle through.
    batch_size     -- samples per batch; only full batches are produced, so
                      a trailing remainder of epoch_size is never yielded.
    validation     -- read X_val_labels / y_val_labels instead of
                      X_train_labels / y_train_labels (module-level arrays,
                      looked up when iteration starts).
    sample_weights -- accepted for interface compatibility; it is not
                      consulted, per-timestep weights are always yielded.

    Yields (sample_X, sample_Y, weights): the one-hot arrays are reshaped to
    (batch, MAX_LEN+1, nb_classes) and weights is (batch, MAX_LEN+1) with 0
    where the input token is 0 (padding) and 1 elsewhere.

    Raises ValueError (on first iteration) when epoch_size is smaller than
    batch_size, since no full batch exists and the loop would otherwise spin
    forever without yielding.
    """
    if validation:
        X_source, y_source = X_val_labels, y_val_labels
    else:
        X_source, y_source = X_train_labels, y_train_labels

    nb_batches = int(epoch_size // batch_size)
    if nb_batches < 1:
        raise ValueError(
            'epoch_size ({}) must be at least batch_size ({})'.format(epoch_size, batch_size))

    while True:
        # hand off data in batches
        for batch_nb in range(nb_batches):
            start = batch_nb * batch_size
            end = min(start + batch_size, epoch_size)

            xs = X_source[start:end]
            ys = y_source[start:end]
            # size the reshape from the slice actually obtained
            true_batch_size = len(xs)
            batch_shape = (true_batch_size, MAX_LEN + 1, nb_classes)

            # one hot encode
            sample_X = to_categorical(xs, num_classes=nb_classes).reshape(batch_shape)
            sample_Y = to_categorical(ys, num_classes=nb_classes).reshape(batch_shape)

            # weight 0 for padded timesteps, 1 for real characters
            batch_weights = (xs != 0).astype('float64')

            yield (sample_X, sample_Y, batch_weights)

# Build and run model

In [14]:
# Both variants stack two 512-unit recurrent layers that return full
# sequences and end in a per-timestep softmax over the vocabulary.
sequence_shape = (MAX_LEN + 1, nb_classes)

if not LSTM_MODEL:
    print('Building GRU model...')
    model = Sequential([
        GRU(512, return_sequences=True, input_shape=sequence_shape),
        GRU(512, return_sequences=True),
        Dense(nb_classes, activation='softmax'),
    ])
else:
    print('Building LSTM model...')
    model = Sequential([
        LSTM(512, return_sequences=True, input_shape=sequence_shape),
        LSTM(512, return_sequences=True),
        TimeDistributed(Dense(nb_classes)),
        Activation('softmax'),
    ])

# "temporal" sample weights: one weight per timestep of every sample.
# if we want to apply sample weights to metrics, we need to specify weighted_metrics=[list of metrics]
model.compile(loss='categorical_crossentropy', optimizer=Adam(clipnorm=1.0), sample_weight_mode="temporal")

print('Model is made!')

In [15]:
# Print the layer-by-layer summary of the model built in the previous cell.
model.summary()

In [6]:
# split into train and val data (70% / 30%)
# NOTE(review): the split is made over individual windows, and windows cut
# from the same sentence overlap, so validation windows share most of their
# characters with training windows -- confirm this is acceptable.
(X_train_labels, X_val_labels,
 y_train_labels, y_val_labels) = train_test_split(X_labels, y_labels, test_size=0.3)
nb_train_samples, nb_val_samples = len(X_train_labels), len(X_val_labels)
batch_size = 128

class PlotHistory(Callback):
    """
    Keras callback that redraws a plot of every logged value after each epoch.

    The plot is saved to "<run_name>_plot.jpg", overwriting the previous one.
    """

    def __init__(self, run_name):
        # let the base class set up its own attributes
        super(PlotHistory, self).__init__()
        self.run_name = run_name
        # defined here as well so the attributes exist before training starts
        self.epoch = []
        self.history = {}

    def on_train_begin(self, logs=None):
        """Reset the recorded history at the start of every training run."""
        self.epoch = []
        self.history = {}

    def on_epoch_end(self, epoch, logs=None):
        """Record this epoch's logged values and rewrite the plot file."""
        logs = logs or {}
        self.epoch.append(epoch)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        # Draw on a dedicated figure so figures owned by other code are
        # neither drawn on nor cleared by this callback.
        fig, ax = plt.subplots()

        # one line per logged key
        loss_handles = []
        for key in sorted(self.history):
            line, = ax.plot(self.history[key], label=key)
            loss_handles.append(line)

        ax.set_title('Losses and metrics for {}'.format(self.run_name))
        ax.set_ylabel('loss')
        ax.set_yscale('symlog')
        ax.legend(handles=loss_handles, fontsize=6)

        fig.savefig('{}_plot.jpg'.format(self.run_name))
        # release the figure; one is created per epoch
        plt.close(fig)

# callbacks
checkpointer = ModelCheckpoint(filepath='{}.hdf5'.format(model_name), verbose=1, save_best_only=True)
csv_logger = CSVLogger('{}.log'.format(model_name))
early_stopping = EarlyStopping(min_delta=0.001, monitor='val_loss', patience=5, verbose=1)
plot_history = PlotHistory(model_name)
# The LR-reduction patience must be shorter than the early-stopping patience.
# With both at 5, training is stopped on the same plateau that would trigger
# the reduction, so the lower learning rate never gets a chance to help.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2,
                              verbose=1, epsilon=1e-4, min_lr=1E-6)

# train
# Floor division keeps the step counts integral regardless of the division
# semantics in effect; the generators only produce full batches.
history = model.fit_generator(
    sampling_generator(nb_train_samples, batch_size),
    steps_per_epoch=nb_train_samples // batch_size,
    epochs=50,
    verbose=1,
    validation_data=sampling_generator(nb_val_samples, batch_size, validation=True),
    validation_steps=nb_val_samples // batch_size,
    # reduce_lr stays ahead of csv_logger so the logger sees the value it adds to the logs
    callbacks=[early_stopping, reduce_lr, csv_logger, checkpointer, plot_history])

# sleep(0.1) # https://github.com/fchollet/keras/issues/2110

hist = history.history
# NOTE(review): index 0 reports the FIRST epoch's values -- confirm that the
# final epoch ([-1]) was not intended.
print('Loss and val_loss is {} {}'.format(hist['loss'][0], hist['val_loss'][0]))

print(history)

KeyboardInterrupt: 

# Load model

In [2]:
# Load the model stored at "<model_name>.hdf5" -- the same path the training
# cell's ModelCheckpoint writes to -- so later cells can run without retraining.
from keras.models import load_model
model = load_model('{}.hdf5'.format(model_name))

# Calculate and plot perplexities

In [50]:
epoch_X = []
train_perplex = []
val_perplex = []

# Read the training log by column NAME: positional indexing silently picks
# the wrong column whenever the set of logged values changes (for example
# when no learning-rate column is present).
with open(model_name + '.log', 'r') as f:
    for row in csv.DictReader(f):
        epoch_X.append(int(row['epoch']))
        # The logged cross-entropy is measured in nats (natural log), so
        # perplexity is exp(loss), not 2**loss.
        train_perplex.append(np.exp(float(row['loss'])))
        val_perplex.append(np.exp(float(row['val_loss'])))

fig, ax = plt.subplots()

ax.plot(epoch_X, train_perplex)
ax.plot(epoch_X, val_perplex)
ax.legend(['Training perplexity', 'Validation perplexity'], loc='upper right')
ax.set_title("Change in perplexity over epochs")
ax.set_xlabel('epoch')
ax.set_ylabel('perplexity')
fig.savefig('{}_perplexity.jpg'.format(model_name))
plt.close(fig)

# Make prediction

In [52]:
def convert_sentence_to_ohe(sentence):
    """
    One-hot encode a single string as a batch of one sample.

    Each character is mapped through char_indices, the indices are written
    into a zero-filled vector of length MAX_LEN+1 (unfilled positions keep
    index 0), and the vector is one-hot encoded over nb_classes classes.
    The result gets a new leading axis of size 1.
    """
    token_ids = [char_indices[char] for char in sentence]

    padded_ids = np.zeros(MAX_LEN + 1)
    padded_ids[:len(token_ids)] = token_ids

    one_hot = to_categorical(padded_ids, num_classes=nb_classes)
    return one_hot[np.newaxis, ...]

try:
    # current home of logsumexp; it is no longer available from scipy.misc
    from scipy.special import logsumexp
except ImportError:
    # older SciPy releases only provide it here
    from scipy.misc import logsumexp

def log_softmax(vec):
    """
    Return log(softmax(vec)), normalised over ALL elements of vec.

    Computed as vec - logsumexp(vec), which stays finite for large inputs.
    Accepts any array-like (lists included).
    """
    vec = np.asarray(vec)
    return vec - logsumexp(vec)

def softmax(vec):
    """Return softmax(vec) over all elements of vec; the result sums to 1."""
    return np.exp(log_softmax(vec))

def _sample_token(preds, temperature):
    """Draw one token index from the probability vector preds, sharpened or flattened by temperature."""
    preds = np.asarray(preds).astype('float64')
    with np.errstate(divide='ignore'):
        # zero probabilities become -inf here and 0 again after exp()
        logits = np.log(preds) / temperature
    # shifting by the maximum leaves the distribution unchanged but keeps
    # exp() from overflowing at low temperatures
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    return int(np.argmax(np.random.multinomial(1, probs, 1)))


def generate_confession(model, seed_string, temperature=0.5, sample=False, verbose=True):
    """
    Extend seed_string one character at a time using the model's predictions.

    model       -- object whose predict() takes the output of
                   convert_sentence_to_ohe and returns per-position
                   probabilities over the vocabulary.
    seed_string -- starting text; kept unchanged at the front of the result.
    temperature -- sampling temperature, used only when sample is True.
    sample      -- True: draw each character from the temperature-scaled
                   distribution.  False: take the most probable character.
    verbose     -- print the progress lines after every generated character.

    Only the last WINDOW_SIZE characters are fed to the model.  Generation
    ends when the stop symbol is predicted or the text reaches MAX_LEN
    characters.  Returns the seed plus everything generated.
    """
    stop_token = char_indices[stop_symbol]

    final_str = seed_string
    # Trim the seed up front: a longer seed would otherwise never start
    # sliding, and its trailing characters would be ignored when predicting.
    window_str = seed_string[-WINDOW_SIZE:]

    for char_nb in range(len(seed_string), MAX_LEN):
        x = convert_sentence_to_ohe(window_str)

        # y is x shifted by one, so the output at the position of the last
        # input character is the prediction for the next character.  An
        # empty window reads position 0 instead of wrapping around to -1.
        last_position = max(len(window_str) - 1, 0)
        # predict() returns (1, MAX_LEN+1, nb_classes); pick one row of probabilities
        preds = model.predict(x)[0, last_position, :]

        if sample:
            next_token = _sample_token(preds, temperature)
        else:
            next_token = int(np.argmax(preds))

        # stop symbol
        if next_token == stop_token:
            break

        next_char = indices_char[next_token]
        if verbose:
            print('current string: {}'.format(window_str))

        if len(window_str) >= WINDOW_SIZE:
            # window is full: drop the oldest character(s), append the new one
            window_str = (window_str + next_char)[-WINDOW_SIZE:]
            if verbose:
                print("\n new window_str: {}".format(window_str))
        else:
            window_str += next_char

        final_str += next_char

        if verbose:
            print("final str: {}".format(final_str))

    return final_str


# Seeds of increasing length and variety used to probe the trained model.
seed_strings = [
    "", "#", "#1", "#12", "#124", "#1246", "#12465", "#9999",
    "#1 Hi", "#12 Hi", "#124 Hi", "#1246 Hi", "#1293 Hi m",
    "#2568 ML", "#3476 Hixz", "#1321 I'm", "#2346 :)", "#5876 7:30am",
    "#9235 6.01", "#1246 6.867",
]

# One generated confession per line.
# NOTE(review): the file name says "greedy" but sample=True is passed, so
# the text is sampled at temperature 0.5 -- confirm which was intended.
with open('greedy_predictions_gru_baseline_charchar_softmax_temp_0.5.txt', 'w') as f:
    for seed_string in seed_strings:
        print(seed_string)
        confession = generate_confession(model, seed_string, temperature=0.5, sample=True)
        f.write(confession + "\n")
        