# INITIAL PARAMETERS

In [21]:
from keras.layers import Dense, Activation, Dropout, LSTM, TimeDistributed,Masking 
from keras.models import Sequential
from nltk.tokenize import wordpunct_tokenize
from sklearn.model_selection import train_test_split
from time import sleep
import csv
import numpy as np

# settings
# NOTE: these must be assigned before the parameters below, because MAX_LEN
# is derived from CHAR_BY_CHAR.
CHAR_BY_CHAR = False # True if doing char by char
CUSTOM_NAMES = True # set as True to use CUSTOM_NAME token (and not empty str) for tagged Facebook names
MODEL_NAME = 'lstm_512_test_2' # name models descriptively, e.g. with param sizes like lstm_512_hidden

# parameters
MAX_LEN = 280 if CHAR_BY_CHAR else 80 # found based on datasets
WINDOW_SIZE = 10 # number of elements in the sliding window used for sub-sentences and generation

# paths
STATUS_FILEPATH = "csv_data/beaverconfessions_facebook_statuses.csv"

# symbols
STOP_SYMBOL = "`" # appended to the end of every loaded sentence

In [24]:
# The comments CSV comes from the filtering done in the adapted Facebook post
# scraper; which variant is read depends on how tagged names were handled.
comments_filepath = (
    "csv_data/CUSTOM_NAMES/custom_name_token_comments.csv"
    if CUSTOM_NAMES
    else "csv_data/FILTERED_NAMES/filtered_names_comments.csv"
)
        
def _split_into_elts(msg):
    """Return the training units of msg: its characters, or its NLTK tokens."""
    if CHAR_BY_CHAR:
        return msg
    # use NLTK to get separations of words and numbers
    # http://www.nltk.org/_modules/nltk/tokenize.html (install NLTK first!)
    return wordpunct_tokenize(msg)


def load_dataset_and_dictionary(filepath, dictionary, datatype, max_nb_words):
    """
    Read individual sentences into memory, and generate dictionary.

    If CHAR_BY_CHAR, the dictionary will hold single characters,
        e.g. a-zA-Z and punctuation.
    Else, it will contain whole words and Unicode 'phrases',
        e.g. :\'(, as split by NLTK's tokenizer.

    Args:
        filepath: path of the CSV file to read (parsed with csv.DictReader).
        dictionary: set of elements seen so far; it is updated in place and
            also returned.
        datatype: prefix of the message column, e.g. "comment" selects the
            column "comment_message".
        max_nb_words: running maximum of tokens per sentence. It is only
            updated when CHAR_BY_CHAR is False.

    Returns:
        (dictionary, sentences, max_nb_words), where sentences is the list of
        raw messages, each with STOP_SYMBOL appended.
    """
    dt = "{}_message".format(datatype)
    sentences = []

    with open(filepath, "rU") as csvfile:
        for status in csv.DictReader(csvfile):
            # we distinguish between comments and statuses with header cols in the CSVs
            if dt in status:
                msgs = [status[dt]]
            else:
                # Fallback kept from the original "monkey patch": the row has
                # no `dt` column, so the second (column name, value) pair of
                # the row is taken and BOTH members are treated as messages.
                # NOTE(review): the first member is a column name, not a
                # message -- confirm the header of the CSV being read.
                msgs = list(list(status.items())[1])

            for msg in msgs:
                # elts are the unit we are training on, e.g. char vs. word
                elts_in_sentence = _split_into_elts(msg)

                if not CHAR_BY_CHAR:
                    # keep track of number of words in longest sentence
                    max_nb_words = max(max_nb_words, len(elts_in_sentence))

                dictionary.update(elts_in_sentence)

                # add stop symbol to end of sentence and keep it in local memory
                sentences.append(msg + STOP_SYMBOL)

    return dictionary, sentences, max_nb_words
    
# Load the comments first and the statuses second, threading one shared
# vocabulary set and the running longest-sentence length through both calls.
print("{} {}".format(comments_filepath, STATUS_FILEPATH))

max_nb_words = 0
dictionary = set([])
dictionary, comments, max_nb_words = load_dataset_and_dictionary(
    comments_filepath, dictionary, "comment", max_nb_words)
dictionary, statuses, max_nb_words = load_dataset_and_dictionary(
    STATUS_FILEPATH, dictionary, "status", max_nb_words)

# sentences = []
# sentences.extend(comments)
# sentences.extend(statuses)
# sentences_arr = np.array(sentences)

# # shuffle statuses and comments
# random_permutation = np.random.permutation(len(sentences))
# sentences_arr = sentences_arr[random_permutation]

# print 'Sample sentences:'
# print '-', sentences_arr[0]
# print '-', sentences_arr[1]
# print

# # create vocabulary of characters found in data
# chars = sorted(list(dictionary))
# padding_symbol = "{"
# chars.insert(0, stop_symbol) # index 1
# chars.insert(0, padding_symbol) # index 0 
# # print chars

# # print('total chars:', len(chars))
# char_indices = dict((c, i) for i, c in enumerate(chars))
# char_indices = dict((c, i+1) for i, c in enumerate(chars))
# indices_char = {v: k for k, v in char_indices.iteritems()}
# # indices_char = dict((i, c) for i, c in enumerate(chars))
# nb_classes = len(char_indices)
# print char_indices


# # TODO separate model_related values and preprocessing code

# def convert_sentences_to_subsentences(sentences, step_size=1):
#     """
#     X = sub_sentences, Y = next_sub_sentences
#     Y is simply X shifted over by step_size.
#     We want sub sentences per sentence to create multiple samples.
#     """
#     sub_sentences = []
#     next_sub_sentences = []
    
#     for sentence in sentences:
#         for i in range(0, len(sentence) - WINDOW_SIZE, step_size):
#             sub_sentences.append(sentence[i : i+WINDOW_SIZE])
#             next_sub_sentences.append(sentence[(i+1) : (i+1)+WINDOW_SIZE])
    
#     return sub_sentences, next_sub_sentences

# sub_sentences, next_sub_sentences = convert_sentences_to_subsentences(sentences)
# nb_samples = len(sub_sentences)

# # any values not filled in later represent padding 
# # extra 1 represents space for the stop symbol
# X_labels = np.zeros((nb_samples, MAX_LEN+1))
# y_labels = np.zeros((nb_samples, MAX_LEN+1))

# for sample_nb in range(nb_samples):
#     """
#     We tokenize each sample (go from vocab to indices in vocab).
#     We populate the zero-filled label matrices from above with the tokens,
#         such that the end result are the tokenized, padded samples.
#     """
#     x_label = map(lambda x: char_indices[x], sub_sentences[sample_nb]) 
#     y_label = map(lambda x: char_indices[x], next_sub_sentences[sample_nb])

#     X_labels[sample_nb][:len(x_label)] = x_label
#     y_labels[sample_nb][:len(y_label)] = y_label

# # print X_labels[:1], y_labels[:1]
 
# print('# training samples:', nb_samples)

csv_data/CUSTOM_NAMES/custom_name_token_comments.csv csv_data/beaverconfessions_facebook_statuses.csv
78


In [26]:
# Show the vocabulary collected by load_dataset_and_dictionary.
print(dictionary)



# BATCH GENERATOR

In [11]:
from keras.utils import to_categorical
# import sys 

# reload(sys)
# make use of subsentences, next_chars, which are our words (semantic, not encoded yet)

def validate_data(X, y):
    """
    Sanity-check encoded arrays.

    For X and then y: reduce `array > 0` with any() along axis 1 and assert
    that the reduced array contains no False entry.
    Raises AssertionError on the first array that fails.
    """
    for encoded in (X, y):
        has_positive = np.any(encoded > 0, axis=1)
        assert has_positive.all()

def sampling_generator(epoch_size, batch_size, validation=False, sample_weights=False):
    """
    Endlessly yield one-hot encoded training (or validation) batches.

    Takes in labels of int values, e.g. [1. 2. 5. 9. 1. 0. 0. ...]
    These labels are already padded with 0s to a predetermined maxlen.
    We convert each sample X and y to one hot versions, and create batches.
    Batch generation is necessary to avoid MemoryError.

    Args:
        epoch_size: number of samples to cycle through; only the first
            (epoch_size // batch_size) * batch_size rows of the label
            matrices are used, a trailing partial batch is dropped.
        batch_size: number of samples per yielded batch.
        validation: read X_val_labels / y_val_labels instead of
            X_train_labels / y_train_labels.
        sample_weights: accepted for backward compatibility; currently
            unused. Padding-mask weights are always yielded.
            TODO fill in with reactions if so desired :)

    Yields:
        (sample_X, sample_Y, weights) where sample_X and sample_Y are reshaped
        to (batch_size, MAX_LEN+1, nb_classes) and weights is reshaped to
        (batch_size, MAX_LEN+1), holding 0.0 wherever the X label is 0
        (padding) and 1.0 elsewhere.

    Raises:
        ValueError: if batch_size < 1 or epoch_size < batch_size. Without
            this check the loop below would spin forever without yielding.
    """
    epoch_size = int(epoch_size)
    batch_size = int(batch_size)  # slicing below needs integer bounds
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1, got {}".format(batch_size))

    nb_batches = epoch_size // batch_size
    if nb_batches < 1:
        raise ValueError(
            "epoch_size ({}) must be >= batch_size ({})".format(epoch_size, batch_size))

    if validation:
        X_labels, y_labels = X_val_labels, y_val_labels
    else:
        X_labels, y_labels = X_train_labels, y_train_labels

    seq_len = MAX_LEN + 1  # extra 1 represents space for the stop symbol

    while True:
        # hand off data in batches
        for i in range(nb_batches):
            start = i * batch_size
            end = start + batch_size  # never exceeds epoch_size, see nb_batches

            xs = X_labels[start:end]
            ys = y_labels[start:end]

            # one hot encode
            sample_X = to_categorical(xs, num_classes=nb_classes).reshape(
                (batch_size, seq_len, nb_classes))
            sample_Y = to_categorical(ys, num_classes=nb_classes).reshape(
                (batch_size, seq_len, nb_classes))

            # temporal sample weights: ignore padded positions in the loss
            batch_weights = (np.asarray(xs) != 0).astype(np.float64)
            batch_weights = batch_weights.reshape(batch_size, seq_len)

            yield (sample_X, sample_Y, batch_weights)

# DEFINE MODEL ARCHITECTURE

In [4]:
# Architecture: two stacked 512-unit LSTMs that both return full sequences,
# followed by a per-timestep Dense projection onto nb_classes and a softmax.
# (A Masking input layer and a Dropout(0.2) layer are left disabled.)
print('Building model...')
model = Sequential([
    LSTM(512, return_sequences=True, input_shape=(MAX_LEN+1, nb_classes)),
    LSTM(512, return_sequences=True),
    TimeDistributed(Dense(nb_classes)),
    Activation('softmax'),
])

# "temporal" sample weights let the generator weight each timestep separately.
# To apply sample weights to metrics as well, pass weighted_metrics=[list of metrics].
model.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    sample_weight_mode="temporal",
)
print('Model is made!')

Building model...
Model is made!


In [40]:
# Print the layer-by-layer summary of the current `model`.
model.summary()

# TRAIN MODEL

In [44]:
from keras.callbacks import Callback, EarlyStopping, CSVLogger, ReduceLROnPlateau, ModelCheckpoint
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import sys 

reload(sys)

# number of train / val samples consumed per epoch, and the batch size
nb_train_samples = 200 #40000 #len(X_train_labels) #300
nb_val_samples = 200 #40000 #len(X_val_labels) #300
# floor division keeps batch_size an int even under true division
batch_size = nb_val_samples // 2 #128

class PlotHistory(Callback):
    """
    Keras callback that re-draws a plot of every logged quantity after each epoch.

    After every epoch, all keys found in `logs` so far are plotted as one line
    each on a symlog y-axis, and the figure is saved to '<run_name>_plot.jpg',
    overwriting the previous file.
    """

    def __init__(self, run_name):
        """run_name is used in the plot title and in the output file name."""
        super(PlotHistory, self).__init__()
        self.run_name = run_name
        # initialised here as well so on_epoch_end is safe to call directly
        self.epoch = []
        self.history = {}

    def on_train_begin(self, logs=None):
        """Reset the recorded history at the start of every training run."""
        self.epoch = []
        self.history = {}

    def on_epoch_end(self, epoch, logs=None):
        """Record this epoch's logs and write the updated plot to disk."""
        logs = logs or {}
        self.epoch.append(epoch)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        # one line per logged quantity (losses and any other logged value)
        loss_handles = []
        for key in self.history:
            l, = plt.plot(self.history[key], label=key)
            loss_handles.append(l)

        plt.title('losses and metrics for {}'.format(self.run_name))
        plt.ylabel('loss')
        plt.yscale('symlog')
        plt.legend(handles=loss_handles, fontsize=6)

        # save, then clear the current figure so the next epoch starts clean
        plt.savefig('{}_plot.jpg'.format(self.run_name))
        plt.clf()

# callbacks -- every file written during training is named after MODEL_NAME
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

# NOTE(review): newer Keras releases renamed `epsilon` to `min_delta` --
# confirm against the installed version.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    verbose=1,
    epsilon=1e-4,
    min_lr=1E-6,
)

csv_logger = CSVLogger('{}.log'.format(MODEL_NAME))
checkpointer = ModelCheckpoint(
    filepath='{}.hdf5'.format(MODEL_NAME),
    verbose=1,
    save_best_only=True,
)
plot_history = PlotHistory(MODEL_NAME)

# train
# Step counts must be whole numbers of batches; int(a // b) stays an integer
# even when `/` means true division or batch_size is a float.
train_steps = int(nb_train_samples // batch_size)
val_steps = int(nb_val_samples // batch_size)

history = model.fit_generator(
    sampling_generator(nb_train_samples, batch_size),
    steps_per_epoch=train_steps,
    epochs=2, #10,
    verbose=1,
    validation_data=sampling_generator(nb_val_samples, batch_size, validation=True),
    validation_steps=val_steps,
    callbacks=[early_stopping, reduce_lr, csv_logger, checkpointer, plot_history])

sleep(0.1) # https://github.com/fchollet/keras/issues/2110

# report the loss and validation loss recorded for the first epoch
hist = history.history
print('Loss and val_loss is {} {}'.format(hist['loss'][0], hist['val_loss'][0]))

print(history)

In [45]:
from keras.models import load_model
# Replace the in-memory model with the one stored at '<MODEL_NAME>.hdf5',
# the same path the ModelCheckpoint callback is configured to write to.
model = load_model('{}.hdf5'.format(MODEL_NAME))

# TEXT GENERATION

In [46]:
def convert_sentence_to_ohe(sentence):
    """
    One-hot encode a single sentence for model.predict.

    Each element of `sentence` is looked up in char_indices, the indices are
    written into a zero-filled vector of length MAX_LEN+1 (remaining zeros act
    as padding), and the vector is one-hot encoded with nb_classes classes.

    Args:
        sentence: iterable of elements that are keys of char_indices.

    Returns:
        Array with a leading batch axis of size 1 added in front of the
        one-hot encoding.

    Raises:
        KeyError: if an element is not in char_indices.
        ValueError: if the sentence has more than MAX_LEN+1 elements.
    """
    # a real list (not map) so len() and slice assignment work on Python 2 and 3
    x_label = [char_indices[elt] for elt in sentence]

    if len(x_label) > MAX_LEN + 1:
        raise ValueError(
            "sentence has {} elements, at most {} are supported".format(
                len(x_label), MAX_LEN + 1))

    confession = np.zeros(MAX_LEN+1)
    confession[:len(x_label)] = x_label
    ohe_x = to_categorical(confession, num_classes=nb_classes)
    return np.expand_dims(ohe_x, axis=0)

def generate_confession(model, seed_string, verbose=True):
    """
    Greedily extend seed_string one element at a time using the model.

    At every step the last WINDOW_SIZE elements generated so far are encoded
    with convert_sentence_to_ohe and fed to model.predict. The argmax
    prediction at the window's last filled position is taken as the next
    token (y is x shifted by one during training, so position i of the output
    predicts element i+1). Generation stops when token 1 is predicted or when
    the string reaches MAX_LEN elements.

    Args:
        model: object whose predict(x) returns an array indexed as
            [batch, position, class].
        seed_string: non-empty string to start from; all of it is kept in the
            result, only its last WINDOW_SIZE elements are fed to the model.
        verbose: print the intermediate state of every step (default True,
            matching the previous behaviour).

    Returns:
        seed_string followed by the generated elements.

    Raises:
        ValueError: if seed_string is empty (there is no position to read the
            first prediction from).
    """
    if len(seed_string) == 0:
        raise ValueError("seed_string must contain at least one element")

    # NOTE(review): the stop symbol is assumed to be encoded as index 1 --
    # confirm against how char_indices is built.
    stop_token = 1

    # nb chars to preserve
    orig_len = len(seed_string)
    final_str = seed_string
    # never feed more than one window, even if the seed is longer than that
    window_str = seed_string[-WINDOW_SIZE:]

    for char_nb in range(orig_len, MAX_LEN):
        x = convert_sentence_to_ohe(window_str)

        # get next char
        preds = model.predict(x)[0] # otherwise wrapped in (1,maxlen+1,len(chars))
        best_tokens = np.argmax(preds, axis=1)
        if verbose:
            print("best_tokens: %s" % (best_tokens,))

        # prediction for the element following the last one in the window
        next_token = best_tokens[len(window_str) - 1]
        next_char = indices_char[next_token]

        if verbose:
            print("next_token: %s is  %s" % (next_token, next_char))

        # stop symbol
        if next_token == stop_token:
            break

        if verbose:
            print("current string: %s" % (window_str,))

        if len(window_str) == WINDOW_SIZE:
            # window is full: slide it forward by one element
            window_str = window_str[1:] + next_char
            if verbose:
                print("\n new window_str: %s" % (window_str,))
        else:
            window_str += next_char

        final_str += next_char

        if verbose:
            print("final str: %s" % (final_str,))

    return final_str


# Generate one confession starting from a single "#" and show the result.
seed_string = "#"
generated_confession = generate_confession(model, seed_string)
print(generated_confession)

KeyboardInterrupt: 