# GLOBALS AND INITIAL PARAMETERS

In [28]:
from emot import emo_unicode
from keras.callbacks import Callback, EarlyStopping, CSVLogger, ReduceLROnPlateau, ModelCheckpoint
from keras.layers import Dense, Activation, BatchNormalization, Embedding, Conv1D, Dropout, LSTM, TimeDistributed, GRU, Masking 
from keras.models import load_model, Sequential
from keras.optimizers import Adam
from keras.utils import to_categorical
from nltk.tokenize import word_tokenize #, wordpunct_tokenize
from sklearn.model_selection import train_test_split
from time import sleep
import csv
import matplotlib
import numpy as np
import re
import sys
import string

matplotlib.use('Agg')
from matplotlib import pyplot as plt 

# CHANGE PATH TO MODEL OUTPUTS HERE BASED ON YOUR LOCAL CONFIG
# BASE_PATH = '/Users/goldenzenith/Dropbox (MIT)/6.867_saved_models/'
# BASE_PATH = '6.867_saved_models/'
# FOLDER = 'GRU_WORDWORD/'
PATH = ''#BASE_PATH + FOLDER
print 'Model files will be saved to:', PATH

# plots, logs, weight files will be based on this
# plz name descriptively, e.g. lstm_512_charchar
MODEL_NAME = 'lstm_512_charchar_reacts' 

# approach
CHAR_BY_CHAR = True # True for character-by-character training
CUSTOM_NAMES = True # True to use CUSTOM_NAME token (not '') for tagged FB names

if CUSTOM_NAMES:
    COMMENTS_FILEPATH = "csv_data/CUSTOM_NAMES/custom_name_token_comments.csv"
else:
    COMMENTS_FILEPATH = "csv_data/FILTERED_NAMES/filtered_names_comments.csv"

# paths with data based on filtering done in adapted version of Facebook post scraping
STATUS_FILEPATH = "csv_data/beaverconfessions_facebook_statuses.csv" 

# parameters/settings
# MAX_LEN is defined later
EMBEDDING = False # True to use CNN
if EMBEDDING:
    CONV_NB_FILTERS = 100
    CONV_KERNEL_SIZE = 3
LSTM_MODEL = True # False to use GRU
HIDDEN_UNITS = 512
STEP_SIZE = 1
WINDOW_SIZE = 10 

# sample weights
WEIGHT_BY_REACTIONS = True # False to give all samples same weight
LIKES_ONLY = False # True for only likes, not all reactions

# special symbols
CUSTOM_NAME = '<name>'
CUSTOM_NUMBER = '<number>'
PADDING_SYMBOL = '<padding>'
STOP_SYMBOL = '<stahp>'
CUSTOM_SYMBOLS = [CUSTOM_NAME, CUSTOM_NUMBER, PADDING_SYMBOL, STOP_SYMBOL]

Model files will be saved to: 


# PRE-PROCESSING (TOKENIZATION) 

In [29]:
escaped_punctuation = re.escape(string.punctuation)
# print 'Escaped punctuation:', escaped_punctuation

# insert OR pipe before each punctuation mark
xor_punctuation = '|'.join('{}{}+'.format(escaped_punctuation[x], escaped_punctuation[x+1]) for x in range(0, len(escaped_punctuation), 2))
# print 'Delimited punctuation:', xor_punctuation

# build regex with variable (order matters!)
nm = CUSTOM_NAME.lower()
nb = CUSTOM_NUMBER.lower()
pd = PADDING_SYMBOL.lower()
sp = STOP_SYMBOL.lower()
course_number = '\d+\.\d{2,3}'
multiple_numbers = '\d+'
emoticon_pattern = '|'.join(emoticon for emoticon in emo_unicode.EMOTICONS)
space = '\s'
regex_expr = r'(' + '|'.join([nm, nb, pd, sp, course_number, multiple_numbers, emoticon_pattern, xor_punctuation, space]) + r')'
# print '\nRegex expression:', regex_expr

def replace_digit_with_token(string): 
    return CUSTOM_NUMBER.lower() if string.isdigit() else string

def tokenize_str(string, replace_digits=True):
    """
    NLTK tokenizers (word_tokenize, wordpunct_tokenize) are insufficient, 
    as emoticons and course #s, e.g. 6.111, are important to our dataset.
    
    This is a custom tokenizer to retain such items in the vocab,
        but split up other words containing numbers/punctuation, e.g. 3pm.
        
    Note that these texts are lowercased by default prior to tokenization.
    """
    if CHAR_BY_CHAR:
        tokens = list(string)
        
    else:
        tokens = re.split(regex_expr, string.lower())

        # filter out spaces
        tokens = [token for token in tokens if token not in ["", " "]]

        # tokens with *just* digits are mapped to <NUMBER> by default
        if replace_digits:
            tokens = map(lambda x: replace_digit_with_token(x), tokens)

    return tokens

# attempt to catch major cases in our dataset
sentence = CUSTOM_NAME + '6.867test. @7:30pm in 54-100. The profs, they\'re coming in 3..2..1. LETS DOOO THIS!!! >;)' + STOP_SYMBOL + PADDING_SYMBOL
print 'Test case:', tokenize_str(sentence)

Test case: ['<', 'n', 'a', 'm', 'e', '>', '6', '.', '8', '6', '7', 't', 'e', 's', 't', '.', ' ', '@', '7', ':', '3', '0', 'p', 'm', ' ', 'i', 'n', ' ', '5', '4', '-', '1', '0', '0', '.', ' ', 'T', 'h', 'e', ' ', 'p', 'r', 'o', 'f', 's', ',', ' ', 't', 'h', 'e', 'y', "'", 'r', 'e', ' ', 'c', 'o', 'm', 'i', 'n', 'g', ' ', 'i', 'n', ' ', '3', '.', '.', '2', '.', '.', '1', '.', ' ', 'L', 'E', 'T', 'S', ' ', 'D', 'O', 'O', 'O', ' ', 'T', 'H', 'I', 'S', '!', '!', '!', ' ', '>', ';', ')', '<', 's', 't', 'a', 'h', 'p', '>', '<', 'p', 'a', 'd', 'd', 'i', 'n', 'g', '>']


In [30]:
def load_dataset_and_vocabulary(filepath, vocabulary, datatype, max_nb_tokens, reactions):
    """
    Read individual sentences into memory, and generate vocabulary.
    
    If CHAR_BY_CHAR, the vocabulary will hold single characters, 
        e.g. a-zA-Z and punctuation.
    Else, it will contain whole words and Unicode 'phrases',
        e.g. :\'(, as split by our custom tokenizer.
    """
    dt = "{}_message".format(datatype)
    reaction_header = 'num_likes' if LIKES_ONLY else 'num_reactions'    
    
    with open(filepath, "rU") as csvfile:
        reader = csv.DictReader(csvfile)
        tokens = []
        
        for status in reader:
            # we distinguish between comments and statuses with header cols in the CSVs
            
            if dt not in status:
                # 2 rows are read at once from STATUS_FILEPATH for some reason...
                # handled with monkey patching 
                msg1, msg2 = status.items()[1]
                
                # append stop symbol so model has explicit marker for ending
                msg1 += STOP_SYMBOL
                msg2 += STOP_SYMBOL
                
                # we define elts as the unit we are training on, e.g. char vs. word 
                elts_in_sentence1 = tokenize_str(msg1)
                elts_in_sentence2 = tokenize_str(msg2)
                
                max_nb_tokens = max(max_nb_tokens, max(len(elts_in_sentence1), len(elts_in_sentence2)))
                 
                for elt in elts_in_sentence1:
                    vocabulary.add(elt)
                for elt in elts_in_sentence2:
                    vocabulary.add(elt) 
                    
                # build list of tokens in local memory
                tokens.append(elts_in_sentence1)
                tokens.append(elts_in_sentence2)
            else:
                msg = status[dt]
                nb_reactions = float(status[reaction_header])
                
                msg += STOP_SYMBOL 
                elts_in_sentence = tokenize_str(msg)
                max_nb_tokens = max(max_nb_tokens, len(elts_in_sentence))
                    
                for elt in elts_in_sentence:
                    vocabulary.add(elt)
                    
                tokens.append(elts_in_sentence)
                reactions.append(nb_reactions)
                    
    return vocabulary, tokens, max_nb_tokens, reactions
    
# load comments and statuses separately at first
max_nb_tokens = 0

vocabulary, comment_tokens, max_nb_tokens, comment_reactions = \
    load_dataset_and_vocabulary(COMMENTS_FILEPATH, set([]), "comment", max_nb_tokens, [])

vocabulary, status_tokens, max_nb_tokens, reactions = \
    load_dataset_and_vocabulary(STATUS_FILEPATH, vocabulary, "status", max_nb_tokens, comment_reactions)

MAX_LEN = max_nb_tokens # includes stop symbol

max_nb_reactions = max(reactions)
print '\nMax # of reactions:', max_nb_reactions

print 'Max # of tokens in a sentence:', MAX_LEN
print '\nSample of 10 processed sentences:'
for status in status_tokens[:5]:
    print status
    
# create vocabulary of characters found in data
if not CHAR_BY_CHAR:
    vocabulary.add(CUSTOM_NAME)
    vocabulary.add(CUSTOM_NUMBER)
    
# these symbols did not exist in the original training set,
# so we can safely add them 
# NOTE: changed from insert(0,_), since we were adding duplicates
vocabulary.add(STOP_SYMBOL)
vocabulary.add(PADDING_SYMBOL)
vocabulary = sorted(list(vocabulary))

NB_CLASSES = len(vocabulary)

tokens_indices = dict((t, i) for i, t in enumerate(vocabulary))
indices_tokens = {v: k for k, v in tokens_indices.iteritems()}
PADDING_INDEX = tokens_indices[PADDING_SYMBOL]
STOP_INDEX = tokens_indices[STOP_SYMBOL]

print '\nPadding index:', PADDING_INDEX, 'and stop index:', STOP_INDEX

input_type = "character by character" if CHAR_BY_CHAR else "word by word"
# print "For", input_type, ", we have this vocabulary:", vocabulary, "\n of size", NB_CLASSES


Max # of reactions: 1147.0
Max # of tokens in a sentence: 287

Sample of 10 processed sentences:
['#', '9', '5', '4', '1', ' ', 'I', ' ', 'h', 'a', 'd', ' ', 'f', 'e', 'e', 'l', 'i', 'n', 'g', 's', ' ', 'f', 'o', 'r', ' ', 'y', 'o', 'u', ' ', 'l', 'a', 's', 't', ' ', 'y', 'e', 'a', 'r', ',', ' ', 'b', 'u', 't', ' ', 'I', ' ', 'w', 'a', 's', ' ', 'o', 'n', 'l', 'y', ' ', 'a', ' ', 'f', 'r', 'o', 's', 'h', ' ', 't', 'h', 'e', 'n', '.', ' ', 'N', 'o', 'w', ',', ' ', 'y', 'o', 'u', "'", 'r', 'e', ' ', 'a', ' ', 's', 'e', 'n', 'i', 'o', 'r', ',', ' ', 'a', 'n', 'd', ' ', 'I', ' ', 'd', 'o', 'n', "'", 't', ' ', 'k', 'n', 'o', 'w', ' ', 'i', 'f', ' ', 'y', 'o', 'u', "'", 'd', ' ', 'b', 'e', ' ', 'w', 'i', 'l', 'l', 'i', 'n', 'g', ' ', 't', 'o', ' ', 'm', 'a', 'k', 'e', ' ', 't', 'h', 'e', ' ', 'e', 'm', 'o', 't', 'i', 'o', 'n', 'a', 'l', ' ', 'i', 'n', 'v', 'e', 's', 't', 'm', 'e', 'n', 't', '.', ' ', 'I', ' ', 'k', 'n', 'o', 'w', ' ', 'I', "'", 'd', ' ', 'b', 'e', ' ', 'w', 'i', 'l', 'l', '

# BUILD TRAINING SETS

In [31]:
# merge comments and statuses
tokens = []
tokens.extend(comment_tokens)
tokens.extend(status_tokens)
tokens = np.array(tokens)

print 'Sample tokenized sentence:'
print tokens[0]
print

def generate_X_and_Y(sentences):
    """
    X = sub_sentences, Y = next_sub_sentences
    Y is simply X shifted over by step_size.
    We want sub sentences per sentence to create multiple samples.
    """
    sub_sentences = []
    next_sub_sentences = []
    
    sub_reactions = []
    
    for sentence_nb, sentence in enumerate(sentences):
        nb_sentence_reactions = reactions[sentence_nb]
        
        for i in range(0, len(sentence) - WINDOW_SIZE, STEP_SIZE):
            sub_reactions.append(nb_sentence_reactions)
            sub_sentences.append(sentence[i : i+WINDOW_SIZE])
            next_sub_sentences.append(sentence[(i+STEP_SIZE) : (i+STEP_SIZE)+WINDOW_SIZE])
    
    return sub_sentences, next_sub_sentences, sub_reactions

X, Y, Reacts = generate_X_and_Y(tokens)
nb_samples = len(X)
print 'Sample X:', X[0]
print 'Sample Y:', Y[0]
print 

def convert_tokens_to_int(X, Y, nb_samples):
    """
    Convert token to integer representations.
    
    Populate label matrices with ints,
    so result is padded and ready for training.
    """
    # IGNORE THIS COMMENT but DON'T REMOVE -> extra 1 represents space for the stop symbol
    input_shape = (nb_samples, MAX_LEN)
    X_labels = np.zeros(input_shape)
    y_labels = np.zeros(input_shape)

    for sample_nb in range(nb_samples):
        x_label = map(lambda x: tokens_indices[x], X[sample_nb]) 
        y_label = map(lambda x: tokens_indices[x], Y[sample_nb])

        X_labels[sample_nb][:len(x_label)] = x_label
        X_labels[sample_nb][len(x_label):] = PADDING_INDEX
        y_labels[sample_nb][:len(y_label)] = y_label
        y_labels[sample_nb][len(y_label):] = PADDING_INDEX
    
    return X_labels, y_labels

X_labels, y_labels = convert_tokens_to_int(X, Y, nb_samples)

print 'Sample X labels:\n', X_labels[:1]
print 'Sample Y labels:\n', y_labels[:1]
print '\nTOTAL NB OF TRAINING SAMPLES:', nb_samples

# split data into train and val sets
X_train_labels, X_val_labels, y_train_labels, y_val_labels = \
    train_test_split(X_labels, y_labels, test_size=0.3)
X_train_reacts, X_val_reacts = \
    Reacts[:len(X_train_labels)], Reacts[len(X_train_labels):]


Sample tokenized sentence:
['#', '3', '0', '5', '6', '4', ' ', 'S', 'o', 'm', 'e', 't', 'h', 'i', 'n', 'g', ' ', 's', 'o', 'm', 'e', 't', 'h', 'i', 'n', 'g', ' ', 'i', 't', "'", 's', ' ', 'w', 'h', 'a', 't', ' ', 'Y', 'O', 'U', ' ', 'm', 'a', 'k', 'e', ' ', 'o', 'f', ' ', 't', 'h', 'e', ' ', 'e', 'x', 'p', 'e', 'r', 'i', 'e', 'n', 'c', 'e', ',', ' ', 'm', 'a', 'y', 'b', 'e', ' ', 'y', 'o', 'u', ' ', 'j', 'u', 's', 't', ' ', 'g', 'r', 'e', 'w', ' ', 'a', 'n', 'd', ' ', 'l', 'e', 'a', 'r', 'n', 'e', 'd', ' ', 'm', 'o', 'r', 'e', ' ', 'f', 'r', 'o', 'm', ' ', 'i', 't', ' ', 't', 'h', 'a', 'n', ' ', 't', 'h', 'e', 'm', '<', 's', 't', 'a', 'h', 'p', '>']

Sample X: ['#', '3', '0', '5', '6', '4', ' ', 'S', 'o', 'm']
Sample Y: ['3', '0', '5', '6', '4', ' ', 'S', 'o', 'm', 'e']

Sample X labels:
[[  3.  19.  16.  21.  22.  20.   0.  53.  79.  77.  29.  29.  29.  29.
   29.  29.  29.  29.  29.  29.  29.  29.  29.  29.  29.  29.  29.  29.
   29.  29.  29.  29.  29.  29.  29.  29.  29.  29.  29. 

# BATCH GENERATOR

In [32]:
def validate_data(X, y):
    # assert at least 1 nonzero value in each OHE sample
    assert(False not in np.any(X>0, axis=1))
    assert(False not in np.any(y>0, axis=1))

def sampling_generator(epoch_size, batch_size, validation=False):  
    """
    Takes in labels of int values, e.g. [1. 2. 5. 9. 1. 0. 0. ...]
    These labels are already padded with 0s to a predetermined maxlen.
    We convert each sample X and y to one hot versions, and create batches.
    Batch generation is necessary to avoid MemoryError.
    """
    if validation:
        X_labels = X_val_labels
        y_labels = y_val_labels 
        react_labels = X_val_reacts
    else:
        X_labels = X_train_labels
        y_labels = y_train_labels
        react_labels = X_train_reacts
    
    while True:
        # hand off data in batches
        for i in range(int(epoch_size/batch_size)):
            start = i * batch_size
            end = min(start + batch_size, epoch_size)
            true_batch_size = end - start

            # fresh batch
            batch_X = []
            batch_Y = []
            batch_shape = (true_batch_size, MAX_LEN, NB_CLASSES)
            
            X_label_sample = X_labels[start:end]
            X_label_sample_reacts = react_labels[start:end]

            # one hot encode
            if EMBEDDING:
                batch_X = X_label_sample
            else:
                batch_X = to_categorical(X_label_sample, num_classes=NB_CLASSES).reshape(batch_shape)
            
            batch_Y = to_categorical(y_labels[start:end], num_classes=NB_CLASSES).reshape(batch_shape)                    
            
            total_nb_one_hots = X_label_sample.size
            sample_weights = np.ones((total_nb_one_hots, 1))
            
            temp_reshaped_Xs = np.ravel(X_label_sample).reshape(-1)
            padded_positions = np.where(temp_reshaped_Xs == PADDING_INDEX)
            sample_weights[padded_positions] = 0 # 28100 x 1
            sample_weights = sample_weights.reshape(true_batch_size, MAX_LEN)
            
            if WEIGHT_BY_REACTIONS:
                # turn into col vector
                react_weights = np.array(X_label_sample_reacts).reshape(-1,1)
                react_weights = (react_weights / max_nb_reactions) + 1
                
                sample_weights = np.multiply(sample_weights, react_weights)
            
            yield (batch_X, batch_Y, sample_weights)
            
            
# next(sampling_generator(200, 128, validation=False))

# MODEL ARCHITECTURE

In [25]:
def build_model():
    print('Building model...')
    model = Sequential()
    
    if EMBEDDING:
        model.add(Embedding(input_dim=NB_CLASSES, output_dim=64, input_length=MAX_LEN))
        model.add(Conv1D(CONV_NB_FILTERS, CONV_KERNEL_SIZE, padding='same', activation='linear', strides=1))
        model.add(Activation('relu'))
        next_inp_shape = (MAX_LEN, CONV_NB_FILTERS)
    else:
        next_inp_shape = (MAX_LEN, NB_CLASSES)        
    
    if LSTM_MODEL:
        model.add(LSTM(512, return_sequences=True, input_shape=next_inp_shape))
        model.add(Dropout(0.2))
        model.add(LSTM(512, return_sequences=True))
        model.add(TimeDistributed(Dense(NB_CLASSES)))
    else:
        model.add(GRU(512, return_sequences=True, input_shape=next_inp_shape))
        model.add(Dropout(0.2))
        model.add(GRU(512, return_sequences=True))
        model.add(TimeDistributed(Dense(NB_CLASSES)))

    model.add(Activation('softmax'))
    
    # to apply sample weights to metrics, specify weighted_metrics=[list of metrics]
    model.compile(loss='categorical_crossentropy', optimizer=Adam(clipnorm=1.0), sample_weight_mode="temporal") 
    model.summary()
    return model

model = build_model()

Building model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 281, 512)          1243136   
_________________________________________________________________
dropout_1 (Dropout)          (None, 281, 512)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 281, 512)          2099200   
_________________________________________________________________
time_distributed_1 (TimeDist (None, 281, 94)           48222     
_________________________________________________________________
activation_1 (Activation)    (None, 281, 94)           0         
Total params: 3,390,558
Trainable params: 3,390,558
Non-trainable params: 0
_________________________________________________________________


# CALLBACKS (bells + whistles)

In [22]:
class PlotHistory(Callback):
    def __init__(self, path, run_name):
        self.path = path
        self.run_name = run_name 

    def on_train_begin(self, logs=None):
        self.epoch = []
        self.history = {}

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        self.epoch.append(epoch)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        # create loss and perplexity plot
        loss_handles = []
        train_perp = []
        val_perp = []
        for key in self.history:           
            if key != "lr":
                l, = plt.plot(self.history[key], label=key)
                loss_handles.append(l)

        plt.title('Losses and metrics for {}'.format(self.run_name))    
        plt.ylabel('loss')
        plt.yscale('symlog')
        plt.legend(["Train Loss","Val Loss"], fontsize=8, loc='upper right')          
        plt.savefig('{}_plot.jpg'.format(self.path+self.run_name))        
        plt.clf()
    
checkpointer = ModelCheckpoint(filepath='{}.hdf5'.format(PATH+MODEL_NAME), verbose=1, save_best_only=True)
csv_logger = CSVLogger('{}.log'.format(PATH+MODEL_NAME))
early_stopping = EarlyStopping(monitor='val_loss', patience=5, min_delta=0.001, verbose=1)
plot_history = PlotHistory(PATH, MODEL_NAME)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1, epsilon=1e-4, min_lr=1E-6) 

# PERPLEXITY

In [23]:
def plotPerplexity(path, name):
    f = open(path+name+'.log','r')
    epoch_X = []
    train_perplex = []
    val_perplex = []

    for line in f.readlines()[1:]:
        data = line.strip().split(",")
        epoch_X.append(int(data[0]))
        train_perplex.append(2**(float(data[1])))
        val_perplex.append(2**(float(data[3])))

    fig = plt.figure()
    ax = plt.axes()

    ax.plot(epoch_X, train_perplex)
    ax.plot(epoch_X, val_perplex)
    plt.legend(['Train perplexity', 'Val perplexity'], fontsize=8, loc='upper right')
    plt.title('Perplexities for {}'.format(name))
    plt.savefig('{}_perplexity.jpg'.format(path+name))
    plt.clf()

# TRAIN MODEL

In [28]:
# True for print statements to show in terminal instead
PRINT_TRAIN_PROGRESS_TO_TERMINAL = True 
DEBUG_MODE = False

if PRINT_TRAIN_PROGRESS_TO_TERMINAL:
    reload(sys)    

if DEBUG_MODE:
    small_samples = 200
    nb_train_samples = small_samples
    nb_val_samples = small_samples
    batch_size = 128
    NB_EPOCHS = 2
else:
    nb_train_samples = len(X_train_labels)
    nb_val_samples = len(X_val_labels)
    batch_size = 128
    NB_EPOCHS = 20

print 'Total training samples:', nb_train_samples
print 'Total val samples:', nb_val_samples

In [30]:
def train(model):
    # shuffling of training data True by default
    print 'Training...'
    history = model.fit_generator(
        sampling_generator(nb_train_samples, batch_size), 
        steps_per_epoch=nb_train_samples/batch_size,
        epochs=NB_EPOCHS,
        verbose=1,
        validation_data=sampling_generator(nb_val_samples, batch_size, validation=True),
        validation_steps=nb_val_samples/batch_size,
        callbacks=[early_stopping, reduce_lr, csv_logger, checkpointer, plot_history])

    hist = history.history
    plotPerplexity(PATH, MODEL_NAME)
#     print 'Loss:', hist['loss'][0], 'and val loss:', hist['val_loss'][0]
    print '\nFull training history:\n', hist

In [None]:
train(model)

# LOAD MODEL

In [26]:
model_path = MODEL_NAME+'.hdf5'#'/Users/goldenzenith/Dropbox\ \(MIT\)/6.867_saved_models/GRU_wordword/gru_512_wordword.hdf5'
model = load_model(model_path)
print model

<keras.models.Sequential object at 0x119f6d050>


# TEXT GENERATION

In [34]:
def convert_sentence_to_ohe(sentence):
    x_label = map(lambda x: tokens_indices[x], sentence)
    confession = np.zeros(MAX_LEN)
    confession[:len(x_label)] = x_label
    if EMBEDDING:
        return np.reshape(confession,(1,-1))
    else:
        ohe_x = to_categorical(confession, num_classes=NB_CLASSES)
        return np.expand_dims(ohe_x, axis=0)

from scipy.misc import logsumexp

def log_softmax(vec):
    return vec - logsumexp(vec)

def softmax(vec):
    return np.exp(log_softmax(vec))

def generate_confession(model, seed_string, seed_list, temperature, sample, thresh, confidence):
    # nb chars to preserve
    if CHAR_BY_CHAR:
        orig_len = len(seed_string) 
    else:
        orig_len = len(seed_list) 
        
    window_str = seed_string
    window_list = seed_list
    final_str = seed_string 
        
    for unit_nb in range(orig_len, MAX_LEN):
        if CHAR_BY_CHAR:
            x = convert_sentence_to_ohe(window_str)
        else:
#             print "list here is", window_list
            x = convert_sentence_to_ohe(window_list)
    
        if sample:
            # get next char
            next_char_idx = min(WINDOW_SIZE, unit_nb) - 1
        
            # helper function to sample an index from a probability array
            # indexing into 3D matrix -- get row of probs for single char at next_char_idx
            preds = model.predict(x)[0, next_char_idx, :]
            preds = np.asarray(preds).astype('float64')
            preds = np.log(preds) / temperature
            exp_preds = np.exp(preds)
            preds = exp_preds / np.sum(exp_preds)
            probas = np.random.multinomial(1, preds, 1)
            next_token = np.argmax(probas)
        
            maxConf = np.max(preds)
#             print "max conf",maxConf
            if maxConf > thresh:
                confidence += 1
        else:
            preds = np.squeeze(model.predict(x),axis=0) # otherwise wrapped in (1,maxlen+1,len(chars))
            best_tokens = np.argmax(preds, axis=1)
#             print preds
#             print best_tokens
            
#             y = preds[0,:]
#             plt.plot(y.tolist())
#             plt.savefig("testttt.jpg")
#             plt.clf()

            if unit_nb >= WINDOW_SIZE:
                maxConf = np.max(preds[WINDOW_SIZE-1])
            else:
                maxConf = np.max(preds[unit_nb-1])
                
            if maxConf > thresh:
                confidence += 1
            
            # unit_nb-1 because we want prev val in y matrix (best_tokens). In training we treat y as a shifted 
            # version of x hence offset -1 here
            if unit_nb >= WINDOW_SIZE:
                next_token = best_tokens[WINDOW_SIZE-1]
            else:
                next_token = best_tokens[unit_nb-1]
                
        # stop symbol
#         if next_token == STOP_INDEX:
        if "<stahp>" in window_str:
            print "we here"
            if CHAR_BY_CHAR:
                confession_list = final_str
            else:
                confession_list = final_str.split()
            print "confidence here",confidence 
            print "len of confession here",len(confession_list)
            return final_str,confidence/float(len(confession_list))
        
        next_char = indices_tokens[next_token]
        
        if CHAR_BY_CHAR:
            if len(window_str) == WINDOW_SIZE:
                moveConf = window_str[1:] + next_char
                window_str = moveConf
            else:
                window_str += next_char
            final_str += next_char
        else:
            window_list.append(next_char)
            if len(window_list)-1 == WINDOW_SIZE:
                moveConf = window_list[1:]
                window_list = moveConf

            if window_str != '#':
                window_str += ' '+next_char
                final_str += ' '+next_char
            else:
                window_str += next_char
                final_str += next_char
            
        print "final str:",final_str
        
#     print "\n this",final_str,confidence
                
    if CHAR_BY_CHAR:
        confession_list = final_str
    else:
        confession_list = final_str.split()
    print "confidence there",confidence
    print "len of confession there",len(confession_list)
    return final_str,confidence/float(len(confession_list))


# Run and place into .txt file
ARG_MAX = False
if ARG_MAX:
    filename = MODEL_NAME+'_preds.txt'
    sample_bool = False
else:
    filename = MODEL_NAME+'_temp_0_5_preds.txt'
    sample_bool = True
    
# seed_strings_cc=["#", "#1", "#12", "#124", "#1246", "#12465", "#9999", 
#              "#1 Hi", "#12 Hi", "#124 Hi", "#1246 Hi", "#1293 Hi m",
#              "#2568 ML", "#3476 Hixz", "#1321 I'm", "#2346 :)", "#5876 7:30am",
#              "#9235 6.01", "#1246 6.867"]
seed_strings_cc=["#", "#3476", "#3476 i",
             "#3476 i'm", "#3476 :)", 
             "#3476 7:30am",
             "#3476 6.01"]
seed_strings_ww=["# ", "# <number>", "# <number> i",
             "# <number> i ' m", "# <number> :)", 
             "# <number> <number> :3 <number> am",
             "# <number> 6.01"]
conf_list = []
confidence = 0

#temps = 0.2, 0.5, 0.1 #and argmax

with open(filename, 'w') as f:
    print "Writing to",filename
    if CHAR_BY_CHAR:
        seed_list = seed_strings_cc
    else:
        seed_list = seed_strings_ww
        
    for seed_string in seed_list:
        print "seed_string:",seed_string
        seed_list = seed_string.split()
        #set to true for temperature sampling version
        confession, confidence = generate_confession(model, seed_string, seed_list, 0.5, sample_bool, 0.8, 0) 
        print "confession:",confession
        print "confidence:", confidence
        conf_list.append(confidence)
        
#         if len(conf_list) >1:
#             break
        f.write(confession)
        f.write("\n")
        
    avg_conf = sum(conf_list)/len(conf_list)
    f.write("confidence list: "+ str(conf_list))
    f.write("\n")
    f.write("average confidence: "+ str(avg_conf))
        
print "confidence for test set is:",conf_list

Writing to lstm_512_charchar_reacts_temp_0_5_preds.txt
seed_string: #
final str: #1
final str: #12
final str: #121
final str: #1215
final str: #1215 
final str: #1215 I
final str: #1215 I 
final str: #1215 I n
final str: #1215 I ne
final str: #1215 I nee
final str: #1215 I need
final str: #1215 I need 
final str: #1215 I need t
final str: #1215 I need th
final str: #1215 I need the
final str: #1215 I need them
final str: #1215 I need them 
final str: #1215 I need them o
final str: #1215 I need them or
final str: #1215 I need them or 
final str: #1215 I need them or d
final str: #1215 I need them or do
final str: #1215 I need them or do 
final str: #1215 I need them or do y
final str: #1215 I need them or do yo
final str: #1215 I need them or do you
final str: #1215 I need them or do you 
final str: #1215 I need them or do you k
final str: #1215 I need them or do you kn
final str: #1215 I need them or do you kno
final str: #1215 I need them or do you know
final str: #1215 I need them or

final str: #1215 I need them or do you know you're saying that my parents will my secret fetish is not you are interesting.<stah
final str: #1215 I need them or do you know you're saying that my parents will my secret fetish is not you are interesting.<stahp
final str: #1215 I need them or do you know you're saying that my parents will my secret fetish is not you are interesting.<stahp>
we here
confidence here 79
len of confession here 119
confession: #1215 I need them or do you know you're saying that my parents will my secret fetish is not you are interesting.<stahp>
confidence: 0.663865546218
seed_string: #3476
final str: #3476 
final str: #3476 T
final str: #3476 Th
final str: #3476 The
final str: #3476 The 
final str: #3476 The s
final str: #3476 The se
final str: #3476 The sec
final str: #3476 The seco
final str: #3476 The secon
final str: #3476 The second
final str: #3476 The second 
final str: #3476 The second o
final str: #3476 The second of
final str: #3476 The second of 
fin

final str: #3476 i would also talk to about the convenience and some Chinese person who you are interesting.<sta
final str: #3476 i would also talk to about the convenience and some Chinese person who you are interesting.<stah
final str: #3476 i would also talk to about the convenience and some Chinese person who you are interesting.<stahp
final str: #3476 i would also talk to about the convenience and some Chinese person who you are interesting.<stahp>
we here
confidence here 71
len of confession here 104
confession: #3476 i would also talk to about the convenience and some Chinese person who you are interesting.<stahp>
confidence: 0.682692307692
seed_string: #3476 i'm
final str: #3476 i'm 
final str: #3476 i'm s
final str: #3476 i'm st
final str: #3476 i'm sti
final str: #3476 i'm stil
final str: #3476 i'm still
final str: #3476 i'm still 
final str: #3476 i'm still a
final str: #3476 i'm still af
final str: #3476 i'm still afr
final str: #3476 i'm still afra
final str: #3476 i'm sti

final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It heso
final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesol
final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesola
final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesolat
final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesolati
final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesolatio
final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesolation
final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesolation,
final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesolation, 
f

final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesolation, bullying is overhyped for progress and have sex with 
final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesolation, bullying is overhyped for progress and have sex with m
final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesolation, bullying is overhyped for progress and have sex with my
final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesolation, bullying is overhyped for progress and have sex with my 
final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesolation, bullying is overhyped for progress and have sex with my U
final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesolation, bullying is overhyped for progress a

final str: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesolation, bullying is overhyped for progress and have sex with my UROP data in Excel wind a good oce<stahp>
we here
confidence here 153
len of confession here 200
confession: #3476 :) I told my source of shallow and see if you want to know what I'm done with me. It hesolation, bullying is overhyped for progress and have sex with my UROP data in Excel wind a good oce<stahp>
confidence: 0.765
seed_string: #3476 7:30am
final str: #3476 7:30amp
final str: #3476 7:30ampp
final str: #3476 7:30amppp
final str: #3476 7:30ampppp
final str: #3476 7:30amppppp
final str: #3476 7:30ampppppp
final str: #3476 7:30amppppppp
final str: #3476 7:30ampppppppp
final str: #3476 7:30amppppppppp
final str: #3476 7:30ampppppppppp
final str: #3476 7:30amppppppppppp
final str: #3476 7:30ampppppppppppp
final str: #3476 7:30amppppppppppppp
final str: #3476 7:30ampppppppppppppp
final str: #3476 7:30ampppppppppppp

final str: #3476 7:30amppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30amppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30amppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30amppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30amppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp

final str: #3476 7:30amppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30amppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30amppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp

final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp1ppppppppppppppppppppppppppppppp
final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp1pppppppppppppppppppppppppppppppp
final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp1ppppppppppppppppppppppppppppppppp
final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp1pppppppppppppppppppppppppppppppppp
final str: #3476 7:30amppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp

final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp1pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp1ppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp1pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp1pppppppppppppppppppppppppppppppppppp

final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp1ppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp1pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp1ppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
final str: #3476 7:30ampppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp

final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusett
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts 
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts I
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts In
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Ind
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Indr
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Indry
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Indry 
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Ind

final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Indry to make this cruise I didn't even started leaving a good 
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Indry to make this cruise I didn't even started leaving a good t
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Indry to make this cruise I didn't even started leaving a good th
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Indry to make this cruise I didn't even started leaving a good thi
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Indry to make this cruise I didn't even started leaving a good thin
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Indry to make this cruise I didn't even started leaving a good thing
final str: #3476 6.

final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Indry to make this cruise I didn't even started leaving a good thing that make sure to talk to him study bre
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Indry to make this cruise I didn't even started leaving a good thing that make sure to talk to him study brea
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Indry to make this cruise I didn't even started leaving a good thing that make sure to talk to him study break
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Indry to make this cruise I didn't even started leaving a good thing that make sure to talk to him study break 
final str: #3476 6.01 sucks so there are so much here of my friends if you see Ole Massachusetts Indry to make this cruise I didn't even started leaving a good thing that