In [1]:
from theano import function, config, shared, tensor
import numpy
import time

vlen = 10 * 30 * 768  # 10 x #cores x # threads per core
iters = 1000

rng = numpy.random.RandomState(22)
x = shared(numpy.asarray(rng.rand(vlen), config.floatX))
f = function([], tensor.exp(x))
print(f.maker.fgraph.toposort())
t0 = time.time()
for i in range(iters):
    r = f()
t1 = time.time()
print("Looping %d times took %f seconds" % (iters, t1 - t0))
print("Result is %s" % (r,))
if numpy.any([isinstance(x.op, tensor.Elemwise) and
              ('Gpu' not in type(x.op).__name__)
              for x in f.maker.fgraph.toposort()]):
    print('Used the cpu')
else:
    print('Used the gpu')

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GRID K520 (CNMeM is disabled, cuDNN not available)


[GpuElemwise{exp,no_inplace}(<CudaNdarrayType(float32, vector)>), HostFromGpu(GpuElemwise{exp,no_inplace}.0)]
Looping 1000 times took 0.682529 seconds
Result is [ 1.23178029  1.61879349  1.52278066 ...,  2.20771813  2.29967761
  1.62323296]
Used the gpu


In [38]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, LSTM, TimeDistributed,Masking 
from time import sleep
import csv
import numpy as np


model_name = 'lstm_512_test_2'
MAX_LEN = 280
WINDOW_SIZE =10

In [39]:
# change to False if you want to train on datasets 
# where we don't use CUSTOM_NAME token for tagged Facebook names
custom_names = True

if custom_names:
    comments_filepath = "csv_data/CUSTOM_NAMES/custom_name_token_comments.csv"
else:
    comments_filepath = "csv_data/FILTERED_NAMES/filtered_names_comments.csv"

status_filepath = "csv_data/beaverconfessions_facebook_statuses.csv"
        
stop_symbol = "`"

def load_dataset(filepath, dictionary, datatype):
    dt = "{}_message".format(datatype)
    
    with open(filepath, "rU") as csvfile:
        reader = csv.DictReader(csvfile)
        sentences = []
        
        for status in reader:
            if dt not in status:
                # 2 rows are being read at a time for the statuses csv for some reason...
                # handled with monkey patching below
                msg1, msg2 = status.items()[1]

                for char in msg1:
                    dictionary.add(char)
                for char in msg2:
                    dictionary.add(char)
                
                # add stop symbol to end before text->int conversion
                msg1 += stop_symbol
                msg2 += stop_symbol
                sentences.append(msg1)
                sentences.append(msg2)
            else:
                msg = status[dt]
                for char in msg:
                    dictionary.add(char)
                msg += stop_symbol
                sentences.append(msg)
                    
    return dictionary, sentences
    
dictionary, comments = load_dataset(comments_filepath, set([]), "comment")
dictionary, statuses = load_dataset(status_filepath, dictionary, "status")
sentences = []
sentences.extend(comments)
sentences.extend(statuses)
sentences_arr = np.array(sentences)

# shuffle statuses and comments
random_permutation = np.random.permutation(len(sentences))
sentences_arr = sentences_arr[random_permutation]

print 'Sample sentences:'
print '-', sentences_arr[0]
print '-', sentences_arr[1]
print

# create vocabulary of characters found in data
chars = sorted(list(dictionary))
padding_symbol = "{"
chars.insert(1, stop_symbol) # index 1
# chars.insert(0, padding_symbol) # index 0 
# print chars

# print('total chars:', len(chars))
char_indices = dict((c, i+1) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
nb_classes = len(char_indices)
print char_indices


# TODO separate model_related values and preprocessing code

def convert_sentences_to_subsentences(sentences, step_size=1):
    """
    X = sub_sentences, Y = next_sub_sentences
    Y is simply X shifted over by step_size.
    We want sub sentences per sentence to create multiple samples.
    """
    sub_sentences = []
    next_sub_sentences = []
    
    for sentence in sentences:
        for i in range(0, len(sentence) - WINDOW_SIZE, step_size):
            sub_sentences.append(sentence[i : i+WINDOW_SIZE])
            next_sub_sentences.append(sentence[(i+1) : (i+1)+WINDOW_SIZE])
    
    return sub_sentences, next_sub_sentences

sub_sentences, next_sub_sentences = convert_sentences_to_subsentences(sentences)
nb_samples = len(sub_sentences)

# any values not filled in later represent padding 
# extra 1 represents space for the stop symbol
X_labels = np.zeros((nb_samples, MAX_LEN+1))
y_labels = np.zeros((nb_samples, MAX_LEN+1))

for sample_nb in range(nb_samples):
    """
    We tokenize each sample (go from vocab to indices in vocab).
    We populate the zero-filled label matrices from above with the tokens,
        such that the end result are the tokenized, padded samples.
    """
    x_label = map(lambda x: char_indices[x], sub_sentences[sample_nb]) 
    y_label = map(lambda x: char_indices[x], next_sub_sentences[sample_nb])

    X_labels[sample_nb][:len(x_label)] = x_label
    y_labels[sample_nb][:len(y_label)] = y_label

# print X_labels[:1], y_labels[:1]
 
print('# training samples:', nb_samples)

Sample sentences:
- #1481 I walk on the second floor of the infinite to avoid the UGC beggers.`
- #3062 I'm a 5'2" guy, which has given me crippling social anxiety. Why should I even try living anymore? There's just no hope.`

{' ': 0, '$': 5, '(': 9, ',': 13, '0': 17, '4': 21, '8': 25, '<': 29, '@': 33, 'D': 37, 'H': 41, 'L': 45, 'P': 49, 'T': 53, 'X': 57, '`': 1, 'd': 67, 'h': 71, 'l': 75, 'p': 79, 't': 83, 'x': 87, '|': 90, '#': 4, "'": 8, '+': 12, '/': 16, '3': 20, '7': 24, ';': 28, '?': 32, 'C': 36, 'G': 40, 'K': 44, 'O': 48, 'S': 52, 'W': 56, '[': 60, '_': 63, 'c': 66, 'g': 70, 'k': 74, 'o': 78, 's': 82, 'w': 86, '"': 3, '&': 7, '*': 11, '.': 15, '2': 19, '6': 23, ':': 27, '>': 31, 'B': 35, 'F': 39, 'J': 43, 'N': 47, 'R': 51, 'V': 55, 'Z': 59, '^': 62, 'b': 65, 'f': 69, 'j': 73, 'n': 77, 'r': 81, 'v': 85, 'z': 89, '~': 92, '!': 2, '%': 6, ')': 10, '-': 14, '1': 18, '5': 22, '9': 26, '=': 30, 'A': 34, 'E': 38, 'I': 42, 'M': 46, 'Q': 50, 'U': 54, 'Y': 58, ']': 61, 'a': 64, 'e': 68,

In [36]:
from keras.models import load_model
model = load_model('{}.hdf5'.format(model_name))

In [37]:
def convert_sentence_to_ohe(sentence):
    x_label = map(lambda x: char_indices[x], sentence)
    confession = np.zeros(MAX_LEN+1)
    confession[:len(x_label)] = x_label
    ohe_x = to_categorical(confession, num_classes=nb_classes)
    return np.expand_dims(ohe_x, axis=0)

def generate_confession(model, seed_string):
    # nb chars to preserve
    orig_len = len(seed_string) 
    window_str = seed_string
    final_str = seed_string
    
    for char_nb in range(orig_len, MAX_LEN):
        x = convert_sentence_to_ohe(window_str)
        
        # get next char
        preds = model.predict(x)[0] # otherwise wrapped in (1,maxlen+1,len(chars))
        best_tokens = np.argmax(preds, axis=1)
        print "best_tokens:",best_tokens
        
        word = ""
        for j in best_tokens:
            word += indices_char[j]
        print "predicted Y:",word
        
        # char_nb-1 because we want prev val in y matrix (best_tokens). In training we treat y as a shifted 
            # version of x hence offset -1 here
        if char_nb >= 10:
            next_token = best_tokens[9]
        else:
            next_token = best_tokens[char_nb-1] 
        
        print "next_token:",next_token,"is ",indices_char[next_token]

        # stop symbol
        if next_token == 1:
            break
            
        next_char = indices_char[next_token]
        print 'current string:',window_str
    
        if len(window_str) == WINDOW_SIZE:
            moveConf = window_str[1:] + next_char
            window_str = moveConf
            print "\n new window_str:",window_str
        else:
            window_str += next_char
            
        final_str += next_char
        
        
        print "final str:",final_str
        
                
    return final_str


seed_string="#"
print generate_confession(model, seed_string)

best_tokens: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
predicted Y: `````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````
next_token: 0 is  `
current string: #
final str: #`
best_tokens: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

best_tokens: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
predicted Y: `````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````
next_token: 0 is  `
current string: #`````````

 new window_str: ``````````
final str: #``````````
best_tokens: [0 0 0 0 

best_tokens: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
predicted Y: `````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````
next_token: 0 is  `
current string: ``````````

 new window_str: ``````````
final str: #```````````````````
best_tokens: 

KeyboardInterrupt: 