In [1]:
import re
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
import keras
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, TimeDistributed, LSTM, CuDNNLSTM
from keras.optimizers import RMSprop
import os
import time

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def text_cleaner(row):
    word_list = [word for word in row.split(' ') if (len(word)>0) and (word[0] != '@') and (word[:4] != 'http') and (word != 'rt')]
    return ' '.join(word_list)

def create_index_char_map(corpus):
    chars = sorted(list(set(corpus)))
    global N_CHARS
    N_CHARS = len(chars)
    char_to_idx = {c: i for i, c in enumerate(chars)}
    idx_to_char = {i: c for i, c in enumerate(chars)}
    return chars, char_to_idx, idx_to_char

def create_sequences(corpus):
    sequences, next_chars = [], []
    for i in range(0, CORPUS_LENGTH - MAX_SEQ_LENGTH, SEQ_STEP):
        sequences.append(corpus[i:i + MAX_SEQ_LENGTH])
        next_chars.append(corpus[i + MAX_SEQ_LENGTH])
    global N_SEQS
    N_SEQS = len(sequences)
    return np.array(sequences), np.array(next_chars)

def one_hot_encode(sequences, next_chars, char_to_idx):
    X = np.zeros((N_SEQS, MAX_SEQ_LENGTH, N_CHARS), dtype=np.bool)
    y = np.zeros((N_SEQS, N_CHARS), dtype=np.bool)
    for i, sequence in enumerate(sequences):
        for t, char in enumerate(sequence):
            X[i, t, char_to_idx[char]] = 1
    y[i, char_to_idx[next_chars[i]]] = 1
    return X, y

def load_data(data, seq_length):
    chars = list(set(data))
    VOCAB_SIZE = len(chars)

    print('Data length: {} characters'.format(len(data)))
    print('Vocabulary size: {} characters'.format(VOCAB_SIZE))

    ix_to_char = {ix:char for ix, char in enumerate(chars)}
    char_to_ix = {char:ix for ix, char in enumerate(chars)}

    X = np.zeros((int(len(data)/seq_length), seq_length, VOCAB_SIZE))
    y = np.zeros((int(len(data)/seq_length), seq_length, VOCAB_SIZE))
    for i in range(0, int(len(data)/seq_length)):
        X_sequence = data[i*seq_length:(i+1)*seq_length]
        X_sequence_ix = [char_to_ix[value] for value in X_sequence]
        input_sequence = np.zeros((seq_length, VOCAB_SIZE))
        for j in range(seq_length):
            input_sequence[j][X_sequence_ix[j]] = 1.
            X[i] = input_sequence

        y_sequence = data[i*seq_length+1:(i+1)*seq_length+1]
        y_sequence_ix = [char_to_ix[value] for value in y_sequence]
        target_sequence = np.zeros((seq_length, VOCAB_SIZE))
        for j in range(seq_length):
            target_sequence[j][y_sequence_ix[j]] = 1.
            y[i] = target_sequence
    return X, y, VOCAB_SIZE, ix_to_char

In [3]:
df = pd.read_csv('data/marccarig_tweets.csv').dropna(subset=['text'])

In [5]:
df.head(5)

Unnamed: 0,favorite_count,source,text,in_reply_to_screen_name,is_retweet,created_at,retweet_count,id_str
1,0.0,TweetDeck,Shadows are going to make this game interestin...,,False,Sun Oct 02 19:21:28 +0000 2011,0.0,1.21e+17
3,0.0,Twitter Web Client,Markakis almost smokes Vazquez in the dugout. ...,,False,Wed Apr 28 00:59:08 +0000 2010,0.0,12976760000.0
5,0.0,Twitter Web Client,If Rivera indeed gets a chance at save No. 500...,,False,Mon Jun 29 02:56:18 +0000 2009,0.0,2381677000.0
7,0.0,Echofon,@jeffzrebiecsun they are scrumptious,jeffzrebiecsun,False,Tue Mar 19 14:07:11 +0000 2013,0.0,3.14e+17
9,0.0,Twitter Web Client,RT @dp57: At halloween store yesterday with my...,,False,Tue Oct 13 11:49:29 +0000 2009,0.0,4833040000.0


In [6]:
text = df['text'].str.lower()
text = text.loc[~text.str.startswith('rt @')]
text = text.apply(text_cleaner)
text = text.sample(25000)

In [8]:
text.head(5)

87379     you guys are all over the place on nova predic...
68431     talked to marlon byrd about steroids this morn...
136227    unlike my portfolio, i find this to be strong ...
42729     yankees notebook: andy pettitte feeling fine a...
145049    the yankees this week: home runs help team get...
Name: text, dtype: object

In [9]:
things_to_take_out = ['\x80','\x9c','\xa0', '¡','®','°','·','à','á','â','è','é', '\'', '\r', '\n', '\t',
                      'ê','í','ñ','ó','̶','\u200d','\u200f','–','—','‘','’','“','”','…','‼','⁉','≤','、',
                      'い','う','き','し','ち','っ','て','に','の','は','ょ','る','を','ウ','ク','サ','ス',
                      'ビ','ブ','ベ','ロ','ン','ー','取','才','断','決','私','能','️','🇦','🇨','🇮','🇰','🇳','🇵','🇷','🇸',
                      '🇹','🇺','\U0001f951','\U0001f985','\U0001f9e0', '\\']

corpus = [tweet for tweet in text.values]
corpus = ' '.join(corpus)

for char in things_to_take_out:
    corpus = corpus.replace(char, '')
    
corpus = corpus.replace('--', '-')

In [12]:
print(len(corpus))
print(corpus[:100])

1610509
you guys are all over the place on nova predictions. talked to marlon byrd about steroids this morni


In [23]:
list(set(corpus))

['/',
 '9',
 '🎂',
 'p',
 '%',
 '🌴',
 'm',
 'j',
 '_',
 '=',
 '8',
 'z',
 '😍',
 'l',
 ')',
 '🏡',
 '🎁',
 '🐮',
 '|',
 '&',
 'q',
 '-',
 '😎',
 '💯',
 'c',
 '!',
 'y',
 ';',
 '🎈',
 '♥',
 '🏕',
 '#',
 '❤',
 '~',
 '@',
 'g',
 ':',
 'o',
 '6',
 '🗽',
 '1',
 's',
 'w',
 ']',
 '🎣',
 '☕',
 'k',
 '🐟',
 '+',
 '🍴',
 '[',
 '(',
 '3',
 '😂',
 'a',
 '😢',
 'e',
 '^',
 '😄',
 'd',
 '👀',
 'b',
 '$',
 '💔',
 '5',
 't',
 '😀',
 '🎉',
 '😬',
 '🍺',
 '?',
 '😳',
 '4',
 '7',
 'x',
 '🙌',
 'f',
 '2',
 ' ',
 '🤔',
 '"',
 '.',
 'n',
 '*',
 '🐷',
 '0',
 '😥',
 '😒',
 '😱',
 '⛳',
 '☀',
 ',',
 '💀',
 'i',
 '😡',
 '🏞',
 '`',
 '✨',
 'h',
 'v',
 'r',
 '⚾',
 '😭',
 'u']

In [13]:
X, y, VOCAB_SIZE, ix_to_char = load_data(corpus, 85)

Data length: 1610509 characters
Vocabulary size: 105 characters


In [15]:
with open('models/idx_to_char', 'wb') as fp:
    pickle.dump(ix_to_char, fp)

In [16]:
def build_model(HIDDEN_DIM=512, dropout=0.3, LAYER_NUM=3):
    model = Sequential()
    model.add(LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True))
    model.add(Dropout(dropout))
    for i in range(LAYER_NUM - 1):
        model.add(LSTM(HIDDEN_DIM, return_sequences=True))
        model.add(Dropout(dropout))
    model.add(TimeDistributed(Dense(VOCAB_SIZE)))
    model.add(Activation('softmax'))
    model.compile(loss="categorical_crossentropy", optimizer="rmsprop")
    return model

model = build_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, None, 512)         1265664   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 512)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 512)         2099200   
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 512)         0         
_________________________________________________________________
lstm_3 (LSTM)                (None, None, 512)         2099200   
_________________________________________________________________
dropout_3 (Dropout)          (None, None, 512)         0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 105)         53865     
__________

In [None]:
def train_model(model, X, y, batch_size=128, nb_epoch=5000, verbose=0):
    checkpointer = ModelCheckpoint(filepath="models/v01weights_{epoch:02d}.hdf5", monitor='loss', verbose=verbose, save_best_only=True, mode='min')
    model.fit(X, y, batch_size=batch_size, epochs=nb_epoch, verbose=verbose, callbacks=[checkpointer])
    
train_model(model, X, y, verbose=1)