In [70]:
import keras
from keras.models import Sequential, Model
from keras.layers import *
from collections import Counter
import numpy as np
import pandas

In [71]:
# Initial vocabulary size. Note that `vocab_size` is reassigned from
# `preproc.vocab_size()` in a later cell, before the model is constructed.
vocab_size = 5000

In [78]:
class RNNTextModel:
    """Embedding + LSTM text encoder followed by a sigmoid Dense layer.

    The constructor builds and compiles a Keras functional ``Model`` whose
    single input is a fixed-length sequence of ``int32`` values. The model is
    compiled with the ``rmsprop`` optimizer, ``binary_crossentropy`` loss and
    an ``accuracy`` metric.

    Parameters
    ----------
    input_vocab_size : int
        First argument handed to the ``Embedding`` layer.
        NOTE(review): Keras documents this as "maximum integer index + 1";
        confirm the caller accounts for any reserved index.
    embedding_dim : int
        Width of each embedding vector.
    hidden_dim : int
        Number of LSTM units.
    sequence_length : int, optional
        Length of each input sequence. Defaults to 140, the value that was
        previously hard-coded.
    output_dim : int or None, optional
        Number of units in the final sigmoid ``Dense`` layer. ``None`` (the
        default) keeps the historical behaviour of using ``hidden_dim``.
        NOTE(review): for a single binary label per text this presumably
        should be 1 -- confirm against the label shape.

    Attributes
    ----------
    text_input, text_model, encoded_text, output, model
        The intermediate Keras objects, kept for inspection.
    """

    def __init__(self, input_vocab_size, embedding_dim, hidden_dim,
                 sequence_length=140, output_dim=None):
        self.input_vocab_size = input_vocab_size

        # Preserve the original layer width unless the caller overrides it.
        if output_dim is None:
            output_dim = hidden_dim

        self.text_input = Input(shape=(sequence_length,), dtype='int32')

        # Use the public layer classes rather than the `embeddings` /
        # `recurrent` submodules, which are not exposed by every Keras release.
        self.text_model = Sequential()
        self.text_model.add(Embedding(input_vocab_size, embedding_dim))
        self.text_model.add(LSTM(hidden_dim, input_shape=(None, embedding_dim)))

        self.encoded_text = self.text_model(self.text_input)

        self.output = Dense(output_dim, activation='sigmoid')(self.encoded_text)

        self.model = Model(inputs=[self.text_input], outputs=self.output)
        self.model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

    def fit(self, texts, labels, **kwargs):
        """Train the wrapped model and return whatever ``Model.fit`` returns.

        Parameters
        ----------
        texts
            Input passed to the model as its single input (wrapped in a list).
        labels
            Training targets.
        **kwargs
            Forwarded unchanged to ``Model.fit`` (e.g. ``epochs``,
            ``batch_size``).
        """
        # Expand the keyword arguments; passing the dict itself would bind it
        # to the third positional parameter of `Model.fit`.
        return self.model.fit([texts], labels, **kwargs)

In [73]:
class Preprocessor:
    """Thin wrapper around a Keras ``Tokenizer``.

    Calling an instance fits the tokenizer on the given texts and returns
    the result of ``texts_to_matrix`` for those same texts.
    """

    def __init__(self, num_words=None, lower=True, char_level=False):
        """Create the underlying tokenizer.

        ``num_words``, ``lower`` and ``char_level`` are forwarded as-is; an
        empty ``filters`` string is always passed (presumably so that no
        characters are stripped -- confirm against the Tokenizer docs).
        """
        tokenizer_options = dict(
            num_words=num_words,
            filters='',
            lower=lower,
            char_level=char_level,
        )
        self.tokenizer = keras.preprocessing.text.Tokenizer(**tokenizer_options)

    def __call__(self, texts):
        """Fit the tokenizer on ``texts`` and return their matrix encoding."""
        tok = self.tokenizer
        tok.fit_on_texts(texts)
        return tok.texts_to_matrix(texts)

    def vocab_size(self):
        """Return the number of entries in the tokenizer's ``word_counts``."""
        counts = self.tokenizer.word_counts
        return len(counts)

In [34]:
# Load the tweet export and attach derived feature columns to the same frame.
tweets = pandas.read_csv("trumptweets.csv")
tweets['count'] = 1

# Parse `created_at` once, then pull the calendar fields off the parsed column.
tweets['date'] = pandas.to_datetime(tweets['created_at'])
created = tweets['date'].dt
tweets['time'] = created.time
tweets['hour'] = created.hour
tweets['weekday'] = created.weekday

# Flags computed from the tweet text.
text_ops = tweets['text'].str
tweets['metatweet'] = text_ops.startswith('\"@')
tweets['contains_url'] = text_ops.find('http') >= 0

# Flag computed from the posting client.
tweets['is_trump'] = tweets['source'] == 'Twitter for Android'

tweets.head()

Unnamed: 0,source,favorite_count,created_at,retweet_count,text,id_str,in_reply_to_user_id_str,is_retweet,count,date,time,hour,weekday,metatweet,contains_url,is_trump
0,Twitter Web Client,12,Wed Dec 23 17:38:18 +0000 2009,28,From Donald Trump: Wishing everyone a wonderfu...,6971079756,,False,1,2009-12-23 17:38:18,17:38:18,17,2,False,False,False
1,Twitter Web Client,6,Thu Dec 03 19:39:09 +0000 2009,33,Trump International Tower in Chicago ranked 6t...,6312794445,,False,1,2009-12-03 19:39:09,19:39:09,19,3,False,True,False
2,Twitter Web Client,11,Thu Nov 26 19:55:38 +0000 2009,13,Wishing you and yours a very Happy and Bountif...,6090839867,,False,1,2009-11-26 19:55:38,19:55:38,19,3,False,False,False
3,Twitter Web Client,3,Mon Nov 16 21:06:10 +0000 2009,5,Donald Trump Partners with TV1 on New Reality ...,5775731054,,False,1,2009-11-16 21:06:10,21:06:10,21,0,False,True,False
4,Twitter Web Client,6,Mon Nov 02 14:57:56 +0000 2009,7,"--Work has begun, ahead of schedule, to build ...",5364614040,,False,1,2009-11-02 14:57:56,14:57:56,14,0,False,False,False


In [74]:
# Boolean row masks over `tweets`: posting year and posting client.
is_2016 = tweets['date'].dt.year.eq(2016)
is_iphone = tweets['source'].eq('Twitter for iPhone')
is_android = tweets['source'].eq('Twitter for Android')

# Keep 2016 tweets sent from either phone client, then extract their text.
train_set = tweets.loc[is_2016 & (is_iphone | is_android)]
tweet_texts = train_set['text'].tolist()

In [75]:
# Build a Preprocessor with its default settings and apply it to the tweet
# texts. Calling the instance both fits its tokenizer on `tweet_texts` and
# returns the matrix produced by the tokenizer's `texts_to_matrix`.
preproc = Preprocessor()
preproc_texts = preproc(tweet_texts)

In [76]:
# Model hyperparameters: the vocabulary size comes from the fitted
# preprocessor; the layer widths are fixed constants.
vocab_size = preproc.vocab_size()
hidden_dim, embedding_dim = 50, 100

In [79]:
# Build and compile the model (compilation happens inside the constructor).
# NOTE(review): `vocab_size` is the length of the tokenizer's `word_counts`;
# if token indices start at 1, the embedding may need `vocab_size + 1` -- confirm.
text_model = RNNTextModel(vocab_size, embedding_dim, hidden_dim)

In [None]:
# NOTE(review): `RNNTextModel.fit` requires `texts` and `labels`; this call
# passes neither and raises a TypeError as written.
# TODO: supply the training inputs and targets.
text_model.fit()

def _select_rows(seq, positions):
    """Return the items of ``seq`` at the given integer positions, keeping its container kind."""
    if hasattr(seq, 'iloc'):
        # pandas objects: select by position, not by index label.
        return seq.iloc[positions]
    if isinstance(seq, np.ndarray):
        # Explicit integer dtype so an empty position list still indexes cleanly.
        return seq[np.asarray(positions, dtype=int)]
    return [seq[p] for p in positions]


def cross_validation(data, labels, folds=10):
    """Split ``data`` and ``labels`` into interleaved cross-validation folds.

    Fold ``i`` uses the items at positions ``i, i + folds, i + 2 * folds, ...``
    as its test set and every other item, in original order, as its training
    set.

    Parameters
    ----------
    data : sequence
        Samples; a list, NumPy array or pandas object of length ``n``.
    labels : sequence
        Targets aligned with ``data``; must have the same length.
    folds : int, optional
        Number of folds (default 10). Must be at least 1.

    Returns
    -------
    list of tuple
        One ``(train_data, train_labels, test_data, test_labels)`` tuple per
        fold. NumPy and pandas inputs are sliced into the same kind of
        object; other sequences are returned as lists.

    Raises
    ------
    ValueError
        If ``folds`` is less than 1 or the two inputs differ in length.
    """
    if folds < 1:
        raise ValueError("folds must be a positive integer")

    n_items = len(data)
    if len(labels) != n_items:
        raise ValueError("data and labels must have the same length")

    splits = []
    for i in range(folds):
        test_positions = list(range(i, n_items, folds))
        # Everything not in fold i is training data (the original loop sliced
        # with `i` instead of the other fold's index and never initialised
        # its accumulator).
        train_positions = [p for p in range(n_items) if p % folds != i]
        splits.append((
            _select_rows(data, train_positions),
            _select_rows(labels, train_positions),
            _select_rows(data, test_positions),
            _select_rows(labels, test_positions),
        ))
    return splits