In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import nltk
import json
from nltk.corpus import stopwords
from string import punctuation
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from keras import optimizers
from keras.layers import Input, Dense, Embedding, Conv1D, Dropout, MaxPooling1D, Activation, Flatten
from keras.models import Model
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from collections import defaultdict

Using TensorFlow backend.


In [2]:
# Removes stop words.
def text_tokens(tweets,text_parse):
    
    tweet_tokens = []
    
    for tweet in tweets:
        
        token_list = []
        tokens = tknzr.tokenize(tweet)
        
        for token in tokens:
            
            token_value=token.lower()
            
            if token_value not in text_parse:
                
                token_list.append(token_value)
        
        tweet_tokens.append(token_list)
    
    return tweet_tokens

In [3]:
# Creates vocabulary.
def vocab_build(token_list):
    
    vocab = defaultdict()
    feature_vocab = defaultdict()
    vocab.default_factory = vocab.__len__
    
    for tokens in token_list:
        
        for word in tokens:
                
            vocab[word]
            feature_vocab[vocab[word]] = word
    
    return vocab,feature_vocab

In [4]:
# Computes document term frequency for vocabulary.
def document_build(token_list,vocab):
    
    document_term = {}
    
    for term,_ in vocab.items():
    
        document_term[term] = 0
    
    for tokens in token_list:
        
        for word in tokens:
            
            try:
                
                document_term[word] += 1
            
            except:
                
                continue
    
    return document_term

In [5]:
# Calculates TF-IDF for all samples.
def total_importance(token_list,vocab):
    
    ti = {}
    document_term = document_build(token_list,vocab)
    documents = len(token_list)
    
    for idx,tokens in enumerate(token_list):
        
        ti[idx] = {}
        frequency = nltk.FreqDist(tokens)
        
        for term,_ in frequency.items():
            
            try:
            
                feature_idx = vocab[term]
                ti[idx][feature_idx] = frequency[term] * np.log(documents/document_term[term])
            
            except:
                
                continue
    
    return ti

In [6]:
# Loads word embeddings from GloVe trained model.
def get_glove_embeddings(filename):
    
    fh = open(filename,'r',buffering=4096,encoding='UTF-8')
    
    word_embeddings = {}
    vec_float = np.vectorize(float)
    
    for line in fh:
        
        vec = line.split()
        
        word_embeddings[vec[0]] = vec_float(vec[1:])
    
    return word_embeddings

In [7]:
# Loads word embeddings from Word2Vec trained model.
def get_w2v_embeddings(filename):
    
    with open(filename,'r') as f:
        
        data = json.load(f)
    
    words = list(data.keys())
    w2v_embeddings = {}
    array = np.vectorize(float)
    
    for word in words:
        
        w2v_embeddings[word] = array(data[word])
    
    return w2v_embeddings

In [8]:
# Loads word embeddings from FastText trained model.
def get_fast_text_embeddings(filename):
    
    fh = open(filename,'r',buffering=4096,encoding='UTF-8')
    
    fast_text_embeddings = {}
    array = np.vectorize(float)
    
    for idx,line in enumerate(fh):
        
        vec = line.split()
        fast_text_embeddings[vec[0]] = array(vec[1:])
    
    return fast_text_embeddings

In [9]:
# Embeds top 14 most important words per tweet based on TF-IDF (Avg token count is 14). 
def embed_tweets(token_list,w2v_embeddings,ti,vocab,word_number):
    
    X_embed = np.zeros((len(token_list),word_number,300))
    
    for idx,tokens in enumerate(token_list):
        
        words = []
        words_ti = []
        
        for token in tokens:
            
            words.append(token)
            
            try:
                
                words_ti.append(ti[idx][vocab[token]])
            
            except:
                
                words_ti.append(0)
        
        if len(words) < word_number:
            
            words += ['Pad']*(word_number-len(words))
            words_ti += [0]*(word_number-len(words_ti))
            idx_sort = np.flip(np.argsort(words_ti),axis=0)
            words = list(np.array(words)[idx_sort])
            words_ti = list(np.array(words_ti)[idx_sort])
        
        else:
            
            idx_sort = np.flip(np.argsort(words_ti)[-word_number:],axis=0)
            words = list(np.array(words)[idx_sort])
            words_ti = list(np.array(words_ti)[idx_sort])
        
        embeddings = []
        
        for word_idx,word in enumerate(words):
            
            embeddings.append(words_ti[word_idx]*w2v_embeddings.get(word,np.zeros(300)))
        
        embeddings = np.array(embeddings)
        X_embed[idx] = embeddings
   
    return X_embed

In [10]:
# Generates features.
def feature_generation(df,stop_words):
    
    df['tokens'] = text_tokens(df.text.values,stop_words)
    
    df = pd.DataFrame(df.values,index=list(range(len(df))),columns=df.columns)
    token_idx = [idx for idx,token in enumerate(df.tokens.values) if len(token) == 0]
    df = df.drop(token_idx,axis=0)
    
    df['daily_diff'] = df.close-df.open
    df['up_down'] = [1 if diff > 0 else 0 for diff in df.daily_diff]

    data = df.values
    
    xtr,xte,ytr,yte = train_test_split(data[:,:-1],data[:,-1],test_size=.2,stratify=data[:,-1])
    xtr,x_val,ytr,y_val = train_test_split(xtr,ytr,test_size=.2,stratify=ytr)
    
    df_train = pd.DataFrame(xtr,columns=list(df.columns[:-1]))
    df_test = pd.DataFrame(xte,columns=list(df.columns[:-1]))
    df_val = pd.DataFrame(x_val,columns=list(df.columns[:-1]))
    
    train_tokens = df_train.tokens.values
    test_tokens = df_test.tokens.values
    val_tokens = df_val.tokens.values
    
    vocab,feature_vocab = vocab_build(train_tokens)
        
    w2v_embeddings = get_w2v_embeddings('../w2v_embeddings.json')
    
    ti_train = total_importance(train_tokens,vocab)
    ti_test = total_importance(test_tokens,vocab)
    ti_val = total_importance(val_tokens,vocab)
    
    train_embed = embed_tweets(train_tokens,w2v_embeddings,ti_train,vocab,14)
    test_embed = embed_tweets(test_tokens,w2v_embeddings,ti_test,vocab,14)
    val_embed = embed_tweets(val_tokens,w2v_embeddings,ti_val,vocab,14)
    
    ytr = np.array([ytr.astype(int)]).T
    yte = np.array([yte.astype(int)]).T
    y_val = np.array([y_val.astype(int)]).T
    
    encoder = OneHotEncoder()
    
    y_train = encoder.fit_transform(ytr)
    y_test = encoder.fit_transform(yte)
    y_val = encoder.fit_transform(y_val)
    
    return train_embed,test_embed,val_embed,y_train,y_test,y_val

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
tknzr = RegexpTokenizer('[a-zA-Z]+')

In [13]:
punctuation_parse = [mark for mark in punctuation]
word_parse = stopwords.words('english')
stop_words = punctuation_parse + word_parse

In [14]:
df = pd.read_json('../twitter/twitter_stock_data.json')

In [15]:
x_train,x_test,x_val,y_train,y_test,y_val = feature_generation(df,stop_words)

In [16]:
sgd = optimizers.SGD()

In [17]:
# Each embedding feature is treated as a seperate channel.

# Each word per filter is a summation of the linear combination of three hundred seperate weights and
# all embedding features.

# Combination is then normalized when passed through ReLu activation.

# At maxpooling stage two neighboring words are compared per feature and the feature of the word contributing the 
# largest value to the feature map is retained and the process repeated until the combination of features 
# from all words contributing the most per tweet remains and its weights used as a tweet generalization. 

model = Sequential()

model.add(Conv1D(300,kernel_size=1,strides=1,input_shape=(14,300)))
model.add(Activation('relu'))
model.add(MaxPooling1D(2))
model.add(Dropout(.2))

model.add(Conv1D(300,kernel_size=1,strides=1))
model.add(Activation('relu'))
model.add(MaxPooling1D(2))
model.add(Dropout(.2))

model.add(Conv1D(300,kernel_size=1,strides=1))
model.add(Activation('relu'))
model.add(MaxPooling1D(2))
model.add(Dropout(.2))

model.add(Flatten())
model.add(Dense(2,activation='softmax'))
model.compile(optimizer=sgd,loss='binary_crossentropy',metrics=['accuracy'])

In [18]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 14, 300)           90300     
_________________________________________________________________
activation_1 (Activation)    (None, 14, 300)           0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 7, 300)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 7, 300)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 7, 300)            90300     
_________________________________________________________________
activation_2 (Activation)    (None, 7, 300)            0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 3, 300)            0         
__________

In [19]:
model_json = model.to_json()

In [20]:
with open('word_embedding_model.json','w') as f:
    json.dump(model_json,f)
f.close()

In [21]:
early_stopping = EarlyStopping('val_acc',patience=3,verbose=1)

In [22]:
checkpoint= ModelCheckpoint('word_embedding_weights.hdf5','val_acc',save_best_only=True)

In [23]:
history = model.fit(x_train,y_train,epochs=100,callbacks=[early_stopping,checkpoint],\
              validation_data=(x_val,y_val))

Train on 157654 samples, validate on 39414 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 00029: early stopping


In [24]:
model.evaluate(x_test,y_test)



[0.31113756501564316, 0.84990257368349476]

In [25]:
embedding_model = Sequential()

In [26]:
for layer in model.layers[:-1]:
    embedding_model.add(layer)

In [27]:
embedding_model_json = model.to_json()

In [28]:
with open('tweet_embedding_model.json','w') as f:
    json.dump(embedding_model_json,f)
f.close()

In [29]:
embedding_model.save_weights('tweet_embedding_weights.hdf5')