In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import os
import re


In [None]:
import joblib
import sys

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import numpy as np
import scipy
import seaborn as sns
import matplotlib.pylab as plt

# Root folder of the input data, given relative to the working directory.
data_directory = '../data/'


In [None]:
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import RMSprop,Adam
from keras.utils import np_utils


# Training hyper-parameters. The fit calls visible in this notebook pass their
# own literal values, so these names may be unused — TODO confirm.
batch_size = 32
nb_classes = 2
nb_epoch = 5


In [None]:
# LSTM for sequence classification in the IMDB dataset
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense,Input
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# NOTE: the original remark here was "fix random seed for reproducibility",
# but no seed is actually set in this cell.
from keras.layers import Embedding, LSTM

import json



In [None]:
# Load the cleaned posts table from <data_directory>/posts_raw_cleaned/.
posts_raw_cleaned = pd.read_csv(os.path.join(data_directory,
                                             'posts_raw_cleaned',
                                             'posts_raw_cleaned.csv'))

# Keep only rows that have a body. The explicit .copy() makes the result an
# independent frame, so the column assignments done in later cells write to it
# directly instead of raising pandas' SettingWithCopyWarning.
posts_raw_cleaned = posts_raw_cleaned[posts_raw_cleaned['body'].notnull()].copy()

# For a quick experiment, uncomment to work on a small subset:
# posts_raw_cleaned = posts_raw_cleaned[0:5000]

# Raw post bodies, in the same row order as the frame.
texts = list(posts_raw_cleaned['body'])

# Continuous payout values; later cells rebind `labels` to a binary target.
labels = posts_raw_cleaned['total_payout_value']

In [None]:
# Number of times each post body mentions "steem", ignoring case.
lowercased_bodies = posts_raw_cleaned['body'].str.lower()
steem_counts = lowercased_bodies.str.count('steem')

In [None]:
# Store the per-post "steem" mention count as a feature column.
posts_raw_cleaned['number of steem counts'] = steem_counts

In [None]:
def _primary_language(raw_value):
    """Return the language of the first detection, or 'unknown'.

    `raw_value` is one entry of the 'body_language' column: a JSON-encoded
    list whose first element carries a 'language' key, the literal string
    '[]', or a missing value.
    """
    if pd.notnull(raw_value) and raw_value != '[]':
        return json.loads(raw_value)[0]['language']
    return 'unknown'


languages = [_primary_language(entry)
             for entry in posts_raw_cleaned['body_language']]

posts_raw_cleaned['language'] = languages

In [None]:
# Rescale the author reputation by a factor of 10**14. Adding 0.0 first turns
# the column into floats, so the division can never be an integer division.
REPUTATION_SCALE = 10 ** 14
posts_raw_cleaned['author_reputation_scaled'] = (
    (posts_raw_cleaned['author_reputation'] + 0.0) / REPUTATION_SCALE
)

In [None]:
# Binary target: 1 when a post's payout is above the median payout, else 0.
payout_values = posts_raw_cleaned['total_payout_value']
labels = (payout_values > np.median(payout_values)).astype(int)

# Alternative kept for reference: threshold on the mean instead.
# labels = (labels > labels.mean()).astype(int).values

In [None]:
# Settings for the text branch of the model.
MAX_SEQUENCE_LENGTH = 300  # passed as maxlen to pad_sequences and as input_length to Embedding
MAX_NB_WORDS = 5000  # passed as nb_words to the Tokenizer
EMBEDDING_DIM = 300  # width of the embedding matrix; the GloVe file loaded below is the 300d one
VALIDATION_SPLIT = 0.2  # NOTE: rebound to 0.33 in the train/validation split cell



In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit a tokenizer on every post body; nb_words=MAX_NB_WORDS is the vocabulary
# limit handed to Keras.
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

# One list of integer word indices per post, in the same order as `texts`.
sequences = tokenizer.texts_to_sequences(texts)

# Mapping word -> integer index learned by the tokenizer.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))


In [None]:
from keras.utils.np_utils import to_categorical

In [None]:
# Bring every token sequence to exactly MAX_SEQUENCE_LENGTH entries.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode into a separate name, used only to report the tensor shape.
# Rebinding `labels` itself made this cell unsafe to re-run: a second run would
# feed the already one-hot labels back into to_categorical. The split cell
# further down derives its own labels from the data frame.
labels_one_hot = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels_one_hot.shape)

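Download the pre-trained GloVe vectors from http://nlp.stanford.edu/data/glove.6B.zip and unzip them into `../word2vec_models/glove.6B/`; the next cell reads `glove.6B.300d.txt` from that folder.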

In [None]:
import io

GLOVE_DIR = os.path.join('..','word2vec_models','glove.6B')

# word -> float32 vector, parsed from the GloVe text file. Each line holds the
# word followed by its space-separated coefficients.
embeddings_index = {}

# io.open with an explicit encoding decodes the file as UTF-8 regardless of the
# platform's default locale, and the with-block guarantees the handle is closed
# even if parsing a line raises.
with io.open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


In [None]:
# The matrix is indexed directly by the tokenizer's word index, hence the extra
# row. Words without a pre-trained vector keep their all-zero row.
vocabulary_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocabulary_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector


In [None]:
# Sanity check: expected to be (len(word_index) + 1, EMBEDDING_DIM).
embedding_matrix.shape

In [None]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

# Binary target: 1 when a post's payout is above the median payout, else 0.
labels = ((posts_raw_cleaned['total_payout_value'] >  np.median(posts_raw_cleaned['total_payout_value']))
          .astype(int)).values

VALIDATION_SPLIT = 0.33

# Hand-crafted per-post features, in the same row order as `data`.
# .loc replaces the deprecated .ix indexer, and the accidental second
# 'number of image urls' entry has been removed so the column is not duplicated.
features = posts_raw_cleaned.loc[:, ['number of body tags',
                                     'number of body urls',
                                     'number of image urls',
                                     'number of body mentions',
                                     'number of youtube urls',
                                     'language',
                                     'author_reputation_scaled',
                                     'number of steem counts']]

# One-hot encode the 'language' column and work on a plain float array from
# here on, so rows can be selected by position.
features = np.asarray(pd.get_dummies(features), dtype='float64')
number_of_features = features.shape[1]

# Draw ONE permutation and apply it to sequences, labels and features alike, so
# row i of every training array refers to the same post. `data` and `labels`
# are deliberately not rebound, which keeps this cell safe to re-run.
# NOTE(review): the shuffle is unseeded, so the split differs between runs.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
train_idx = indices[:-nb_validation_samples]
val_idx = indices[-nb_validation_samples:]

x_train = data[train_idx]
x_val = data[val_idx]

y_train = to_categorical(labels[train_idx])
y_val = to_categorical(labels[val_idx])

# Fit the scaler on the training rows only and reuse its statistics for the
# validation rows, so no validation information leaks into training.
x_train_features = ss.fit_transform(features[train_idx])
x_values_features = ss.transform(features[val_idx])


In [None]:
from keras import backend as K
# Run all four model input arrays through the backend's cast_to_floatx helper.
x_train, x_val = (K.cast_to_floatx(arr) for arr in (x_train, x_val))

x_train_features, x_values_features = (
    K.cast_to_floatx(arr) for arr in (x_train_features, x_values_features)
)

In [None]:
import json
import io, json
import keras
class LossHistory(keras.callbacks.Callback):
    """Keras callback that records per-batch logs and mirrors them to a text file.

    After every batch the logs dict is appended to `self.losses` and written
    as one line of the form ``key:value, key:value, `` to `log_path`.

    Parameters
    ----------
    log_path : str
        File that receives one line per finished batch. It is emptied when
        training begins.
    """

    def __init__(self, log_path="LTSM-2.txt"):
        super(LossHistory, self).__init__()
        self.log_path = log_path
        self.losses = []

    def on_train_begin(self, logs=None):
        """Reset the in-memory history and start from an empty log file."""
        self.losses = []
        with open(self.log_path, "w"):
            pass

    def on_batch_end(self, batch, logs=None):
        """Record the logs of the finished batch and append them to the file."""
        # Copy so that later changes to the caller's dict cannot alter history.
        batch_logs = dict(logs or {})
        self.losses.append(batch_logs)

        line = ''.join(key + ':' + str(value) + ', '
                       for key, value in batch_logs.items()) + '\n'
        # Append only the new line; rewriting the whole history on every batch
        # made the I/O cost grow quadratically with the number of batches.
        with open(self.log_path, "a") as text_file:
            text_file.write(line)

In [None]:
from keras.layers import Merge



In [None]:
# Frozen embedding layer initialised with the pre-trained embedding matrix.
embedding_layer = Embedding(
    len(word_index) + 1,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

# Text branch: embeddings -> LSTM(100) -> 2 sigmoid units.
left_branch = Sequential([
    embedding_layer,
    LSTM(100, return_sequences=False, activation='softsign'),
    Dense(2, activation='sigmoid'),
])

# Feature branch: two dense ReLU layers on the hand-crafted features.
right_branch = Sequential([
    Dense(50, input_dim=number_of_features, activation='relu'),
    Dense(20, activation='relu'),
])

# Concatenate both branches and classify with a softmax layer that has one
# unit per column of y_train.
merged = Merge([left_branch, right_branch], mode='concat')

final_model = Sequential([
    merged,
    Dense(y_train.shape[1], activation='softmax'),
])

final_model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:

callback = LossHistory()

# One input array per branch, in the order the branches were merged
# (text sequences first, hand-crafted features second).
train_inputs = [x_train, x_train_features]
val_inputs = [x_val, x_values_features]

history = final_model.fit(
    train_inputs,
    y_train,
    callbacks=[callback],
    validation_data=(val_inputs, y_val),
    nb_epoch=20,
)

In [None]:
# Persist the metric history returned by the fit call above.
first_run_history_path = os.path.join('../images', 'first_run' + '_100_history')
joblib.dump(history.history, first_run_history_path)


In [None]:
import itertools

from keras.regularizers import l2


def _weight_penalty(strength):
    """Return a fresh L2 regularizer of the given strength, or None for 0.

    The LSTM's W_regularizer / U_regularizer arguments take a regularizer
    object (or None), not a bare number.
    NOTE(review): the original passed plain floats; an L2 penalty is assumed
    here — switch to l1 if that was the intent.
    """
    return l2(strength) if strength else None


embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

regularizer_strengths = [0, 0.25, 0.5, 0.75]
dropout_rates = [0, 0.1, 0.25]

# Exhaustive grid over both regularizer strengths and both dropout rates.
# itertools.product visits the combinations in the same order as four nested
# loops (W_regularizer outermost, dropout_U innermost).
for W_regularizer, U_regularizer, dropout_W, dropout_U in itertools.product(
        regularizer_strengths, regularizer_strengths,
        dropout_rates, dropout_rates):
    model2 = Sequential()
    model2.add(embedding_layer)
    model2.add(LSTM(100, return_sequences=False,
                    W_regularizer=_weight_penalty(W_regularizer),
                    U_regularizer=_weight_penalty(U_regularizer),
                    dropout_W=dropout_W,
                    dropout_U=dropout_U))
    model2.add(Dense(2, activation='sigmoid'))
    model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model2.fit(x_train, y_train, nb_epoch=50, batch_size=32,
                         validation_data=(x_val, y_val))

    # The output file name encodes the four hyper-parameters of this run.
    params = '_'.join([str(W_regularizer),
                       str(U_regularizer),
                       str(dropout_W),
                       str(dropout_U)])

    joblib.dump(history.history, os.path.join('../images', params + '_100_history'))



In [None]:
# Inspect the one-hot encoded training labels.
y_train

In [None]:
# Plot the training loss plus training/validation accuracy of `history`.
# NOTE(review): the title and file name are hard-coded, while `history` is
# whatever was fitted last — confirm they describe the same run.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
for metric in ('loss', 'acc', 'val_acc'):
    ax.plot(history.history[metric], label=metric)

ax.legend()
ax.set_title('LSTM - dropout_W = 0.1, dropout_U = 0.1')
fig.savefig('results_multi_dropout_5_5.png')

## history.history