In [1]:
import pandas as pd
import numpy as np
import keras
import os
import tensorflow as tf
from keras import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Embedding, Dense, Bidirectional, LSTM, Dropout
from keras.layers import Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.initializers import Constant
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
from keras.models import Model

Using TensorFlow backend.


In [2]:
EMBEDDING_DIM = 100
MAX_NUM_WORDS = 20000
MAX_SEQUENCE_LENGTH = 1000
VALIDATION_SPLIT = 0.1

In [3]:
text_path = "tweet&price.csv"
data = pd.read_csv(text_path)

data = data.sample(frac=1).reset_index(drop=True)

In [4]:
tweets = data["Tweet"].to_numpy(dtype="str")

In [5]:
change = (data["change"].to_numpy(dtype="float"))*100

In [6]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(tweets)
sequences = tokenizer.texts_to_sequences(tweets)

In [7]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 19936 unique tokens.


In [8]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [9]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
change = change[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [10]:
x_train = data[:-num_validation_samples]
y_train = change[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = change[-num_validation_samples:]

In [11]:
print('Indexing word vectors.')

embeddings_index = {}
with open(os.path.join("glove.6B/glove.6B.100d.txt")) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [12]:
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [13]:
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [14]:
model = Sequential()

model.add(embedding_layer)

model.add(Bidirectional(LSTM(128), input_shape=(MAX_SEQUENCE_LENGTH, len(x_train))))
model.add(Dropout(0.2))
model.add(Dense(len(x_train), activation='softmax'))

model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))

optimizer = tf.keras.optimizers.RMSprop(0.001)

model.compile(loss='mse',
            optimizer=optimizer,
            metrics=['mae', 'mse'])

In [15]:
EPOCHS = 1000

history = model.fit(
  x_train, y_train,
  epochs=EPOCHS, validation_split = 0.1, verbose=0,
  callbacks=[tfdocs.modeling.EpochDots()])


Epoch: 0, loss:7658.2729,  mae:57.2852,  mse:7658.2715,  val_loss:6351.2564,  val_mae:56.4120,  val_mse:6351.2563,  
....................................................................................................
Epoch: 100, loss:265.9584,  mae:11.9047,  mse:265.9584,  val_loss:9396.2800,  val_mae:73.6623,  val_mse:9396.2793,  
....................................................................................................
Epoch: 200, loss:107.9770,  mae:7.6939,  mse:107.9770,  val_loss:8633.1306,  val_mae:69.0313,  val_mse:8633.1299,  
....................................................................................................
Epoch: 300, loss:64.5452,  mae:5.7763,  mse:64.5452,  val_loss:8120.2390,  val_mae:66.7696,  val_mse:8120.2393,  
....................................................................................................
Epoch: 400, loss:36.0509,  mae:4.3429,  mse:36.0509,  val_loss:8370.0838,  val_mae:67.9585,  val_mse:8370.0840,  
.................

In [16]:
model.save('model.pb')

  'TensorFlow optimizers do not '
