In [1]:
import functools

import numpy as np
import tensorflow as tf

In [2]:
LABEL_COLUMN = 'Close'
BATCH_SIZE = 10
WINDOW_SIZE = 5
EPOCHS = 1
train_file_path = 'data/all_data.csv'

def get_dataset(file_path, **kwargs):
  dataset = tf.data.experimental.make_csv_dataset(
      file_path,
      shuffle=False,
      #prefetch_buffer_size=2,
      ignore_errors=True, 
      **kwargs)
  return dataset

NUMERIC_FEATURES = [
    #'Close',
    'Volume_BTC',
    'replies_sum',
    'replies_mean',
    'likes_sum',
    'likes_mean',
    'retweets_sum',
    'retweets_mean',
    'Negative_mean',
    'Negative_replies_mean',
    'Negative_likes_mean',
    'Negative_retweets_mean',
    'Neutral_mean',
    'Neutral_replies_mean',
    'Neutral_likes_mean',
    'Neutral_retweets_mean',
    'Compound_mean',
    'Compound_replies_mean',
    'Compound_likes_mean',
    'Compound_retweets_mean',
    'Polarity_mean',
    'Polarity_replies_mean',
    'Polarity_likes_mean',
    'Polarity_retweets_mean',
    'Subjectivity_mean',
    'Subjectivity_replies_mean',
    'Subjectivity_likes_mean',
    'Subjectivity_retweets_mean',
]

BOOLEAN_FEATURES = [
    'no_tweets',
    'no_data'
]

CATEGORICAL_FEATURES = []

features_columns = NUMERIC_FEATURES + CATEGORICAL_FEATURES + BOOLEAN_FEATURES
select_columns = [LABEL_COLUMN] + features_columns
#select_columns = [LABEL_COLUMN] + NUMERIC_FEATURES# + CATEGORICAL_FEATURES + BOOLEAN_FEATURES

ds = get_dataset(train_file_path, field_delim='\t', select_columns=select_columns, batch_size=WINDOW_SIZE, num_epochs=EPOCHS, label_name=LABEL_COLUMN)

In [3]:
class PackNumericFeatures(object):
  def __init__(self, names):
    self.names = names

  def __call__(self, features, labels):
    numeric_features = [features.pop(name) for name in self.names]
    numeric_features = [tf.cast(feat, tf.float32) for feat in numeric_features]
    numeric_features = tf.stack(numeric_features, axis=-1)
    #features['numeric'] = numeric_features

    #return features, labels
    return numeric_features, labels

ds = ds.map(
    PackNumericFeatures(features_columns)
)

In [4]:
def make_window_dataset(ds, window_size=5, shift=1, stride=1):
  windows = ds.unbatch().window(window_size, shift=shift, stride=stride)

  def sub_to_batch(sub):
    return sub.batch(window_size, drop_remainder=True)

  windows = windows.flat_map(sub_to_batch)
  return windows

In [5]:
labels_ds = ds.unbatch().skip(WINDOW_SIZE).map(lambda x,y: y).batch(1)
window_feature_ds = make_window_dataset(ds.map(lambda x,y: x), window_size=WINDOW_SIZE, shift=1, stride=1)

final_ds = tf.data.Dataset.zip((window_feature_ds, labels_ds)).batch(BATCH_SIZE, drop_remainder=True).shuffle(1000)

In [6]:
for example, label in final_ds.take(1):
    print(label)
    print(example)

tf.Tensor(
[[ 0.  ]
 [ 0.  ]
 [ 0.  ]
 [ 0.  ]
 [ 0.  ]
 [-0.32]
 [ 0.  ]
 [ 0.  ]
 [ 0.  ]
 [ 0.  ]], shape=(10, 1), dtype=float32)
tf.Tensor(
[[[12.066947  0.        0.       ...  0.        0.        0.      ]
  [12.066947  0.        0.       ...  0.        0.        0.      ]
  [12.066947  0.        0.       ...  0.        0.        0.      ]
  [12.066947  0.        0.       ...  0.        0.        0.      ]
  [12.066947  1.        0.2      ...  0.        0.        0.      ]]

 [[12.066947  0.        0.       ...  0.        0.        0.      ]
  [12.066947  0.        0.       ...  0.        0.        0.      ]
  [12.066947  0.        0.       ...  0.        0.        0.      ]
  [12.066947  1.        0.2      ...  0.        0.        0.      ]
  [12.066947  0.        0.       ...  0.        0.        0.      ]]

 [[12.066947  0.        0.       ...  0.        0.        0.      ]
  [12.066947  0.        0.       ...  0.        0.        0.      ]
  [12.066947  1.        0.2      ...

In [7]:
import pandas as pd

desc = pd.read_csv(train_file_path, sep='\t', usecols=NUMERIC_FEATURES).describe()

MEAN = np.array(desc.T['mean'])
STD = np.array(desc.T['std'])

def normalize_numeric_data(data, mean, std):
  # Center the data
  return (data-mean)/std

normalizer = functools.partial(normalize_numeric_data, mean=MEAN, std=STD)

#numeric_column = tf.feature_column.numeric_column('numeric', normalizer_fn=normalizer, shape=[len(NUMERIC_FEATURES)])
numeric_column = tf.feature_column.sequence_numeric_column('numeric', normalizer_fn=normalizer, shape=[len(NUMERIC_FEATURES)])
numeric_columns = [numeric_column]

In [8]:
CATEGORIES = {
    # 'class' : ['First', 'Second', 'Third']
}
categorical_columns = []
for feature, vocab in CATEGORIES.items():
  #cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
  #      key=feature, vocabulary_list=vocab)
  cat_col = tf.feature_column.sequence_categorical_column_with_vocabulary_list(
        key=feature, vocabulary_list=vocab)
  categorical_columns.append(tf.feature_column.indicator_column(cat_col))

In [9]:
boolean_columns = []
for feature in BOOLEAN_FEATURES:
    #bool_col = tf.feature_column.numeric_column(key=feature, default_value=0, dtype=tf.int8)
    bool_col = tf.feature_column.sequence_numeric_column(key=feature, default_value=0, dtype=tf.int8)
    boolean_columns.append(bool_col)

In [10]:
#preprocessing_layer = tf.keras.layers.DenseFeatures(numeric_columns+categorical_columns+boolean_columns)
#preprocessing_layer = tf.keras.experimental.SequenceFeatures(numeric_columns+categorical_columns+boolean_columns)
preprocessing_layer = tf.keras.experimental.SequenceFeatures(numeric_columns)

feature_inputs = {}

for x, y in ds.take(1):
    for key, value in x.items():
        feature_inputs.update(
            {
                key: tf.keras.Input(value.shape, dtype=value.dtype, name=key)
            }
        )
        print(key, value.shape, value.dtype)

print(feature_inputs)

#inputs = preprocessing_layer(feature_inputs)

AttributeError: 'tensorflow.python.framework.ops.EagerTensor' object has no attribute 'items'

In [13]:
model = tf.keras.models.Sequential(
    [
        #preprocessing_layer,
        #tf.keras.layers.Reshape((WINDOW_SIZE, len(select_columns))),
        tf.keras.layers.LSTM(32,
                               return_sequences=True,),
                               #input_shape=(30, BATCH_SIZE)),
        tf.keras.layers.LSTM(16, activation='relu'),
        tf.keras.layers.Dense(1)
    ]
)
model.compile(optimizer=tf.keras.optimizers.RMSprop(clipvalue=1.0), loss='mae')

In [14]:
model.fit(final_ds, verbose=1)



<tensorflow.python.keras.callbacks.History at 0x1781ffb8b48>

In [15]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                multiple                  7936      
_________________________________________________________________
lstm_3 (LSTM)                multiple                  3136      
_________________________________________________________________
dense_1 (Dense)              multiple                  17        
Total params: 11,089
Trainable params: 11,089
Non-trainable params: 0
_________________________________________________________________
