In [1]:
import os
import numpy as np
from datetime import datetime, timedelta
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

from keras.activations import relu
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Embedding, Flatten, Conv1D, MaxPooling1D
from keras.layers import Dropout, concatenate
from keras.utils.vis_utils import model_to_dot

from sklearn.metrics import classification_report

from IPython.display import SVG

import random

from collections import namedtuple

from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import matplotlib.pyplot as plt

In [102]:
def compute_metrics(raw_predictions, label_encoder):
    # convert raw predictions to class indexes
    threshold = 0.5
    class_predictions = [(x > threshold).astype(int) for x in raw_predictions]

    # select only one class (i.e., the dim in the vector with 1.0 all other are at 0.0)
    class_index = ([np.argmax(x) for x in class_predictions])

    # convert back to original class names
    pred_classes = label_encoder.inverse_transform(class_index)

    # print precision, recall, f1-score report
    print(classification_report(y_test, pred_classes))

def load_fasttext_embeddings():
    glove_dir = '../Data/Processed_Data/glove.6B'
    embeddings_index = {}
    f = open(os.path.join(glove_dir, 'glove.6B.300d.txt'),encoding = "utf8")
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index

def create_embeddings_matrix(embeddings_index, vocabulary, embedding_dim=300):
    embeddings_matrix = np.random.rand(len(vocabulary)+1, embedding_dim)
    for i, word in enumerate(vocabulary):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embeddings_matrix[i] = embedding_vector
    print('Matrix shape: {}'.format(embeddings_matrix.shape))
    return embeddings_matrix


def get_embeddings_layer(embeddings_matrix, name, max_len, trainable=False):
    embedding_layer = Embedding(
        input_dim=embeddings_matrix.shape[0],
        output_dim=embeddings_matrix.shape[1],
        input_length=max_len,
        weights=[embeddings_matrix],
        trainable=trainable,
        name=name)
    return embedding_layer


def get_conv_pool(x_input, max_len, suffix, n_grams=[3,4,5], feature_maps=300):
    branches = []
    for n in n_grams:
        branch = Conv1D(filters=feature_maps, kernel_size=n, activation=relu, name='Conv_'+suffix+'_'+str(n))(x_input)
        branch = MaxPooling1D(pool_size=max_len-n+1, strides=None, padding='valid', name='MaxPooling_'+suffix+'_'+str(n))(branch)
        branch = Flatten(name='Flatten_'+suffix+'_'+str(n))(branch)
        branches.append(branch)
    return branches

In [103]:
def get_cnn_pre_trained_embeddings(embedding_layer, max_len, num_classes):
    # connect the input with the embedding layer
    i = Input(shape=(max_len,), dtype='int32', name='main_input')
    x = embedding_layer(i)

    # generate several branches in the network, each for a different convolution+pooling operation,
    # and concatenate the result of each branch into a single vector
    branches = get_conv_pool(x, max_len, 'static')
    z = concatenate(branches, axis=-1)

    # pass the concatenated vector to the predition layer
    o = Dense(num_classes, activation='sigmoid', name='output')(z)

    model = Model(inputs=i, outputs=o)
    model.compile(loss={'output': 'binary_crossentropy'}, optimizer='adam', metrics=['accuracy'])

    return model

In [140]:
def get_date_features(df):
    df['Month']=df.Date.dt.month
    df['Day']=df.Date.dt.day
    df['Year']=df.Date.dt.year
    df['DayOfWeek']=df.Date.dt.dayofweek
    df['DayOfYear']=df.Date.dt.dayofyear
    df['WeekOfYear']=df.Date.dt.weekofyear
    return df

In [189]:
to_datetime = lambda d: datetime.strptime(d, "%Y-%m-%d")
input_data=pd.read_csv('../Data/Processed_Data/input_data_for_cnn.csv',sep='|',converters={'Date': to_datetime},encoding = "ISO-8859-1")
# input_data=pd.read_csv('../Data/Processed_Data/input_data_for_cnn.csv',sep='|',encoding = "ISO-8859-1")
train_data=input_data[input_data.Date<'2019-01-01']
test_data=input_data[input_data.Date.between('2019-01-01','2019-05-31',inclusive=True)]
validation_data=input_data[input_data.Date>='2019-06-01']
train_data=get_date_features(train_data)
test_data=get_date_features(test_data)
validation_data=get_date_features(validation_data)

train_data.reset_index(inplace=True)
test_data.reset_index(inplace=True)
validation_data.reset_index(inplace=True)
input_data={}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [190]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data.Cleaned_HL)
sequences_train = tokenizer.texts_to_sequences(train_data.Cleaned_HL)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 7140 unique tokens.


In [191]:
max_input_len = max([len(x) for x in train_data.Cleaned_HL])
x_train_data_padded = pad_sequences(sequences_train, maxlen=max_input_len, padding='post', truncating='post')
x_test_data_padded = pad_sequences(tokenizer.texts_to_sequences(test_data.Cleaned_HL), maxlen=max_input_len, padding='post', truncating='post')
x_val_data_padded = pad_sequences(tokenizer.texts_to_sequences(validation_data.Cleaned_HL), maxlen=max_input_len, padding='post', truncating='post')

In [198]:
x_train_with_dt=pd.DataFrame(x_train_data_padded).join(train_data[['Month', 'Day', 'Year', 'DayOfWeek','DayOfYear', 'WeekOfYear']]).to_numpy()
x_test_with_dt=pd.DataFrame(x_test_data_padded).join(train_data[['Month', 'Day', 'Year', 'DayOfWeek','DayOfYear', 'WeekOfYear']]).to_numpy()
x_val_with_dt=pd.DataFrame(x_val_data_padded).join(train_data[['Month', 'Day', 'Year', 'DayOfWeek','DayOfYear', 'WeekOfYear']]).to_numpy()

In [199]:
le = LabelEncoder()
le.fit(train_data.MoveMent)
y_train_encoded=to_categorical(le.transform(train_data.MoveMent), num_classes=None)
y_test_encoded=to_categorical(le.transform(test_data.MoveMent), num_classes=None)
y_val_encoded=to_categorical(le.transform(validation_data.MoveMent), num_classes=None)

In [200]:
print(x_train_data_padded.shape)
print(y_train_encoded.shape)

(14050, 98)
(14050, 3)


In [205]:
embeddings_index = load_fasttext_embeddings()
embeddings_matrix = create_embeddings_matrix(embeddings_index, word_index)
# embedding_layer_static = get_embeddings_layer(embeddings_matrix, 'embedding_layer_static', max_input_len, trainable=False)
# model = get_cnn_pre_trained_embeddings(embedding_layer_static, max_input_len,3)
embedding_layer_static = get_embeddings_layer(embeddings_matrix, 'embedding_layer_static', 104, trainable=False)
model = get_cnn_pre_trained_embeddings(embedding_layer_static, 104,3)

Loaded 400000 word vectors.
Matrix shape: (7141, 300)


In [206]:
# history = model.fit(x=x_train_data_padded, y=y_train_encoded, batch_size=50, epochs=5)
history = model.fit(x=x_train_with_dt, y=y_train_encoded, batch_size=50, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [207]:
# loss, accuracy = model.evaluate(x_train_data_padded, y_train_encoded, verbose=True)
loss, accuracy = model.evaluate(x_train_with_dt, y_train_encoded, verbose=True)
print("Training Accuracy: {:.4f}".format(accuracy))
# loss, accuracy = model.evaluate(x_test_data_padded, y_test_encoded, verbose=False)
loss, accuracy = model.evaluate(x_test_with_dt, y_test_encoded, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 1.0000
Testing Accuracy:  0.6842


In [208]:
raw_predictions = model.predict(x_test_with_dt)
class_predictions = [np.argmax(x) for x in raw_predictions]
print(classification_report(le.transform(test_data.MoveMent), class_predictions,labels=[0,1,2],target_names=le.classes_))

              precision    recall  f1-score   support

        down       0.44      0.52      0.47       438
  nomovement       0.00      0.00      0.00         0
          up       0.63      0.53      0.58       643

   micro avg       0.53      0.53      0.53      1081
   macro avg       0.35      0.35      0.35      1081
weighted avg       0.55      0.53      0.54      1081



  'recall', 'true', average, warn_for)


In [33]:
raw_predictions = model.predict(x_test_data_padded)
class_predictions = [np.argmax(x) for x in raw_predictions]
print(classification_report(le.transform(test_data.MoveMent), class_predictions,labels=[0,1,2],target_names=le.classes_))

              precision    recall  f1-score   support

        down       0.42      0.61      0.50       438
  nomovement       0.00      0.00      0.00         0
          up       0.61      0.42      0.50       643

   micro avg       0.50      0.50      0.50      1081
   macro avg       0.34      0.34      0.33      1081
weighted avg       0.53      0.50      0.50      1081



In [37]:
from sklearn.metrics import confusion_matrix
confusion_matrix(le.transform(test_data.MoveMent), class_predictions)

array([[267, 171],
       [371, 272]], dtype=int64)

In [36]:
help(classification_report)

Help on function classification_report in module sklearn.metrics.classification:

classification_report(y_true, y_pred, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False)
    Build a text report showing the main classification metrics
    
    Read more in the :ref:`User Guide <classification_report>`.
    
    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
    
    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    
    labels : array, shape = [n_labels]
        Optional list of label indices to include in the report.
    
    target_names : list of strings
        Optional display names matching the labels (same order).
    
    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.
    
    digits : int
        Number of digits for formatting output floating po