## Imports and Installs

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset is read further down
# from a path under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
!pip install tensorflow
!pip install transformers
!pip install keras
!pip install torch

In [None]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers
import tensorflow as tf

from transformers import BertTokenizer
from transformers.tokenization_bert import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from sklearn.metrics import classification_report

## Load data

In [None]:
# Read the combined CSV from the mounted Drive and report how many rows it holds.
csv_path = '/content/gdrive/MyDrive/Combined_FAANG_percentage_2.2.csv'
train_data = pd.read_csv(csv_path, sep=',')
print(f"Number of training examples {len(train_data)}")


Drop rows dated on or before the cutoff date (2018-07-20), so that only the period under study remains


In [None]:
# Remove, in place, every row whose 'Date' is on or before the cutoff.
# The comparison is done on the raw column values against a string literal.
cutoff_date = '2018-07-20'
rows_up_to_cutoff = train_data.index[train_data['Date'] <= cutoff_date]
train_data.drop(rows_up_to_cutoff, inplace = True)
train_data

Drop neutral examples (label 0), then remap the negative label from -1 to 0 so the labels are binary (0 = negative, 1 = positive)


In [None]:
# Remove neutral rows (label == 0) first, while 0 still means "neutral".
# NOTE: this cell is not safe to re-run on its own — after the remap below, 0 means
# "negative", so a second run would drop every negative example.
neutral_rows = train_data.index[train_data['label'] == 0]
train_data.drop(neutral_rows, inplace = True)

# Remap the negative label -1 -> 0. Assign the result back to the column instead of
# calling replace(..., inplace=True) on the column selection: that chained in-place
# form warns on recent pandas and does not modify the frame under Copy-on-Write.
train_data["label"] = train_data["label"].replace({-1: 0})

Shuffle the whole dataset (all rows are kept; only their order changes)


In [None]:
# Shuffle every row. A fixed seed keeps the row order reproducible across runs,
# which the seeded train/test split further down depends on.
train_data = train_data.sample(frac=1, random_state=42)
train_data

Drop NaN messages


In [None]:
# Keep only the rows that actually have a message.
has_message = train_data['message'].notna()
train_data = train_data.loc[has_message]
train_data

## Get BERT Embeddings

In [None]:
# Load the pre-trained 'bert-base-uncased' checkpoint with a 2-label classification
# head; attention maps and hidden states are not returned.
model1 = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

Get word embeddings from the model


In [None]:
# Reach the embedding table through named sub-modules instead of relying on the
# positional order of model.children(), which silently breaks if the module layout
# changes. Variable names are kept as in the original notebook:
#   bert_embeddings1 -> the base BERT encoder wrapped by the classification model
#   word_embeddings1 -> that encoder's embeddings module
bert_embeddings1 = model1.bert
word_embeddings1 = bert_embeddings1.embeddings

# Word-piece embedding table as a NumPy array (one row per vocabulary entry).
word_embeddings = word_embeddings1.word_embeddings.weight.data.numpy()

In [None]:
# Shape of the extracted embedding table; its two dimensions are later used as
# MAX_VOCAB_SIZE (axis 0) and EMBEDDING_DIM (axis 1).
print(word_embeddings.shape)

## Split data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the messages (with their labels) as the test set; the split
# itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    train_data['message'],
    train_data['label'],
    test_size=0.1,
    random_state=42,
)

Create dataframes for training messages, training labels and testing messages


In [None]:
# Wrap the split outputs in single-column DataFrames.
# NOTE: df_train and df_test are both reassigned by later cells before being read;
# of the three, only y_train (a one-column 'label' frame) is consumed as built here.
df_train = pd.DataFrame(X_train,columns=['message'])
df_test = pd.DataFrame(X_test,columns=['message'])
y_train = pd.DataFrame(y_train,columns=['label'])

In [None]:
# Report the size of each split.
n_train, n_test = len(X_train), len(X_test)
print(f"Number of training examples {n_train}")
print(f"Number of testing examples {n_test}")

In [None]:
# Class balance of the training labels.
df_train = pd.DataFrame(y_train, columns=['label'])
train_label_counts = df_train['label'].value_counts()
train_label_counts

In [None]:
# Class balance of the test labels.
df_test = pd.DataFrame(y_test, columns=['label'])
test_label_counts = df_test['label'].value_counts()
test_label_counts

## Prepare data

In [None]:
# Settings shared by the tokenisation and model-building cells.
MAX_SEQUENCE_LENGTH = 160 # token ids per message after padding/truncation (used as maxlen for pad_sequences)
MAX_VOCAB_SIZE = word_embeddings.shape[0] # number of rows in the embedding table, i.e. vocabulary size
EMBEDDING_DIM = word_embeddings.shape[1] # width of each embedding vector
VALIDATION_SPLIT = 0.1 # 10% of training data is used for validation

Create BERT Tokenizer


In [None]:
# Tokenizer matching the 'bert-base-uncased' checkpoint; input text is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

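Tokenize the training messages, convert the tokens to vocabulary ids, and pad/truncate each sequence to `MAX_SEQUENCE_LENGTH`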


In [None]:
# Step 1: split each training message into tokens and map them to vocabulary ids.
train_token_ids = [
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(message))
    for message in X_train
]

# Step 2: pad (or cut) every id sequence at the end to exactly MAX_SEQUENCE_LENGTH.
train_cnn_data = pad_sequences(
    train_token_ids,
    maxlen=MAX_SEQUENCE_LENGTH,
    dtype="long",
    truncating="post",
    padding="post",
)

In [None]:
# Display the padded training token-id matrix (one row per message).
train_cnn_data

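Tokenize the testing messages, convert the tokens to vocabulary ids, and pad/truncate each sequence to `MAX_SEQUENCE_LENGTH`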


In [None]:
# Step 1: split each test message into tokens and map them to vocabulary ids.
test_token_ids = [
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(message))
    for message in X_test
]

# Step 2: pad (or cut) every id sequence at the end to exactly MAX_SEQUENCE_LENGTH.
test_cnn_data = pad_sequences(
    test_token_ids,
    maxlen=MAX_SEQUENCE_LENGTH,
    dtype="long",
    truncating="post",
    padding="post",
)
test_cnn_data

In [None]:
# From here on df_train refers to the padded token-id matrix, not a DataFrame.
df_train = train_cnn_data

# Training labels as a plain array, taken from the 'label' column of y_train.
y_tr = y_train['label'].values
y_tr

In [None]:
# Token -> id mapping of the tokenizer's vocabulary.
word_index = tokenizer.get_vocab()

# Show the vocabulary size and a small sample instead of printing the whole
# mapping, which would flood the cell output.
print("Vocabulary size: {}".format(len(word_index)))
print(list(word_index.items())[:10])

Assign embedding matrix


In [None]:
# Use the extracted BERT word-embedding table directly as the embedding matrix
# (same array object, no copy).
embedding_matrix = word_embeddings
embedding_matrix

## Convolutional Neural Network (CNN)

In [None]:
from keras.layers import concatenate

def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index, trainable=False, extra_conv=True):
    """Build and compile a 1-D convolutional text classifier over pre-trained embeddings.

    Args:
        embeddings: Pre-trained weights used to initialise the embedding layer.
            Presumably an array of shape (num_words, embedding_dim) — confirm
            against the caller.
        max_sequence_length: Number of token ids in each input sequence.
        num_words: Vocabulary size (input dimension of the embedding layer).
        embedding_dim: Width of each embedding vector.
        labels_index: Number of units in the final sigmoid output layer.
        trainable: Whether the embedding weights are updated during training.
        extra_conv: If truthy, use three parallel convolution branches
            (kernel sizes 3, 4 and 5) whose pooled outputs are concatenated
            along the sequence axis. Otherwise use a single kernel-size-3
            convolution branch.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric). The model summary is printed as a side effect.
    """
    # Initialise from the `embeddings` argument rather than a notebook-level
    # global, so the function uses exactly what the caller passes in.
    embedding_layer = Embedding(num_words,
                                embedding_dim,
                                weights=[embeddings],
                                input_length=max_sequence_length,
                                trainable=trainable)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    if extra_conv:
        # Multi-filter-size branches in the style of the Yoon Kim model
        # (https://arxiv.org/abs/1408.5882).
        filter_sizes = [3, 4, 5]
        convs = []
        for filter_size in filter_sizes:
            l_conv = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedded_sequences)
            l_pool = MaxPooling1D(pool_size=3)(l_conv)
            convs.append(l_pool)
        features = concatenate(convs, axis=1)
    else:
        # Single one-dimensional convolution followed by max pooling (pool size 3).
        conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences)
        features = MaxPooling1D(pool_size=3)(conv)

    x = Dropout(0.5)(features)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)

    # Feed output to sigmoid layer to compress output between 0 and 1 for binary classification
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [None]:
# One output unit per entry in label_names (a single binary label here);
# the pre-trained embeddings are kept frozen.
label_names = ['label']
model = ConvNet(
    embeddings=embedding_matrix,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    num_words=MAX_VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    labels_index=len(label_names),
    trainable=False,
)

Define callbacks


In [None]:
# Early stopping on validation loss: a change smaller than min_delta (0.01) does
# not count as an improvement, and patience is 4 epochs.
# NOTE(review): the training cell runs with epochs=2, which is fewer than the
# patience of 4, so this callback presumably can never fire — confirm the intended
# epoch count.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
callbacks_list = [early_stopping]

Train the model


In [None]:
# Train on the padded token-id matrix directly. `df_train` is reassigned several
# times earlier in the notebook (at one point to a labels DataFrame), so referring
# to train_cnn_data avoids fitting on the wrong object if cells run out of order.
# The validation fraction comes from the VALIDATION_SPLIT constant rather than a
# repeated literal.
# NOTE(review): epochs=2 is below the EarlyStopping patience configured in
# callbacks_list, so early stopping cannot trigger — confirm the intended epochs.
hist = model.fit(
    train_cnn_data,
    y_tr,
    epochs=2,
    callbacks=callbacks_list,
    validation_split=VALIDATION_SPLIT,
    shuffle=True,
    batch_size=32,
)

Predict on test set and print classification report


In [None]:
# Score the padded test sequences with the trained model (sigmoid outputs).
preds = model.predict(
    x=test_cnn_data,
    batch_size=8,
    verbose=1,
)
preds

In [None]:
# Scores strictly above the threshold become class 1.0, everything else 0.0.
DECISION_THRESHOLD = 0.53
y_pred = (preds > DECISION_THRESHOLD).astype('float64')

# Number of test examples predicted as class 1.
y_pred.sum()

In [None]:
# Per-class precision / recall / F1 of the thresholded predictions on the test set.
# NOTE(review): zero_division=1 makes metrics that are undefined because of a zero
# denominator be reported as 1 — confirm this is intended, since it can make a
# class that is never predicted look better than it is.
print(classification_report(y_test, y_pred,zero_division=1))