#### Loading Data

In [None]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
import os

# The CSV is looked up in the current working directory, so the notebook must
# be started from the folder that contains the dataset file.
base_dir = os.getcwd()
dataset_path = os.path.join(base_dir, "covid_tweets_with_sentiments_2021-08-26.csv")
df = pd.read_csv(dataset_path)
# Quick sanity check: (rows, columns) followed by the first few records.
print(df.shape)
df.head()

In [None]:
# Discard rows without a sentiment annotation, then display how many rows
# remain for each sentiment value.
has_sentiment = df['sentiment'].notna()
df = df[has_sentiment]
df['sentiment'].value_counts()

#### Feature Engineering

In [None]:
# Encode each sentiment value as an integer class id and keep it in a separate
# column so the original `sentiment` column stays available.
labelencoder = LabelEncoder()
sentiment_ids = labelencoder.fit_transform(df['sentiment'])
df['sentiment_label'] = sentiment_ids
df.head()

In [None]:
# Only rows that still have cleaned tweet text can be used as model input.
has_text = df['cleaned_text'].notna()
df = df[has_text]

# The model input is the author name, author location and cleaned tweet text
# joined by single spaces.
separator = " "
df['input'] = df['user_name'] + separator + df['user_location'] + separator + df['cleaned_text']

# NOTE(review): presumably a missing user_name or user_location makes the
# concatenation NaN, so those rows are removed here as well — confirm that
# dropping them is intended.
has_input = df['input'].notna()
df = df[has_input]

# Class balance after filtering.
df['sentiment_label'].value_counts()

In [None]:
# Preview the first few concatenated input strings.
df['input'].head()

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np


def to_one_hot(labels, dimension=3):
    """Encode integer class labels as one-hot rows.

    Args:
        labels: Sized iterable of class indices; the i-th index selects the
            column that is set to 1 in row i.
        dimension: Number of columns (classes) in the encoding.

    Returns:
        A float array of shape ``(len(labels), dimension)`` that is zero
        everywhere except for a single 1 per row at the label's column.
    """
    encoded = np.zeros((len(labels), dimension))
    # Iterating a 2-D array yields row views, so writing into `row` fills
    # the corresponding row of `encoded` in place.
    for row, class_index in zip(encoded, labels):
        row[class_index] = 1.
    return encoded


# Every sequence is padded/truncated to `maxlen` tokens; `max_words` caps the
# tokenizer vocabulary and is also the input size of the embedding layer.
maxlen = 280
max_words = 40000
tokenizer = Tokenizer(num_words=max_words)

# Build the vocabulary from the input strings and turn each string into a
# list of integer token ids.
tokenizer.fit_on_texts(df['input'])
sequences = tokenizer.texts_to_sequences(df['input'])

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
X = pad_sequences(sequences, maxlen=maxlen)


# One-hot encode the integer class ids (3 columns by default).
one_hot_labels = to_one_hot(df['sentiment_label'])
labels = np.asarray(one_hot_labels)


print('Shape of data tensor:', X.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle samples and labels with the same permutation. The permutation is
# drawn from a seeded generator: an unseeded shuffle here would reorder the
# data differently on every run, so the seeded split below would still pick
# different train/test members each time.
indices = np.random.RandomState(0).permutation(X.shape[0])
X = X[indices]
labels = labels[indices]


# splitting dataset into train and test sets (10% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.1, random_state=0)

In [None]:
# Compare the full label matrix with the held-out test labels.
print(labels.shape)
print(y_test.shape)

In [None]:
# Shape of the training label matrix.
y_train.shape

#### Building model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.layers import Embedding, LSTM
from tensorflow.keras import regularizers
from tensorflow.keras import layers
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import TimeDistributed

# Size of each learned word vector.
embedding_dim = 300

# Architecture: embedding -> two stacked bidirectional LSTMs that both return
# the full sequence -> max over time -> small dense layer -> 3-way classifier.
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Bidirectional(layers.LSTM(32, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model.add(Bidirectional(layers.LSTM(32, return_sequences=True)))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
# The targets are one-hot vectors over three mutually exclusive classes, so
# the head is a softmax trained with categorical cross-entropy. A sigmoid
# head with binary cross-entropy would treat the columns as three independent
# yes/no problems and make 'accuracy' score each column separately, which
# overstates how often the predicted class is right.
model.add(layers.Dense(3, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

#### Training model

In [None]:
# Training hyperparameters used by the fit call and by the saved-model filename.
epochs_no = 5        # number of passes over the training data
batch_size_no = 16   # samples per gradient update
valid_split = 0.2    # fraction of the training data held out for validation

In [None]:
# Train on the training split with the hyperparameters defined above; the
# returned history object is kept for later inspection.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size_no,
    epochs=epochs_no,
    validation_split=valid_split,
    shuffle=True,
)

#### Saving model 

In [None]:
# The filename records the number of rows in `df`, the epoch count and the
# batch size, so models from different runs do not overwrite each other.
model_filename = "sentiment_analysis_bi_lstm_{}_observations_{}_epochs_{}_batchsize.h5".format(
    df.shape[0], epochs_no, batch_size_no
)
model_path = os.path.join(base_dir, model_filename)
model.save(model_path)

#### Evaluating model 

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

# Predicted class = index of the highest output score for each test sample.
# `Sequential.predict_classes` is not used because it was removed from recent
# TensorFlow releases; taking the argmax of `predict` gives the same class ids.
y_val_pred = np.argmax(model.predict(X_test), axis=-1)

# Convert the one-hot test labels back to integer class ids.
rounded_labels = np.argmax(y_test, axis=1)

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(rounded_labels, y_val_pred)
print('Accuracy: %f' % accuracy)

# The per-class scores below are averaged with weights proportional to each
# class's support (average='weighted').

# precision tp / (tp + fp)
precision = precision_score(rounded_labels, y_val_pred, average='weighted')
print('Precision: %f' % precision)

# recall: tp / (tp + fn)
recall = recall_score(rounded_labels, y_val_pred, average='weighted')
print('Recall: %f' % recall)

# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(rounded_labels, y_val_pred, average='weighted')
print('F1: %f' % f1)