## We import the required bookstores

In [43]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

## We visualize the dataset

In [44]:
df_train = pd.read_csv("sent_train.csv")
df_valid = pd.read_csv("sent_valid.csv")

## Prepoccesing

In [45]:
sentiments = {"LABEL_0": "Bearish", "LABEL_1": "Bullish", "LABEL_2": "Neutral"}

In [None]:
def clean_text(text):
    text = text.lower()  
    text = re.sub(r'http\S+', '', text)  #delete urls
    text = re.sub(r'[^a-zA-Z\s]', '', text) #delete spacial characters
    return text.strip()

In [47]:
df_train['text'] = df_train['text'].astype(str).apply(clean_text)
df_valid['text'] = df_valid['text'].astype(str).apply(clean_text)

## Training

In [48]:
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df_train['text'])

X_train = tokenizer.texts_to_sequences(df_train['text'])
X_valid = tokenizer.texts_to_sequences(df_valid['text'])

## Padding

In [49]:
max_length = 50
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_valid = pad_sequences(X_valid, maxlen=max_length, padding='post')

## Label codifications

In [50]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['label'])
y_valid = label_encoder.transform(df_valid['label'])

## Labels to one-hot

In [51]:
y_train = to_categorical(y_train, num_classes=3)
y_valid = to_categorical(y_valid, num_classes=3)

## LSTM building

In [52]:
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_length),
    SpatialDropout1D(0.1),
    LSTM(200, return_sequences=True, dropout=0.1),  # LSTM aditional layer
    LSTM(100, dropout=0.1),
    Dense(3, activation='softmax')
])



## Compilation

In [53]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

## Train model

In [54]:
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_valid, y_valid))

Epoch 1/10


[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 65ms/step - accuracy: 0.6248 - loss: 0.9351 - val_accuracy: 0.6558 - val_loss: 0.8788
Epoch 2/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 66ms/step - accuracy: 0.6473 - loss: 0.8923 - val_accuracy: 0.6558 - val_loss: 0.8787
Epoch 3/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 64ms/step - accuracy: 0.6385 - loss: 0.9034 - val_accuracy: 0.6558 - val_loss: 0.8799
Epoch 4/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 69ms/step - accuracy: 0.6417 - loss: 0.8997 - val_accuracy: 0.6558 - val_loss: 0.8790
Epoch 5/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 68ms/step - accuracy: 0.6521 - loss: 0.8866 - val_accuracy: 0.6558 - val_loss: 0.8796
Epoch 6/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 70ms/step - accuracy: 0.6409 - loss: 0.8993 - val_accuracy: 0.6558 - val_loss: 0.8817
Epoch 7/10
[1m150/150[0m 

## Evaluate

In [55]:
y_pred = model.predict(X_valid)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_valid, axis=1)

print(classification_report(y_true_classes, y_pred_classes, target_names=sentiments.values()))

[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step
              precision    recall  f1-score   support

     Bearish       0.00      0.00      0.00       347
     Bullish       0.39      0.28      0.32       475
     Neutral       0.71      0.93      0.81      1566

    accuracy                           0.67      2388
   macro avg       0.37      0.40      0.38      2388
weighted avg       0.55      0.67      0.59      2388



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


For better metrics I will change the parameters increasing the LSTM units adding more layes or adjusting the dropout for excessive looss