In [1]:
import numpy as np
import pandas as pd
import sys
import os 
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from kerastuner import HyperModel
from kerastuner.tuners import RandomSearch
from sklearn.metrics import classification_report

sys.path.append('../')
from src.utilities.config_ import train_data_path, scrape_data_path, model_path

  from kerastuner import HyperModel


In [2]:
# Set logging level to suppress TensorFlow debug messages
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.get_logger().setLevel('ERROR')

In [3]:
# read csv
train_filename = "finance-dataset.csv"
df = pd.read_csv(os.path.join(train_data_path, train_filename))
df.head()

Unnamed: 0,label,title
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [4]:
# split the data into features (X) and labels (y)
X = df['title']
y = df['label']

# convert labels to numeric format
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
y = y.map(label_mapping)

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# tokenize and pad the sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_length = max(len(seq) for seq in X_train_seq)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

In [6]:
# HyperModel class for Keras Tuner
class SentimentHyperModel(HyperModel):
    def build(self, hp):
        model = Sequential()
        model.add(Embedding(input_dim=5000, output_dim=hp.Int('embedding_output_dim', 64, 256, step=64), input_length=max_length))
        model.add(Bidirectional(LSTM(hp.Int('lstm_units', 32, 128, step=32), return_sequences=True)))
        model.add(Dropout(hp.Float('dropout_rate', 0.3, 0.5, step=0.1)))
        model.add(Bidirectional(LSTM(hp.Int('lstm_units', 32, 128, step=32))))
        model.add(Dropout(hp.Float('dropout_rate', 0.3, 0.5, step=0.1)))
        model.add(Dense(hp.Int('dense_units', 32, 128, step=32), activation='relu'))
        model.add(Dropout(hp.Float('dropout_rate', 0.3, 0.5, step=0.1)))
        model.add(Dense(3, activation='softmax'))

        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp.Float('learning_rate', 1e-4, 1e-2, sampling='LOG')),
                      loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model

# Initialize the tuner
tuner = RandomSearch(
    SentimentHyperModel(),
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=2,
    directory='my_dir',
    project_name='sentiment_analysis'
)

# Perform the hyperparameter search
tuner.search(X_train_pad, y_train, epochs=10, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=3), ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-5)])

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.values)

# Build the model with the best hyperparameters and train it
best_model = tuner.hypermodel.build(best_hps)
history = best_model.fit(X_train_pad, y_train, epochs=30, batch_size=32, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=3), ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-5)])

Reloading Tuner from my_dir/sentiment_analysis/tuner0.json
{'embedding_output_dim': 256, 'lstm_units': 96, 'dropout_rate': 0.5, 'dense_units': 128, 'learning_rate': 0.00222084508546812}




Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30


In [7]:
best_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 70, 256)           1280000   
                                                                 
 bidirectional (Bidirectiona  (None, 70, 192)          271104    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 70, 192)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 192)              221952    
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 192)               0         
                                                                 
 dense (Dense)               (None, 128)               2

In [8]:
# Evaluate the model on the test set
loss, accuracy = best_model.evaluate(X_test_pad, y_test)

# Generate a classification report
y_pred = best_model.predict(X_test_pad)
y_pred_classes = np.argmax(y_pred, axis=1)

print(f'Test Accuracy: {accuracy}')
print(classification_report(y_test, y_pred_classes, target_names=label_mapping.keys()))

Test Accuracy: 0.7556701302528381
              precision    recall  f1-score   support

    negative       0.68      0.58      0.63       110
     neutral       0.79      0.87      0.82       571
    positive       0.71      0.61      0.65       289

    accuracy                           0.76       970
   macro avg       0.72      0.68      0.70       970
weighted avg       0.75      0.76      0.75       970



In [9]:
# save model
tensorflow_file = "tensorflow_model"
best_model.save(os.path.join(model_path, tensorflow_file))
print(f"Model saved to {tensorflow_file}")



Model saved to tensorflow_model


In [11]:
loaded_model = tf.keras.models.load_model(os.path.join(model_path, tensorflow_file))

# Evaluate the model on the test set
loss, accuracy = loaded_model.evaluate(X_test_pad, y_test)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.7556701302528381
