In [None]:
# install required libraries
! pip install tensorflow scikit-learn pandas numpy pickle5



In [None]:
# import necessary libraries
from tensorflow import keras
from google.colab import files, drive  # Importing libraries for file handling and drive mounting
import os  # importing the os module for operating system related functionalities
import numpy as np  # importing numpy for numerical computations
import pandas as pd  # importing pandas for data manipulation and analysis
import tensorflow as tf  # importing tensorflow for deep learning
from sklearn.model_selection import KFold # importing kfold for cross-validation
from sklearn.metrics import accuracy_score # importing accuracy score for cross-validation
from tensorflow.keras.preprocessing.text import Tokenizer  # importing Tokenizer for text preprocessing
from tensorflow.keras.preprocessing.sequence import pad_sequences  # importing pad_sequences for sequence padding
from tensorflow.keras.models import Sequential  # importing Sequential model for building deep learning models
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization  # importing different layers for building the model
from tensorflow.keras.utils import to_categorical  # importing to_categorical for one-hot encoding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau  # importing callbacks for model training
from tensorflow.keras.regularizers import l2  # importing l2 regularization for model regularization
from sklearn.model_selection import train_test_split  # importing train_test_split for splitting the data
from sklearn.metrics import classification_report, confusion_matrix  # importing metrics for model evaluation
import pickle  # importing pickle for object serialization

In [None]:
# make Google Drive visible under /content/drive
drive.mount('/content/drive')

# single root folder for the dataset and every saved artefact
base_dir = '/content/drive/My Drive/Colab Notebooks/MyModel/'

# make sure the folder is there before anything is read from / written to it
os.makedirs(base_dir, exist_ok=True)

# artefact locations, all derived from base_dir
model_path = os.path.join(base_dir, 'twitter_sa_model.keras')
tokenizer_path = os.path.join(base_dir, 'tokenizer.pickle')

# dataset location (same folder as the artefacts)
file_path = os.path.join(base_dir, 'cleaned_dataset.h5')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# load the stored frame (HDF5 key 'df') from Drive
df = pd.read_hdf(file_path, key='df')

# model input text and its label column
features, target_labels = df['cleaned_text'], df['target']

# tokenizer capped at 20000 words; anything outside the vocabulary maps to '<OOV>'
# NOTE(review): the vocabulary is fitted on the full dataset, i.e. before the
# train/test split performed in a later cell — confirm this is intended
tokenizer = Tokenizer(oov_token='<OOV>', num_words=20000)
tokenizer.fit_on_texts(features)

In [None]:
# preprocessing function
def preprocess_data(features, tokenizer, max_len=100):
    """Turn raw texts into a fixed-width matrix of token ids.

    Args:
        features: iterable of texts, passed to ``tokenizer.texts_to_sequences``.
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        max_len: width every sequence is padded/truncated to (default 100).

    Returns:
        The result of ``pad_sequences`` for the tokenised texts; sequences
        longer than ``max_len`` are cut at the end (``truncating='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(features),
        maxlen=max_len,
        truncating='post',
    )

In [None]:
# token-id matrix for the whole corpus (default width from preprocess_data)
padded_sequences = preprocess_data(features, tokenizer)

# two-column one-hot representation of the labels
labels = to_categorical(target_labels, num_classes=2)

# hold out 20% of the rows as the test set; fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# setting the dimensionality of the word embeddings
# (defined before parsing because the parser needs it; must match the vector file)
embedding_dim = 200

# setting the maximum number of words in the vocabulary
max_words = 20000

# initializing an empty dictionary for word embeddings: word -> float32 vector
embedding_index = {}

# opening the file containing word embeddings
with open('/content/drive/MyDrive/SeniorPortfolio/glove.twitter.27B.200d.txt', 'r', encoding='utf-8') as f:
    # each line is expected to be: <token> <v1> <v2> ... <v_embedding_dim>
    for line in f:
        # Split from the right so exactly `embedding_dim` numeric fields are
        # peeled off. A bare `line.split()` splits on *any* whitespace, so a
        # token that itself contains a whitespace-like character would shift
        # the fields and store a wrong key with a wrong-length vector.
        parts = line.rstrip().rsplit(' ', embedding_dim)
        # skip anything that is not "<token> + embedding_dim values"
        # (e.g. a header or a truncated line) instead of storing a bad vector
        if len(parts) != embedding_dim + 1:
            continue
        # extracting the word
        word = parts[0]
        # converting the values into a numpy array
        coefs = np.asarray(parts[1:], dtype='float32')
        # storing the word embedding in the dictionary
        embedding_index[word] = coefs

# initializing the embedding matrix with zeros; rows without a pre-trained
# vector stay all-zero
embedding_matrix = np.zeros((max_words, embedding_dim))

In [None]:
# copy each known pre-trained vector into the row matching the tokenizer's index
for word, i in tokenizer.word_index.items():
    # rows only exist for indices below max_words
    if i >= max_words:
        continue
    embedding_vector = embedding_index.get(word)
    # words without a pre-trained vector keep their all-zero row
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# building the sequential model one layer at a time
model = Sequential()
# embedding layer initialised from embedding_matrix; its weights remain trainable
model.add(Embedding(max_words, embedding_dim, input_length=100, weights=[embedding_matrix], trainable=True))
# first 1D convolution: 64 filters, kernel size 5, ReLU, 'same' padding
model.add(Conv1D(64, 5, activation='relu', padding='same'))
# batch normalization after the first convolution
model.add(BatchNormalization())
# second 1D convolution: again 64 filters, kernel size 5, ReLU, 'same' padding
model.add(Conv1D(64, 5, activation='relu', padding='same'))
# batch normalization after the second convolution
model.add(BatchNormalization())
# global max pooling over the sequence axis
model.add(GlobalMaxPooling1D())
# dense layer: 64 units, ReLU, L2 kernel regularization (0.01)
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
# dropout with rate 0.6
model.add(Dropout(0.6))
# output layer: 2 units with softmax activation
model.add(Dense(2, activation='softmax'))

In [None]:
# training configuration: categorical cross-entropy loss, Adam optimizer,
# accuracy reported as the metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# layer-by-layer overview of the compiled model
model.summary()

In [None]:
# stop once val_loss has not improved for 2 epochs and roll back to the best weights
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

# keep only the best model (judged by val_accuracy) on Drive
model_checkpoint = ModelCheckpoint(
    filepath=os.path.join(base_dir, 'best_sentiment_analysis_model.keras'),
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)

# multiply the learning rate by 0.2 after 2 epochs without val_loss improvement,
# never going below 1e-5
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=2,
    min_lr=1e-5,
)

In [None]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, BatchNormalization, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

# *** NOTE ***
# this current build focuses on hyperparameter tuning for the
# final model's training

# define search space
space = {
    'dropout_rate': hp.uniform('dropout_rate', 0.1, 0.7),
    'l2_regularization': hp.uniform('l2_regularization', 0.0001, 0.01),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.0001), np.log(0.01)),
}

# Carve the tuning data out of the TRAINING partition only.
# Re-splitting (padded_sequences, labels) with test_size=0.2 / random_state=42
# reproduces the earlier train/test split exactly, which would make the
# "validation" set identical to the held-out test set and leak test data into
# the hyperparameter choice.
# The split does not depend on the sampled hyperparameters, so it is done once
# here rather than inside every trial.
x_tune_train, x_tune_val, y_tune_train, y_tune_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

# creating a smaller subset for hyperparameter tuning
subset_size = int(0.2 * len(x_tune_train))  # Using 20% of the tuning-training data
x_train_subset = x_tune_train[:subset_size]
y_train_subset = y_tune_train[:subset_size]


def objective(space):
    """Train one candidate model and report its best validation loss.

    Args:
        space: dict of sampled hyperparameters with the keys
            'l2_regularization', 'dropout_rate' and 'learning_rate'.

    Returns:
        dict with 'loss' (minimum validation loss over the epochs that ran)
        and 'status' (STATUS_OK), the format consumed by hyperopt's fmin.
    """
    print(f"Evaluating with hyperparameters: {space}")
    # same architecture as the top-level model; only the L2 strength, the
    # dropout rate and the learning rate vary between trials
    model = Sequential([
        Embedding(max_words, embedding_dim, input_length=100, weights=[embedding_matrix], trainable=True),
        Conv1D(64, 5, activation='relu', padding='same'),
        BatchNormalization(),
        Conv1D(64, 5, activation='relu', padding='same'),
        BatchNormalization(),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu', kernel_regularizer=l2(space['l2_regularization'])),
        Dropout(space['dropout_rate']),
        Dense(2, activation='softmax')
    ])

    # compile model
    model.compile(optimizer=Adam(learning_rate=space['learning_rate']),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # fit the model on the tuning subset, validating on data the test set never touches
    history = model.fit(x_train_subset, y_train_subset, validation_data=(x_tune_val, y_tune_val), epochs=10, batch_size=32, verbose=2, callbacks=[early_stopping, reduce_lr])

    # calculate the minimum validation loss
    min_val_loss = min(history.history['val_loss'])

    # print max accuracy
    max_acc = max(history.history['val_accuracy'])
    print("Max accuracy: ", max_acc)

    return {'loss': min_val_loss, 'status': STATUS_OK}

# trials object to store each experiment's results
trials = Trials()

# run the optimization
best_hyperparams = fmin(fn=objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=20,
                        trials=trials)

print("Best Hyperparameters:", best_hyperparams)


In [None]:
# NOTE(review): no `model.fit(...)` call on this top-level `model` is visible in
# the notebook before this cell (the tuning cell only trains models local to
# `objective`), so these numbers may describe an untrained network — confirm
# the final training step has run before trusting or saving the result.

# loss and accuracy on the held-out test split
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Test Accuracy: {accuracy}')

# predicted class = column with the highest predicted probability
y_pred = model.predict(x_test).argmax(axis=1)

# true class = position of the 1 in each one-hot row
y_true = y_test.argmax(axis=1)

# per-class precision/recall/F1, followed by the confusion matrix
print(classification_report(y_true, y_pred, target_names=['Class1', 'Class2']))
print(confusion_matrix(y_true, y_pred))

In [None]:
# persist the model to model_path
model.save(model_path)

# persist the fitted tokenizer next to it, using the newest pickle protocol
with open(tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# restore the model from model_path
model = keras.models.load_model(model_path)

# restore the tokenizer written by the save cell
# (unpickling can execute arbitrary code — only load pickle files you created yourself)
with open(tokenizer_path, 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [None]:
# defining a function to predict sentiment
def predict_sentiment(text):
    """Classify a single text with the module-level ``model`` and ``tokenizer``.

    Args:
        text: the raw text to classify.

    Returns:
        A string "<probability> <label>", where label is 'Positive' when the
        highest-scoring output index is 1 and 'Negative' otherwise, and the
        probability (two decimals) is the model output at that index.
    """
    # Reuse the training-time preprocessing so tokenisation, padded length and
    # truncation side are identical to what the model was fitted on. Calling
    # pad_sequences directly here omitted truncating='post', so texts longer
    # than the padded length kept a different part of the text than in training.
    text_sequence = preprocess_data([text], tokenizer)
    # predicting the sentiment (first and only row of the batch)
    predicted_rating = model.predict(text_sequence)[0]
    # getting the index of the predicted sentiment
    predicted_index = np.argmax(predicted_rating)
    # mapping the index to sentiment label
    sentiment = 'Positive' if predicted_index == 1 else 'Negative'
    # getting the probability of the predicted sentiment
    probability = predicted_rating[predicted_index]
    # returning the probability and sentiment
    return f"{probability:.2f} {sentiment}"

In [None]:
# defining a softmax function with temperature
def softmax(x, temperature=1.0):
    """Temperature-scaled softmax over the last axis.

    Args:
        x: scores as a 1-D sequence/array, or an array whose last axis holds
            the scores of one example (e.g. shape ``(batch, classes)``).
        temperature: positive scaling factor; values above 1 flatten the
            distribution, values below 1 sharpen it.

    Returns:
        Array of the same shape as ``x`` whose entries along the last axis are
        non-negative and sum to 1.

    Raises:
        ValueError: if ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    # accept plain lists as well as arrays
    scaled = np.asarray(x) / temperature
    # subtract the per-row maximum for numerical stability; keepdims makes the
    # subtraction and the normalisation broadcast correctly for batched input
    e_x = np.exp(scaled - np.max(scaled, axis=-1, keepdims=True))
    # returning the softmax probabilities
    return e_x / e_x.sum(axis=-1, keepdims=True)

In [None]:
# defining a function to predict sentiment with temperature adjustment
def predict_sentiment_with_temperature(text, temperature=1.0):
    """Classify a single text and rescale the confidence with a temperature.

    Args:
        text: the raw text to classify.
        temperature: softmax temperature; 1.0 leaves the model's probabilities
            unchanged, larger values flatten them, smaller values sharpen them.

    Returns:
        A string "<probability> <label>", where label is 'Positive' when the
        highest-scoring index is 1 and 'Negative' otherwise, and the
        probability (two decimals) is the temperature-adjusted value at that
        index.
    """
    # Reuse the training-time preprocessing (same padded length and
    # truncating='post') instead of a separate pad_sequences call that
    # truncated long texts differently from training.
    text_sequence = preprocess_data([text], tokenizer)
    # predicting the sentiment (first and only row of the batch)
    predicted_rating = model.predict(text_sequence)[0]
    # The network defined in this notebook ends in a softmax layer, so
    # `predicted_rating` already holds probabilities. Applying softmax to them
    # directly squashes the result even at temperature=1.0. Temperature
    # scaling is defined on logits; log-probabilities equal the logits up to
    # an additive constant, which softmax ignores. The clip guards log(0).
    log_probabilities = np.log(np.clip(predicted_rating, 1e-12, None))
    # applying softmax with temperature adjustment
    predicted_rating = softmax(log_probabilities, temperature=temperature)
    # getting the index of the predicted sentiment
    predicted_index = np.argmax(predicted_rating)
    # mapping the index to sentiment label
    sentiment = 'Positive' if predicted_index == 1 else 'Negative'
    # getting the probability of the predicted sentiment
    probability = predicted_rating[predicted_index]
    # returning the probability and sentiment
    return f"{probability:.2f} {sentiment}"

In [None]:
# sample text to classify
text_input = "hahahahaha"

# run the plain predictor first, then the temperature-adjusted one
predicted_sentiment = predict_sentiment(text_input)
predicted_sentiment_with_temp = predict_sentiment_with_temperature(text_input)

# show both results, plain prediction first
for result in (predicted_sentiment, predicted_sentiment_with_temp):
    print(result)