# Imports

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import numpy as np
from numpy.random import seed
seed(1)
import matplotlib.pyplot as plt
import pandas as pd
import string
import os
import shutil
import re

#tensorflow = library specializing in neural networks
import tensorflow as tf

#keras = python interface for neural networks
#runs on top of tensorflow (tensorflow is backend)
#more user friendly

from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Embedding
from tensorflow.keras.layers import TextVectorization
from keras.optimizers import Adam

from sklearn.linear_model import LinearRegression

---

## First cells copied from this tutorial: https://www.tensorflow.org/text/guide/word_embeddings

In [None]:
# Location of the aclImdb_v1 sentiment archive.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Fetch the archive into the current working directory and unpack it there.
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The unpacked 'aclImdb' folder is expected next to the path get_file returned.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
[1m84125825/84125825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 0us/step


['test', 'train', 'imdbEr.txt', 'README', 'imdb.vocab']

In [None]:
# Training split of the unpacked archive; list it to see what it contains.
train_dir = os.path.join(dataset_dir, "train")
os.listdir(train_dir)

['pos',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_unsup.txt',
 'unsup',
 'urls_pos.txt',
 'neg',
 'labeledBow.feat']

In [None]:
# Drop the 'unsup' folder so that only the 'pos' and 'neg' sub-folders remain
# under the training directory when the labelled dataset is built from it.
remove_dir = os.path.join(train_dir, 'unsup')

# Guard the removal so the cell can be re-run: a bare shutil.rmtree raises
# FileNotFoundError once the folder has already been deleted.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

In [None]:
# Number of reviews per batch.
batch_size = 1024

# Seed for the train/validation split.
# NOTE: this rebinds the name `seed`, which the imports cell bound to
# numpy.random.seed; the name is kept so existing references keep working.
seed = 123

# Read from `train_dir` (the directory that was just cleaned of 'unsup')
# instead of repeating its path as a hard-coded string. Both calls use the
# same validation_split and seed so the two subsets come from the same split.
train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size, validation_split=0.2,
    subset='training', seed=seed)
val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size, validation_split=0.2,
    subset='validation', seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [None]:
# Peek at the first five (label, review) pairs of a single training batch.
for text_batch, label_batch in train_ds.take(1):
    reviews = text_batch.numpy()
    for idx in range(5):
        print(label_batch[idx].numpy(), reviews[idx])
        print()

0 b"Oh My God! Please, for the love of all that is holy, Do Not Watch This Movie! It it 82 minutes of my life I will never get back. Sure, I could have stopped watching half way through. But I thought it might get better. It Didn't. Anyone who actually enjoyed this movie is one seriously sick and twisted individual. No wonder us Australians/New Zealanders have a terrible reputation when it comes to making movies. Everything about this movie is horrible, from the acting to the editing. I don't even normally write reviews on here, but in this case I'll make an exception. I only wish someone had of warned me before I hired this catastrophe"

1 b'This movie is SOOOO funny!!! The acting is WONDERFUL, the Ramones are sexy, the jokes are subtle, and the plot is just what every high schooler dreams of doing to his/her school. I absolutely loved the soundtrack as well as the carefully placed cynicism. If you like monty python, You will love this film. This movie is a tad bit "grease"esk (withou

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache each dataset and let tf.data pick the prefetch buffer size.
train_ds, val_ds = (
    ds.cache().prefetch(buffer_size=AUTOTUNE) for ds in (train_ds, val_ds)
)

In [None]:
# Custom standardization used by the text vectorization layer.
def custom_standardization(input_data):
    """Normalise raw review text before it is split into tokens.

    Lower-cases the input, replaces each literal '<br />' with a single
    space, then deletes every character listed in ``string.punctuation``.
    Returns the result of the final ``tf.strings.regex_replace`` call.
    """
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    text = tf.strings.regex_replace(text, punctuation_class, '')
    return text


# Cap on the vocabulary, and the fixed number of token ids kept per review.
vocab_size = 10000
sequence_length = 100

# The vectorization layer standardises text with custom_standardization,
# splits it, and maps each token to an integer id. Reviews differ in length,
# so an explicit output_sequence_length gives every sample the same width.
vectorize_layer = TextVectorization(
    output_mode='int',
    max_tokens=vocab_size,
    output_sequence_length=sequence_length,
    standardize=custom_standardization,
)

# Build the vocabulary from the training text only (labels are dropped).
text_ds = train_ds.map(lambda review, label: review)
vectorize_layer.adapt(text_ds)



In [None]:
# Width of each learned word vector.
embedding_dim = 16

# Stack the layers one at a time:
# raw strings -> token ids -> embeddings -> 8-unit SimpleRNN -> sigmoid unit.
model = Sequential()
model.add(vectorize_layer)
model.add(Embedding(vocab_size, embedding_dim, name="embedding"))
model.add(SimpleRNN(8))
model.add(Dense(1, activation='sigmoid'))



In [None]:
# The model's last layer is Dense(1, activation='sigmoid'), so its output is
# already a probability rather than a logit. The loss must therefore be built
# with from_logits=False; from_logits=True contradicts the sigmoid output and
# is what produces the `_get_logits` warning during training.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [None]:
# Train for 15 epochs, evaluating on the validation dataset after each one.
model.fit(x=train_ds, validation_data=val_ds, epochs=15)



Epoch 1/15


  output, from_logits = _get_logits(


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 176ms/step - accuracy: 0.4969 - loss: 0.6979 - val_accuracy: 0.5242 - val_loss: 0.6913
Epoch 2/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 153ms/step - accuracy: 0.6491 - loss: 0.6681 - val_accuracy: 0.5428 - val_loss: 0.6883
Epoch 3/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 236ms/step - accuracy: 0.6971 - loss: 0.6405 - val_accuracy: 0.5584 - val_loss: 0.6844
Epoch 4/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 157ms/step - accuracy: 0.7510 - loss: 0.5960 - val_accuracy: 0.5700 - val_loss: 0.6822
Epoch 5/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 155ms/step - accuracy: 0.7959 - loss: 0.5448 - val_accuracy: 0.5700 - val_loss: 0.6890
Epoch 6/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 240ms/step - accuracy: 0.8318 - loss: 0.4981 - val_accuracy: 0.5802 - val_loss: 0.6932
Epoch 7/15
[1m20/20[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x7c1ade7998a0>

In [None]:
# from tensorflow.keras.models import load_model

# # Save the trained model

# Write the trained model to disk, then report the path that was used.
model_save_path = 'model.h5'
model.save(filepath=model_save_path)
print(f"Model saved at {model_save_path}")

# Show the model summary.
model.summary()




Model saved at model.h5


In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Interactive sentiment loop.
#
# This cell uses the `model` trained earlier in the notebook instead of
# calling load_model on the saved .h5 file: that load fails with
# "Unknown value for `standardize` argument of TextVectorization ...
# Received: standardize=custom_standardization".
# NOTE(review): reloading from disk presumably needs the model saved in the
# native `.keras` format with custom_standardization registered as a Keras
# serializable; confirm against the installed Keras version.
#
# The model's first layer is the TextVectorization layer, so it takes raw
# strings directly. No Tokenizer / pad_sequences step is used: a Tokenizer
# that was never fitted has no vocabulary and would not match the ids the
# model was trained on.

print("Welcome to the sentiment analysis tool! Type 'stop' to exit.")

while True:
    curr_input = input("Enter your text: ")

    # Ignore surrounding whitespace when checking for the exit command.
    if curr_input.strip().lower() == 'stop':
        print("Exiting the sentiment analysis tool. Goodbye!")
        break

    # Ensure input is not empty
    if not curr_input.strip():
        print("Please enter some text.")
        continue

    # One-element batch of raw text; vectorization happens inside the model.
    text_batch = tf.constant([curr_input])

    # The sigmoid output is the predicted probability of the positive class.
    pred = float(model.predict(text_batch, verbose=0)[0][0])

    # Report the confidence of the class actually predicted: for a negative
    # verdict that is 1 - pred, not pred.
    is_positive = pred >= 0.5
    confidence = pred if is_positive else 1.0 - pred

    print("Sentiment prediction:", "Positive" if is_positive else "Negative")
    print("Confidence:", "{:.2f}%".format(confidence * 100))
    print()


ValueError: Unknown value for `standardize` argument of TextVectorization. Allowed values are: ('lower_and_strip_punctuation', 'lower', 'strip_punctuation'). Received: standardize=custom_standardization