Class September 19, ENG 620


In [None]:
import numpy as np
import spacy
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from multiprocessing import Pool, cpu_count

In [None]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
# (TensorFlow uses an available GPU automatically; this only changes allocation.)
import tensorflow as tf

# An empty list simply means no GPU is visible and the notebook runs on the CPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    try:
        # The setting has to be applied to every visible GPU, not just the first one.
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Memory growth can only be changed before the GPUs have been initialized,
        # e.g. this happens when the cell is re-run in an already running kernel.
        print(f"Could not enable memory growth for {gpu.name}: {err}")

In [None]:
# Load SpaCy English model.
# The preprocessing in this notebook only reads lemmas, stop-word flags and
# punctuation flags, so the dependency parser and the named-entity recognizer
# are switched off to make processing faster.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Load IMDB dataset
max_features = 10000  # Vocabulary size: only the most frequent words are kept
max_length = 100  # Length every review is padded/cut to before it reaches a model
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Show one raw sample of the IMDB dataset: an integer-encoded review and its label
first_review, first_label = x_train[0], y_train[0]
print(f"x_train: {first_review}")
print(f"y_train: {first_label}")

x_train: [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
y_train: 1


In [None]:
# Build a lookup table from integer index back to the word it stands for,
# so that an encoded review can be read as text
word_index = imdb.get_word_index()
index_to_word = dict((idx, token) for token, idx in word_index.items())

def decode_review(encoded_review):
  """Turn an integer-encoded review back into a space-separated string.

  Every index is shifted down by 3 before it is looked up in `index_to_word`.
  Indices without an entry become empty strings, so they still leave a
  separating space in the result.
  """
  words = []
  for token_id in encoded_review:
    words.append(index_to_word.get(token_id - 3, ''))
  return " ".join(words)

# Example usage: decode the first training review and display it
first_encoded_review = x_train[0]
decoded_review = decode_review(first_encoded_review)
print(decoded_review)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
 this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert  is an amazing actor and now the same being director  father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for  and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also  to the two little boy's that played the  of norman and paul they were just brilliant children are often left out of the  list i think

In [None]:
# Reverse the IMDB word index so integer ids can be mapped back to words
word_index = imdb.get_word_index()
index_to_word = dict(zip(word_index.values(), word_index.keys()))

# Rebuild the review text from its indices so that SpaCy can process it
def decode_review(encoded_review):
    """Return the words of an integer-encoded review joined by single spaces.

    Indices below 3 are skipped. Every remaining index is shifted down by 3
    and looked up in `index_to_word`; '?' stands in for indices that have no
    entry.
    """
    words = []
    for token_id in encoded_review:
        if token_id < 3:
            continue
        words.append(index_to_word.get(token_id - 3, '?'))
    return ' '.join(words)

# Lemmatize one review with SpaCy, discarding stop words and punctuation
def preprocess_spacy(text):
    """Return the lemmas of `text` joined by single spaces.

    The text is run through the module-level `nlp` pipeline; tokens flagged
    as stop words or as punctuation are left out.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)

# Decode and clean a single integer-encoded review in one call
def preprocess_review(encoded_review):
    """Decode an integer-encoded review and return its SpaCy-preprocessed text."""
    return preprocess_spacy(decode_review(encoded_review))

# Preprocess a whole dataset in parallel using SpaCy's own multiprocessing
def preprocess_dataset_parallel(dataset, num_workers=cpu_count()):
    """Decode and SpaCy-preprocess every review in `dataset`.

    Args:
        dataset: Iterable of integer-encoded reviews.
        num_workers: Number of worker processes. Defaults to the CPU count;
            `None` is treated the same way.

    Returns:
        A list with one preprocessed string per review, in input order.
    """
    if num_workers is None:
        num_workers = cpu_count()

    texts = [decode_review(encoded_review) for encoded_review in dataset]
    if not texts:
        return []

    # `nlp.pipe` batches the texts and starts its own worker processes. Unlike a
    # `multiprocessing.Pool`, it does not need functions defined in the notebook
    # to be importable by the workers, which fails with the "spawn" start method.
    docs = nlp.pipe(texts, n_process=num_workers)

    # Same token filter as `preprocess_spacy`: keep lemmas of tokens that are
    # neither stop words nor punctuation.
    return [
        ' '.join(token.lemma_ for token in doc if not token.is_stop and not token.is_punct)
        for doc in docs
    ]

# Run the preprocessing on a slice of each split only, to keep the demonstration fast
train_subset = x_train[:10000]
test_subset = x_test[:1000]
preprocessed_train = preprocess_dataset_parallel(train_subset)
preprocessed_test = preprocess_dataset_parallel(test_subset)

In [None]:
# Show the first preprocessed training review
first_preprocessed = preprocessed_train[0]
print(f"preprocessed_train: {first_preprocessed}")

preprocessed_train: film brilliant cast location scenery story direction suit play imagine robert amazing actor director father come scottish island love fact real connection film witty remark film great brilliant buy film soon release recommend watch fly fishing amazing cry end sad know cry film good definitely little boy play norman paul brilliant child leave list think star play grow big profile film child amazing praise think story lovely true life share


In [None]:
# Turn the preprocessed texts into fixed-length integer sequences with the Keras Tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer

# The vocabulary is fitted on the training texts only; words it does not know
# are mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=max_features, oov_token='<OOV>')
tokenizer.fit_on_texts(preprocessed_train)

# Training split: encode, then pad with zeros at the end up to max_length
sequences_train = tokenizer.texts_to_sequences(preprocessed_train)
padded_train = pad_sequences(sequences_train, maxlen=max_length, padding='post')

# Test split: same encoding and padding
sequences_test = tokenizer.texts_to_sequences(preprocessed_test)
padded_test = pad_sequences(sequences_test, maxlen=max_length, padding='post')

# Slices of the raw data and labels that line up with the preprocessed subsets
x_train_subset, y_train_subset = x_train[:10000], y_train[:10000]
x_test_subset, y_test_subset = x_test[:1000], y_test[:1000]

In [None]:
# Check that the padded input and its label line up for the first training sample
first_padded, first_subset_label = padded_train[0], y_train_subset[0]
print(f"padded_train: {first_padded}")
print(f"y_train_subset: {first_subset_label}")

padded_train: [   4  345   60  725  999   10  299  859   22  492  424  322   33   43
  145   23 3212  825   21   71   44 1277    4 1509 3057    4   15  345
  284    4  344  180  151    9  662 3749  322  607   26  419   17  607
    4    6  257   35  165   22 2430  535  345  113   63  589   13   53
   22  377   61 5420    4  113  322 1510   13   10 1167  130   29  744
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
y_train_subset: 1


In [None]:
### Step 3: Define and Train the Model
# LSTM is imported here and used by the second model further down.
from tensorflow.keras.layers import LSTM, Dropout
from tensorflow.keras.optimizers import Adam

# Model 1: word embeddings averaged over the sequence, followed by a small
# dense classifier with a sigmoid output for the binary sentiment label.
# `input_length` is not passed to Embedding: it is deprecated in Keras 3 and
# the sequence length is taken from the data on the first call.
model = Sequential([
    Embedding(input_dim=max_features, output_dim=64),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model 1; 20% of the training subset is held out for validation
model.fit(padded_train, y_train_subset, epochs=5, batch_size=32, validation_split=0.2)



Epoch 1/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.6489 - loss: 0.6580 - val_accuracy: 0.8550 - val_loss: 0.4142
Epoch 2/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8817 - loss: 0.3437 - val_accuracy: 0.8595 - val_loss: 0.3337
Epoch 3/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9158 - loss: 0.2330 - val_accuracy: 0.8605 - val_loss: 0.3345
Epoch 4/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9468 - loss: 0.1730 - val_accuracy: 0.8570 - val_loss: 0.3535
Epoch 5/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9619 - loss: 0.1299 - val_accuracy: 0.8580 - val_loss: 0.3768


<keras.src.callbacks.history.History at 0x7e8549169000>

In [None]:
# `summary()` prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" to the output.
model.summary()

None


# About LSTM

##### For an LSTM, `max_length` needs to be small. If the sequences contain too much padding (too many zeros), the LSTM wastes resources on meaningless data, which results in lower accuracy.

In [None]:
# Make another neural network model using embedding and lstm
model2 = Sequential([
    # The inputs are padded with zeros at the end. Without a mask the LSTM keeps
    # stepping through those zeros after the real words, which pushes short
    # reviews towards the same output score. `mask_zero=True` marks index 0 as
    # padding so the LSTM skips those timesteps.
    # `input_length` is omitted because it is deprecated in Keras 3.
    # NOTE(review): a review that is completely empty after preprocessing would
    # be fully masked - confirm that the data contains none.
    Embedding(input_dim=max_features, output_dim=100, mask_zero=True),
    LSTM(64),
    Dense(1, activation='sigmoid')
])
model2.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model 2; 20% of the training subset is held out for validation
model2.fit(padded_train, y_train_subset, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.5603 - loss: 0.6805 - val_accuracy: 0.7765 - val_loss: 0.5300
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8121 - loss: 0.4781 - val_accuracy: 0.7705 - val_loss: 0.5475
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8199 - loss: 0.4795 - val_accuracy: 0.7980 - val_loss: 0.5177
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.7042 - loss: 0.5549 - val_accuracy: 0.7940 - val_loss: 0.4928
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8634 - loss: 0.3721 - val_accuracy: 0.8230 - val_loss: 0.4102
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8702 - loss: 0.3192 - val_accuracy: 0.8405 - val_loss: 0.3937
Epoch 7/10
[1m250/250[0m 

<keras.src.callbacks.history.History at 0x7e8548e82290>

In [None]:
# `summary()` prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" to the output.
model2.summary()

None


In [None]:
# Score model 1 (embedding + dense) on the test subset
evaluation = model.evaluate(padded_test, y_test_subset)
loss, accuracy = evaluation
print(f'Loss: {loss}, Accuracy: {accuracy}')

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.8566 - loss: 0.3517
Loss: 0.3799300491809845, Accuracy: 0.847000002861023


In [None]:
# Score model 2 (embedding + LSTM) on the same test subset
evaluation_lstm = model2.evaluate(padded_test, y_test_subset)
loss, accuracy = evaluation_lstm
print(f'Loss: {loss}, Accuracy: {accuracy}')

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8026 - loss: 0.4280
Loss: 0.4407954216003418, Accuracy: 0.7990000247955322


In [None]:
# Hand-written example reviews used for a qualitative check of the trained models.
# The sentences alternate positive, negative, positive, ... in the same order as
# the entries of `labels` defined right after this list.
example_sentences = [
    "The movie was an absolute masterpiece with breathtaking visuals and a storyline that kept me utterly engaged from start to finish.",
    "I was disappointed by this film; the script felt weak and the characters were underdeveloped, leading to a lackluster viewing experience.",
    "An emotional rollercoaster that masterfully captures the essence of human relationships. Highly recommended for a heartwarming experience.",
    "Despite the high expectations, the movie failed to deliver a coherent plot or engaging characters, making it difficult to sit through.",
    "A thoroughly enjoyable film that combined humor and drama in just the right amounts, leaving me both laughing and crying by the end.",
    "The film’s lack of originality and clichéd plot points made it predictable and uninteresting. I wouldn’t recommend it.",
    "Outstanding performances by the cast brought life to a complex and gripping story that had me on the edge of my seat.",
    "The movie was a huge letdown; the storyline was convoluted and the special effects seemed overdone and distracting.",
    "A stunning display of visual artistry, this film captivated me with its creativity and originality.",
    "The pacing was excruciatingly slow, and despite a promising premise, the film struggled to maintain my interest.",
    "The soundtrack was amazing, enhancing each scene beautifully and adding depth to the film's emotional impact.",
    "Unfortunately, the plot was filled with holes, leaving too many unanswered questions and an unsatisfying ending.",
    "This film was incredible; the director’s vision was clear and executed perfectly, resulting in an unforgettable experience.",
    "The dialogue was stilted, making it hard to connect with the characters and their journeys throughout the movie.",
    "A brilliant piece of storytelling that weaves a rich narrative tapestry with complex characters and unexpected turns.",
    "I found the movie overly long and drawn out, lacking the excitement or tension needed to keep my attention.",
    "An inspirational film that delivered a powerful message without being preachy, making for a truly uplifting experience.",
    "The acting was so poor that it was almost comical at times, detracting significantly from what could have been a decent film.",
    "A thought-provoking movie that tackles complex themes with nuance and insight, leaving a lasting impression.",
    "The overuse of CGI was off-putting, and the reliance on visual spectacle couldn't mask the movie's other shortcomings.",
    "A heartwarming story with a relatable protagonist, this film beautifully conveyed the power of determination and friendship.",
    "With unconvincing performances and a plot full of inconsistencies, this film failed to engage or entertain.",
    "From the first scene to the last, the film was a delightful combination of wit, charm, and emotional depth.",
    "I struggled to stay awake; the monotonous pace and lack of dynamic storytelling made it a tedious watch.",
    "A truly unique film that broke new ground with its innovative approach and unconventional narrative style.",
    "The narrative was so incoherent that it left me more confused than entertained, with a climax that failed to deliver.",
    "An endearing cast and a heart-felt story made this film a true joy to watch, leaving me with a smile on my face.",
    "Despite its big-name stars, the film fell flat due to a lack of chemistry and unoriginal writing.",
    "An intense thriller that kept me guessing with its intricate plot twists and suspenseful build-up.",
    "I found the movie to be pretentious and overbearing, trying too hard to deliver a message that was ultimately unclear.",
    "A captivating film that managed to successfully blend historical events with rich personal stories in a compelling way.",
    "The film’s attempts at humor felt forced and unnatural, leaving me cringing rather than laughing.",
    "An extraordinary journey depicted with stunning artistry, both visually and narratively, highly recommend watching.",
    "Poorly edited and with a fragmented storyline, the film left me feeling frustrated and unsatisfied.",
    "A spellbinding tale that transported me to another world, with seamless integration of stunning visuals and sound.",
    "The acting was over-the-top and the character development was shallow, making it hard to root for anyone.",
    "An insightful documentary that shed light on a critical issue, offering a balanced and thought-provoking perspective.",
    "For a movie with so much potential, it was shocking how little it achieved, ultimately a forgettable experience.",
    "A heart-breaking narrative that portrayed real human struggles with honesty and empathy, leaving a deep impact.",
    "The movie was almost unbearable due to its lackluster script and uninspired directing choices.",
    "An engaging plot with well-developed characters that drew me in and kept me fascinated until the very end.",
    "I was unimpressed by the movie's predictability and failure to innovate beyond standard genre conventions.",
    "A powerful performance by the lead actor, making a compelling and hauntingly beautiful cinematic experience.",
    "The dialogue was poorly written, and the movie’s over-reliance on tired tropes made it difficult to watch.",
    "An exhilarating action-packed adventure that was thrilling from the outset and didn’t disappoint.",
    "Despite lavish production design, the movie lacked substance and relied too heavily on visual spectacle.",
    "A deeply moving story of hope and resilience, beautifully captured with stunning visuals and poignant music.",
    "The film’s glaring plot holes and lack of character depth contributed to its overall mediocrity.",
    "A cinematic triumph, this film captured my heart with its storytelling excellence and emotional resonance.",
    "The narrative was confusing, and the character motivations were unclear, leading to an unsatisfying viewing experience."
]

# Ground-truth sentiment for the example sentences (1 = positive, 0 = negative).
# The sentences alternate positive/negative, so the 50 labels are 25 repeats of [1, 0].
labels = [1, 0] * 25

# Sanity check: there has to be exactly one label per sentence
print(f"Total sentences: {len(example_sentences)}")
print(f"Total labels: {len(labels)}")

# Push the example sentences through the same steps as the training data:
# SpaCy preprocessing, integer encoding with the fitted tokenizer, zero padding
preprocessed_example_sentences = list(map(preprocess_spacy, example_sentences))
encoded_example_sentences = tokenizer.texts_to_sequences(preprocessed_example_sentences)
padded_example_sentences = pad_sequences(
    encoded_example_sentences,
    maxlen=max_length,
    padding='post',
)

Total sentences: 50
Total labels: 50


In [None]:
# Inspect what the example sentences look like after tokenization

# Map every integer index of the fitted tokenizer back to its word
reverse_word_index = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))

# Turn a padded integer sequence back into readable text
def decode_sequence(encoded_sequence):
    """Return the words of `encoded_sequence` joined by single spaces.

    Zeros are skipped; every other index is looked up in
    `reverse_word_index`, with '?' standing in for indices that have no entry.
    """
    words = []
    for token_id in encoded_sequence:
        if token_id == 0:
            continue
        words.append(reverse_word_index.get(token_id, '?'))
    return ' '.join(words)

# Decode every padded example sentence back into text
decoded_sentences = list(map(decode_sequence, padded_example_sentences))

# Print each preprocessed sentence next to its decoded form; a difference
# shows which words the tokenizer did not know
sentence_pairs = zip(preprocessed_example_sentences, decoded_sentences)
for original, decoded in sentence_pairs:
    print(f"Original: {original}")
    print(f"Decoded: {decoded}\n")


Original: movie absolute masterpiece breathtaking visual storyline keep utterly engage start finish
Decoded: movie absolute masterpiece breathtaking visual storyline keep utterly engage start finish

Original: disappoint film script feel weak character underdeveloped lead lackluster viewing experience
Decoded: disappoint film script feel weak character underdeveloped lead lackluster viewing experience

Original: emotional rollercoaster masterfully capture essence human relationship highly recommend heartwarming experience
Decoded: emotional <OOV> masterfully capture essence human relationship highly recommend heartwarming experience

Original: despite high expectation movie fail deliver coherent plot engage character make difficult sit
Decoded: despite high expectation movie fail deliver coherent plot engage character make difficult sit

Original: thoroughly enjoyable film combine humor drama right amount leave laugh cry end
Decoded: thoroughly enjoyable film combine humor drama right 

In [None]:
# Look at one padded example sentence (index 30)
sample_padded_example = padded_example_sentences[30]
print(f"padded_example_sentences: {sample_padded_example}")

padded_example_sentences: [2439    4  311 2628 2746 1070  327  743  732   10 1124   19    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [None]:
# Predict sentiment on the example sentences with model 1 (Dense) and model 2 (LSTM)
predictions = model.predict(padded_example_sentences)
predictions_lstm = model2.predict(padded_example_sentences)

# Print the results and count the correct predictions of each model
correct_predictions = 0
correct_predictions_lstm = 0

for i, sentence in enumerate(example_sentences):
    # Each prediction row holds one sigmoid score; 0.5 and above counts as positive.
    # Indexing the scalar avoids relying on the truth value of a one-element array.
    score = predictions[i][0]
    score_lstm = predictions_lstm[i][0]
    predicted_label = 1 if score >= 0.5 else 0
    predicted_label_lstm = 1 if score_lstm >= 0.5 else 0
    actual_label = labels[i]
    if predicted_label == actual_label:
        correct_predictions += 1
    if predicted_label_lstm == actual_label:
        correct_predictions_lstm += 1
    print(f"Sentence: {sentence}")
    print(f"Actual Sentiment: {'Positive' if actual_label == 1 else 'Negative'}")
    print(f"Predicted Sentiment Dense: {'Positive' if predicted_label == 1 else 'Negative'} (Score: {score})")
    print(f"Predicted Sentiment LSTM: {'Positive' if predicted_label_lstm == 1 else 'Negative'} (Score: {score_lstm})")
    print()

total_sentences = len(example_sentences)

# Label each summary line with its model; previously both lines printed the
# same "Total Accuracy" text and could not be told apart.
accuracy = correct_predictions / total_sentences
print(f"Total Accuracy Dense: {accuracy:.2f}")

accuracy_lstm = correct_predictions_lstm / total_sentences
print(f"Total Accuracy LSTM: {accuracy_lstm:.2f}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
Sentence: The movie was an absolute masterpiece with breathtaking visuals and a storyline that kept me utterly engaged from start to finish.
Actual Sentiment: Positive
Predicted Sentiment Dense: Positive (Score: 0.8538950681686401)
Predicted Sentiment LSTM: Negative (Score: 0.292969673871994)

Sentence: I was disappointed by this film; the script felt weak and the characters were underdeveloped, leading to a lackluster viewing experience.
Actual Sentiment: Negative
Predicted Sentiment Dense: Negative (Score: 0.2867172062397003)
Predicted Sentiment LSTM: Negative (Score: 0.2929668128490448)

Sentence: An emotional rollercoaster that masterfully captures the essence of human relationships. Highly recommended for a heartwarming experience.
Actual Sentiment: Positive
Predicted Sentiment Dense: Positive (Score: 0.9942673444747925)
Predicted Sentim