# CSC 792 Assignment 7 - Movie Rating Regression

## Mangesh Sakordekar (7440013)

### Imports

In [1]:
import os, pathlib, shutil, random
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
import matplotlib.pyplot as plt
import random

#### Directory structure

In [None]:
# One-time setup, deliberately disabled by wrapping it in a string literal.
# Run the enclosed code only if the "val" directory does not exist yet: for
# each of the "neg" and "pos" categories it shuffles the training file names
# with a fixed seed (1337) and moves the last 20% of them from aclImdb/train
# to aclImdb/val.
# Note: os.makedirs is called without exist_ok=True, so the code raises if the
# target directories already exist (i.e. it is not safe to run twice).
'''
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
for category in ("neg", "pos"):
    os.makedirs(val_dir / category)
    files = os.listdir(train_dir / category)
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    val_files = files[-num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname,
                    val_dir / category / fname)
'''

### Reading the Data

In [2]:
data_dir = 'aclImdb'

def load_data(directory):
    """Read the review texts and their ratings from one dataset split.

    ``directory`` must contain ``pos`` and ``neg`` sub-directories. Every
    ``.txt`` file inside them is read as one review; other files are
    ignored. The rating is parsed from the file name: the text after the
    last ``_`` and before the first following ``.`` (``123_7.txt`` -> 7).

    Files are visited in sorted name order ('pos' first, then 'neg').
    ``os.listdir`` alone returns entries in arbitrary order, which would
    make the sample order differ between runs and machines.

    Args:
        directory: Path of the split directory (e.g. ``aclImdb/train``).

    Returns:
        A ``(reviews, ratings)`` tuple of parallel lists: review texts
        (``str``) and their integer ratings.

    Raises:
        FileNotFoundError: If ``pos`` or ``neg`` is missing.
        ValueError: If a ``.txt`` file name does not carry an integer rating.
    """
    reviews = []
    ratings = []
    for category in ['pos', 'neg']:
        path = os.path.join(directory, category)
        for filename in sorted(os.listdir(path)):
            if not filename.endswith('.txt'):
                continue
            # Extract the rating from the filename
            rating = int(filename.split('_')[-1].split('.')[0])
            with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
                reviews.append(file.read())
            ratings.append(rating)
    return reviews, ratings

# Locations of the three dataset splits under data_dir.
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')
val_dir = os.path.join(data_dir, 'val')

# Load the train, test and validation data.
# The validation split is read from val_dir; reading it from test_dir would
# make every validation metric a measurement on the test set.
train_reviews, train_ratings = load_data(train_dir)
test_reviews, test_ratings = load_data(test_dir)
val_reviews, val_ratings = load_data(val_dir)


### Vectorizing the Data

In [3]:
# Build a batched dataset of (text, label) pairs from the training directory,
# then drop the labels so that only the review text remains.
batch_size = 32
train_ds = keras.utils.text_dataset_from_directory(
    directory="aclImdb/train",
    batch_size=batch_size,
)
text_only_train_ds = train_ds.map(lambda text, _label: text)

Found 20000 files belonging to 2 classes.


In [4]:
# Every review becomes a sequence of exactly max_length integer token ids,
# drawn from a vocabulary capped at max_tokens entries.
max_length = 600
max_tokens = 20000
text_vectorization = layers.TextVectorization(
    output_mode="int",
    output_sequence_length=max_length,
    max_tokens=max_tokens,
)
# Build the vocabulary from the label-free training text.
text_vectorization.adapt(text_only_train_ds)


In [5]:
# Run the adapted vectorizer over each split, in train / test / val order.
(train_reviews_vectorized,
 test_reviews_vectorized,
 val_reviews_vectorized) = (
    text_vectorization(split_reviews)
    for split_reviews in (train_reviews, test_reviews, val_reviews)
)

### Data Preprocessing

In [6]:
#Convert the data to numpy arrays
train_data = np.array(train_reviews_vectorized)
train_labels = np.array(train_ratings).astype(float)
test_data = np.array(test_reviews_vectorized)
test_labels = np.array(test_ratings).astype(float)
val_data = np.array(val_reviews_vectorized)
val_labels = np.array(val_ratings).astype(float)

### Training the Model

In [119]:
# Architecture: int64 token ids -> 64-dimensional embeddings -> bidirectional
# LSTM (32 units per direction) -> dropout -> one output unit for the rating.
inputs = keras.Input(dtype="int64", shape=(None,))
embedding_layer = layers.Embedding(
    mask_zero=True, output_dim=64, input_dim=max_tokens)
embedded = embedding_layer(inputs)
recurrent_layer = layers.Bidirectional(layers.LSTM(32))
x = recurrent_layer(embedded)
x = layers.Dropout(0.4)(x)
# NOTE(review): "selu" on a regression output is unusual (a linear output is
# the common choice) -- confirm this is intentional.
outputs = layers.Dense(1, activation="selu")(x)
model = keras.Model(inputs, outputs)

# Train on mean squared error and report mean absolute error alongside it.
model.compile(
    loss="mse",
    metrics=["mae"],
    optimizer="rmsprop",
)
model.summary()

# save_best_only=True: the checkpoint file is only overwritten when the
# monitored quantity improves, so it ends up holding the best model seen.
checkpoint = keras.callbacks.ModelCheckpoint(
    "embeddings.keras",
    save_best_only=True,
)
callbacks = [checkpoint]
history = model.fit(
    train_data,
    train_labels,
    epochs=15,
    validation_data=(val_data, val_labels),
    callbacks=callbacks,
)

# Evaluate the checkpointed model rather than the final-epoch weights.
model = keras.models.load_model("embeddings.keras")
test_metrics = model.evaluate(test_data, test_labels)
# Index 1 is the "mae" metric; index 0 is the loss.
test_mae = test_metrics[1]
print(f"Test MAE: {test_mae:.2f}")

Model: "model_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_15 (InputLayer)       [(None, None)]            0         
                                                                 
 embedding_14 (Embedding)    (None, None, 64)          1280000   
                                                                 
 bidirectional_14 (Bidirecti  (None, 64)               24832     
 onal)                                                           
                                                                 
 dropout_39 (Dropout)        (None, 64)                0         
                                                                 
 dense_59 (Dense)            (None, 1)                 65        
                                                                 
Total params: 1,304,897
Trainable params: 1,304,897
Non-trainable params: 0
________________________________________________

### Testing the Model

In [134]:
# Spot-check the model on a handful of randomly chosen test reviews.
num_samples = 10
# randrange(len(test_data)) can never produce an out-of-range index, whereas
# randint's upper bound is inclusive and would allow one index past the end.
random_ind = [random.randrange(len(test_data)) for _ in range(num_samples)]

# Predict all sampled rows in a single batch. Indexing with the list keeps the
# (num_samples, sequence_length) layout, so no hard-coded reshape is needed.
predictions = model.predict(test_data[random_ind], verbose=0)
for i, prediction in zip(random_ind, predictions):
    print("predicted rating: " + str(prediction[0])
          + "\tactual rating: " + str(test_labels[i]))

predicted rating: 5.858356	actual rating: 9.0
predicted rating: 5.400948	actual rating: 7.0
predicted rating: 8.505113	actual rating: 7.0
predicted rating: 3.4114602	actual rating: 4.0
predicted rating: 8.419339	actual rating: 8.0
predicted rating: 8.477797	actual rating: 10.0
predicted rating: 3.3568032	actual rating: 3.0
predicted rating: 8.123314	actual rating: 10.0
predicted rating: 9.6205435	actual rating: 9.0
predicted rating: 1.3601909	actual rating: 1.0
