In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel
import os
import tensorflow as tf

In [2]:
# Load the cleaned movie dataset.
# On Google Colab the CSV is read from a mounted Google Drive folder; anywhere
# else it is read from the local "datasets/" directory.
try:
    import google.colab
    from google.colab import drive
except ImportError:
    # google.colab is not importable, so we are not running on Colab
    IN_COLAB = False
else:
    IN_COLAB = True

if not IN_COLAB:
    data_dir = "datasets/"
else:
    # mount our google drive
    drive.mount('/content/drive', force_remount=True)
    data_dir = "/content/drive/MyDrive/WM 23-24/DATA 340 NLP/NLP Final Project/datasets/"

data = pd.read_csv(f"{data_dir}clean_movies.csv")

Mounted at /content/drive


### Convert movie descriptions and genres to BERT embeddings

---

In [4]:
# Preview the first five rows (head() returns five rows by default)
data.head()

Unnamed: 0,id,title,genres,original_language,overview,popularity,release_date,budget,revenue,vote_average,vote_count
0,615656,Meg 2: The Trench,Action-Science Fiction-Horror,en,An exploratory dive into the deepest depths of...,8763.998,2023-08-02,129000000.0,352056482.0,7.079,1365.0
1,758323,The Pope's Exorcist,Horror-Mystery-Thriller,en,Father Gabriele Amorth Chief Exorcist of the V...,5953.227,2023-04-05,18000000.0,65675816.0,7.433,545.0
2,667538,Transformers: Rise of the Beasts,Action-Adventure-Science Fiction,en,When a new threat capable of destroying the en...,5409.104,2023-06-06,200000000.0,407045464.0,7.34,1007.0
3,693134,Dune: Part Two,Science Fiction-Adventure,en,Follow the mythic journey of Paul Atreides as ...,4742.163,2024-02-27,190000000.0,683813734.0,8.3,2770.0
4,640146,Ant-Man and the Wasp: Quantumania,Action-Adventure-Science Fiction,en,Super-Hero partners Scott Lang and Hope van Dy...,4425.387,2023-02-15,200000000.0,475766228.0,6.507,2811.0


In [83]:
# Load the pretrained BERT tokenizer and TensorFlow model from the same checkpoint
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [84]:
# Select the movie descriptions (the 'overview' column) as a pandas Series
movie_descs = data.loc[:, 'overview']

In [106]:
# Define a function to encode movie descriptions
def BERTencode(text, max_length=20):
  """Tokenize `text` with the notebook-level BERT `tokenizer`.

  The tokenizer is called with padding='max_length' and truncation=True, so
  every sequence is padded or cut to `max_length` tokens, and the result is
  requested as TensorFlow tensors (return_tensors='tf').

  Args:
    text: Input forwarded unchanged to the tokenizer. This notebook passes a
      list of description strings.
    max_length: Token length each sequence is padded/truncated to. Defaults
      to 20, the value that was previously hard-coded, so existing calls
      behave exactly as before; pass a larger value to keep more of long
      descriptions.

  Returns:
    Whatever the tokenizer returns for these arguments. The batching cell
    reads 'input_ids', 'attention_mask' and 'token_type_ids' from it.
  """
  encoded_input = tokenizer(text,
                            max_length=max_length,
                            padding='max_length',
                            truncation=True,
                            return_tensors='tf')
  return encoded_input

In [107]:
# Tokenize all movie descriptions in one call
overview_texts = movie_descs.tolist()
encoded_movie_desc_inputs = BERTencode(overview_texts)

In [108]:
# Create a TensorFlow Dataset from the encoded movie descriptions.
# from_tensor_slices slices its input along the first dimension, so each
# dataset element carries the encoded fields for a single description; the
# batching cell further down reads 'input_ids', 'attention_mask' and
# 'token_type_ids' from every batch of this dataset.
tf_encoded_descs = tf.data.Dataset.from_tensor_slices(encoded_movie_desc_inputs)

In [95]:
# fake = [
#     "A thrilling adventure in space.",
#     "A romantic comedy set in Paris.",
#     "A gritty crime drama in New York City.",
#         "A thrilling adventure in space.",
#     "A romantic comedy set in Paris.",
#     "A gritty crime drama in New York City.",
#         "A thrilling adventure in space.",
#     "A romantic comedy set in Paris.",
#     "A gritty crime drama in New York City."
# ]

# fake_encodings = BERTencode(fake)
# tf_fake_encodings = tf.data.Dataset.from_tensor_slices(fake_encodings)

In [118]:
# Use batch processing to get BERT embeddings for the descriptions.
# The encoded descriptions are fed through the model BATCH_SIZE at a time and
# the pooled output of every batch is collected, then stacked into one tensor.
BATCH_SIZE = 100
batched_dataset = tf_encoded_descs.batch(BATCH_SIZE)

# Total number of batches, i.e. ceil(len(movie_descs) / BATCH_SIZE) in integer
# arithmetic. The earlier `round(n / BATCH_SIZE) + 1` could report one batch
# too many (e.g. whenever n is an exact multiple of BATCH_SIZE).
num_batches = (len(movie_descs) + BATCH_SIZE - 1) // BATCH_SIZE

# Initialize a list to collect embeddings
embeddings_ls = []
# Process the batches with the BERT model
for i, batch in enumerate(batched_dataset):
    # Get input_ids, attention_mask, and token_type_ids from the batch
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    token_type_ids = batch['token_type_ids']

    # Call the BERT model with the inputs
    outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

    # Keep the pooled output (outputs.last_hidden_state is the per-token
    # alternative). Converting to NumPy right away means only plain arrays
    # are accumulated across iterations.
    po_embeddings = outputs.pooler_output
    embeddings_ls.append(po_embeddings.numpy())
    print(f"Batch {i+1}/{num_batches}..")

# Stack the per-batch arrays along the batch axis into a single tensor
all_embeddings = tf.concat(embeddings_ls, axis=0)


Batch 1/172..
Batch 2/172..
Batch 3/172..
Batch 4/172..
Batch 5/172..
Batch 6/172..
Batch 7/172..
Batch 8/172..
Batch 9/172..
Batch 10/172..
Batch 11/172..
Batch 12/172..
Batch 13/172..
Batch 14/172..
Batch 15/172..
Batch 16/172..
Batch 17/172..
Batch 18/172..
Batch 19/172..
Batch 20/172..
Batch 21/172..
Batch 22/172..
Batch 23/172..
Batch 24/172..
Batch 25/172..
Batch 26/172..
Batch 27/172..
Batch 28/172..
Batch 29/172..
Batch 30/172..
Batch 31/172..
Batch 32/172..
Batch 33/172..
Batch 34/172..
Batch 35/172..
Batch 36/172..
Batch 37/172..
Batch 38/172..
Batch 39/172..
Batch 40/172..
Batch 41/172..
Batch 42/172..
Batch 43/172..
Batch 44/172..
Batch 45/172..
Batch 46/172..
Batch 47/172..
Batch 48/172..
Batch 49/172..
Batch 50/172..
Batch 51/172..
Batch 52/172..
Batch 53/172..
Batch 54/172..
Batch 55/172..
Batch 56/172..
Batch 57/172..
Batch 58/172..
Batch 59/172..
Batch 60/172..
Batch 61/172..
Batch 62/172..
Batch 63/172..
Batch 64/172..
Batch 65/172..
Batch 66/172..
Batch 67/172..
Batc

In [1]:
# Sanity-check the shape of the stacked description embeddings.
# NOTE(review): this cell relies on `all_embeddings` created by the batch
# embedding cell; the recorded NameError indicates it was executed in a kernel
# session where that cell had not been run yet. Run the cells in order.
# data['desc_embeddings'] =
all_embeddings.numpy().shape

NameError: name 'all_embeddings' is not defined

In [11]:
# BERTencode(list(data['overview'])[:5]

In [75]:
# movie_descs = [
#     "A thrilling adventure in space.",
#     "A romantic comedy set in Paris.",
#     "A gritty crime drama in New York City.",
#         "A thrilling adventure in space.",
#     "A romantic comedy set in Paris.",
#     "A gritty crime drama in New York City.",
#         "A thrilling adventure in space.",
#     "A romantic comedy set in Paris.",
#     "A gritty crime drama in New York City."
# ]

In [None]:
# # Process data in subsets
# SUBSET_SIZE = 5
# BATCH_SIZE = 3
# num_batches = len(movie_descs) // SUBSET_SIZE + 1
# for i in range(num_batches):
#     start_idx = i * SUBSET_SIZE
#     end_idx = min((i + 1) * SUBSET_SIZE, len(movie_descs))
#     batch = movie_descs[start_idx:end_idx]
#     print(batch)
#     # Convert batch to a TensorFlow Dataset
#     tf_movie_descs = tf.data.Dataset.from_tensor_slices(batch)
#     # Batch and process the subset of dataset
#     tf_movie_descs = tf_movie_descs.batch(BATCH_SIZE)
#     # print(tf_movie_descs)
#     for i in tf_movie_descs.take(1):
#       print(i)
#     # tf_movie_descs = tf_movie_descs.map(encode_batch)

In [50]:
# # Tokenize and input movie descriptions into BERT model
# encoded_inputs = [BERTencode(description) for description in movie_descs]
# # output = model(encoded_input)

In [51]:
# # Concatanate encoded inputs
# # Initialize final encoded inputs dataset
# for i, encoded_input in enumerate(encoded_inputs):
#   if i == 0:
#     f_encoded_inputs = encoded_input['input_ids']
#   else:
#     f_encoded_inputs = tf.concat([f_encoded_inputs, encoded_input['input_ids']], axis=0)

In [52]:
# tf_encoded_inputs = tf.data.Dataset.from_tensor_slices(f_encoded_inputs)

In [54]:
# encoded_input_batches = tf_encoded_inputs.batch(12)

In [None]:
# # Process the batches with the BERT model
# for batch in batched_dataset:
#     # Get input_ids, attention_mask, and token_type_ids from the batch
#     input_ids = batch['input_ids']
#     attention_mask = batch['attention_mask']
#     token_type_ids = batch['token_type_ids']

#     # Call the BERT model with the inputs
#     outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

#     # Process the outputs as needed
#     # For example, you can extract the hidden states or pooled output
#     hidden_states = outputs.last_hidden_state
#     pooled_output = outputs.pooler_output

#     # Do further processing with the outputs
#     print(hidden_states.shape)
#     print(pooled_output.shape)
# output = model(f_encoded_inputs)