In [29]:
# Predicting Movie Success Metrics Using a Transformer Model

# Import necessary libraries
import pandas as pd
import numpy as np
import ast
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Import transformer components
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AutoTokenizer, AutoModel

# For data preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Set device
# Every model and batch tensor below is moved to this device.
# NOTE(review): pinned to the CPU, presumably because the recorded CUDA run of
# the training loop ended in a "launch timed out" error -- confirm before
# switching back to a GPU.
device = torch.device('cpu')

In [30]:
# Read the three raw CSV exports stored under ./raw-data.
credits, keywords = (
    pd.read_csv(csv_path)
    for csv_path in ('./raw-data/credits.csv', './raw-data/keywords.csv')
)
# low_memory=False: let pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
movies_metadata = pd.read_csv(
    './raw-data/movies_metadata.csv',
    low_memory=False,
)

In [31]:
def parse_json_column(df, column_name):
    """Parse the stringified Python literals stored in one DataFrame column.

    Each string cell is evaluated with ``ast.literal_eval`` (so only literals
    are accepted, never arbitrary code).

    Args:
        df: DataFrame to update; ``df[column_name]`` is overwritten in place.
        column_name: Name of the column to parse.

    Returns:
        The same ``df`` object. After the call each cell of the column is:
        * unchanged if it already was a list or dict, so re-running the
          parsing cell does not wipe data that was parsed earlier;
        * ``[]`` if it was missing, not a string, or not a valid literal;
        * otherwise whatever literal the string contained (e.g. ``"6.0"``
          becomes ``6.0``), so callers can still detect non-list rows.
    """
    def parse_json(x):
        # Already-parsed containers pass through untouched (idempotence).
        if isinstance(x, (list, dict)):
            return x
        # NaN / None / numbers: nothing to parse.
        if not isinstance(x, str):
            return []
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # literal_eval can raise any of these on malformed input, e.g.
            # TypeError for an unhashable dict key.
            return []
    df[column_name] = df[column_name].apply(parse_json)
    return df

# Decode the stringified list/dict columns of all three frames.
for column in ('cast', 'crew'):
    credits = parse_json_column(credits, column)
keywords = parse_json_column(keywords, 'keywords')

json_columns = [
    'genres',
    'production_companies',
    'production_countries',
    'spoken_languages',
    'belongs_to_collection',
]
for column in json_columns:
    movies_metadata = parse_json_column(movies_metadata, column)

In [32]:
# Extract main actors (top 3 billed actors)
def get_top_actors(cast_list, n=3):
    """Return the names of the top-billed cast members.

    Args:
        cast_list: List of cast dicts; their ``'order'`` and ``'name'`` keys
            are read. Any value that is not a list yields ``[]``.
        n: How many of the lowest-``order`` entries to look at.

    Returns:
        Names of the first ``n`` entries after sorting by ascending
        ``'order'``. Entries without a name are skipped, so the result may
        hold fewer than ``n`` names. Entries that are not dicts are ignored.
    """
    if not isinstance(cast_list, list):
        return []

    def billing_order(member):
        order = member.get('order', 999)
        # An explicit None cannot be compared with ints while sorting;
        # treat it like a missing order (i.e. billed last).
        return 999 if order is None else order

    members = [member for member in cast_list if isinstance(member, dict)]
    ranked = sorted(members, key=billing_order)
    return [member.get('name') for member in ranked[:n] if member.get('name')]

# New 'main_actors' column: per movie, the names returned by get_top_actors
# (called with its default n=3) for the parsed 'cast' list.
credits['main_actors'] = credits['cast'].apply(get_top_actors)

# Extract director
def get_director(crew_list):
    """Return the name of the first crew entry whose job is 'Director'.

    ``np.nan`` is returned when ``crew_list`` is not a list or holds no such
    entry.
    """
    if not isinstance(crew_list, list):
        return np.nan
    director_names = (
        person.get('name')
        for person in crew_list
        if person.get('job') == 'Director'
    )
    # The generator is lazy, so scanning stops at the first match.
    return next(director_names, np.nan)

# New 'director' column: the director's name per movie, or NaN when
# get_director finds none in the parsed 'crew' list.
credits['director'] = credits['crew'].apply(get_director)

In [33]:
def get_keywords(keywords_list):
    """Return the non-empty ``'name'`` values of a list of keyword dicts.

    Args:
        keywords_list: Parsed keyword entries for one movie.

    Returns:
        List of keyword names in their original order. A value that is not a
        list yields ``[]`` and entries that are not dicts are skipped, the
        same defensive behaviour ``get_top_actors`` applies to cast lists.
    """
    if not isinstance(keywords_list, list):
        return []
    return [
        kw.get('name')
        for kw in keywords_list
        if isinstance(kw, dict) and kw.get('name')
    ]

# New 'keyword_list' column: plain keyword names extracted from the parsed
# 'keywords' dicts of each movie.
keywords['keyword_list'] = keywords['keywords'].apply(get_keywords)

In [34]:
# Drop weird values in movies_metadata['production_countries']:
# keep only the rows whose parsed value is a list.
# .copy() makes the filtered result an independent frame, so the column
# assignments in the following cells write to it directly instead of to a
# slice of the original (which triggers pandas' SettingWithCopyWarning).

movies_metadata = movies_metadata[
    movies_metadata['production_countries'].apply(lambda x: isinstance(x, list))
].copy()

In [35]:
# Extract production countries
def get_production_countries(production_countries_list):
    """Collect the non-empty 'iso_3166_1' values of the given country dicts, in order."""
    codes = []
    for entry in production_countries_list:
        code = entry.get('iso_3166_1')
        if code:
            codes.append(code)
    return codes

# New 'production_countries_list' column: per movie, the 'iso_3166_1' values
# of its parsed production-country dicts.
movies_metadata['production_countries_list'] = movies_metadata['production_countries'].apply(get_production_countries)

# Extract original language
# (a missing language is replaced by the placeholder 'Unknown')
movies_metadata['original_language'] = movies_metadata['original_language'].fillna('Unknown')

# Determine if movie is part of a collection
def has_collection(collection):
    """Return 1 when ``collection`` is a dict, otherwise 0."""
    return int(isinstance(collection, dict))

# 1/0 flag: is the parsed 'belongs_to_collection' value a dict?
movies_metadata['has_collection'] = movies_metadata['belongs_to_collection'].apply(has_collection)

# Convert release_date to datetime
# (errors='coerce' turns unparseable dates into NaT instead of raising)
movies_metadata['release_date'] = pd.to_datetime(movies_metadata['release_date'], errors='coerce')

In [36]:
# Ensure 'id' columns are numeric (ids that cannot be parsed become NaN)
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce')
credits['id'] = pd.to_numeric(credits['id'], errors='coerce')
keywords['id'] = pd.to_numeric(keywords['id'], errors='coerce')

# Build one-row-per-id lookup tables for the right-hand side of each merge.
# Without this, a repeated id in credits/keywords duplicates the matching
# movie rows, and NaN ids would be matched against each other by merge().
credits_features = (
    credits[['id', 'main_actors', 'director']]
    .dropna(subset=['id'])
    .drop_duplicates(subset='id')
)
keyword_features = (
    keywords[['id', 'keyword_list']]
    .dropna(subset=['id'])
    .drop_duplicates(subset='id')
)

# Merge datasets
# how='left' keeps every movie; validate raises if a right-hand id repeats.
df = movies_metadata.merge(credits_features, on='id', how='left', validate='many_to_one')
df = df.merge(keyword_features, on='id', how='left', validate='many_to_one')

In [37]:
# Movies without a usable release date are discarded.
df = df.dropna(subset=['release_date'])

# Any cell of a list-valued column that is not a list (e.g. NaN for movies
# without a match in the left merges) becomes an empty list.
list_columns = ['main_actors', 'keyword_list', 'production_countries_list']
for col in list_columns:
    df[col] = df[col].apply(lambda cell: cell if isinstance(cell, list) else [])

# Placeholders for the remaining missing values.
df['director'] = df['director'].fillna('Unknown')
df['has_collection'] = df['has_collection'].fillna(0)

# budget, revenue and popularity: coerce to numbers; unparseable or missing -> 0.
for numeric_col in ('budget', 'revenue', 'popularity'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce').fillna(0)

In [38]:
# Revenue-to-budget ratio; rows with a zero budget get a ratio of 0.
# Zero budgets are swapped for NaN first so the division never hits x / 0.
nonzero_budget = df['budget'].replace(0, np.nan)
df['rev_budget_ratio'] = (
    df['revenue']
    .div(nonzero_budget)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
# Store the budget back with its zeros restored.
df['budget'] = nonzero_budget.fillna(0)

In [39]:
# Target columns: 'rating' is the numeric form of 'vote_average';
# missing ratings and popularity values count as 0.
df = df.assign(
    rating=pd.to_numeric(df['vote_average'], errors='coerce').fillna(0),
    popularity=df['popularity'].fillna(0),
)

# The four quantities the model is trained to predict.
targets = ['revenue', 'rev_budget_ratio', 'rating', 'popularity']

In [40]:
def combine_text_features(row):
    """Build the single text string the tokenizer receives for one movie.

    The pieces are joined with spaces in this order: main actors, director,
    keywords, production countries, original language, and finally the
    literal 'Collection' or 'No Collection'.

    Args:
        row: Mapping-like row providing 'main_actors', 'director',
            'keyword_list', 'production_countries_list',
            'original_language' and 'has_collection'.

    Returns:
        The space-joined feature string.
    """
    def multi_valued(value):
        # A list contributes each truthy item; any other non-null value
        # contributes itself; a null value contributes nothing.
        if isinstance(value, list):
            return [str(item) for item in value if item]
        return [str(value)] if pd.notnull(value) else []

    def single_valued(value):
        # A null scalar is represented by the placeholder 'Unknown'.
        return str(value) if pd.notnull(value) else 'Unknown'

    parts = multi_valued(row['main_actors'])
    parts.append(single_valued(row['director']))
    parts += multi_valued(row['keyword_list'])
    parts += multi_valued(row['production_countries_list'])
    parts.append(single_valued(row['original_language']))
    parts.append('Collection' if row['has_collection'] == 1 else 'No Collection')

    return ' '.join(parts)

# axis=1 calls combine_text_features once per row; the resulting string is
# what gets tokenized further down.
df['text_input'] = df.apply(combine_text_features, axis=1)

In [41]:
# Using BERT tokenizer
# Loaded from the 'bert-base-uncased' checkpoint, the same checkpoint name the
# model class below passes to BertModel.from_pretrained.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [42]:
# Tokenize the text inputs
def tokenize_text(text):
    """Encode one feature string with the module-level BERT ``tokenizer``.

    Args:
        text: The string to encode.

    Returns:
        The tokenizer's encoding with PyTorch tensors ('input_ids',
        'attention_mask', ...). Every sequence is padded or truncated to
        exactly 128 tokens, special tokens included.
    """
    # Calling the tokenizer object is the supported entry point;
    # encode_plus is deprecated in favour of it and takes the same arguments.
    return tokenizer(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

# Encode every row's text up front; each cell of 'tokenized' holds the
# encoding returned by tokenize_text for that row.
df['tokenized'] = df['text_input'].apply(tokenize_text)

In [43]:
class MovieDataset(Dataset):
    """Map-style dataset pairing pre-tokenized movie text with its targets.

    Args:
        df: DataFrame with a 'tokenized' column (one encoding per row, each
            providing 'input_ids' and 'attention_mask' tensors) and the
            target columns.
        targets: List of column names used as regression targets.
    """

    def __init__(self, df, targets):
        self.inputs = df['tokenized'].tolist()
        # One float32 row per sample, one column per target.
        target_values = df[targets].values.astype(float)
        self.targets = torch.tensor(target_values, dtype=torch.float)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, targets)`` for sample ``idx``."""
        encoding = self.inputs[idx]
        # Drop only the leading batch dimension added by return_tensors='pt'.
        # A bare squeeze() would also collapse a length-1 sequence to a 0-d
        # tensor.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        return input_ids, attention_mask, self.targets[idx]

In [44]:
# Positional 80/20 split: with shuffle=False the first 80% of the rows form
# the training set and the remainder the test set (random_state has no effect
# when shuffling is disabled).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=False)

# Reset indices
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Standardise the targets. They are trained jointly under one MSE loss, so
# without a common scale the largest-valued target dominates the loss.
# The scaler is fitted on the training rows only to avoid leaking test
# statistics; use target_scaler.inverse_transform to map predictions back to
# the original units.
target_scaler = StandardScaler()
train_df[targets] = target_scaler.fit_transform(train_df[targets])
test_df[targets] = target_scaler.transform(test_df[targets])

In [45]:
batch_size = 8  # kept small to limit memory use; lower it further if memory runs out

train_dataset = MovieDataset(train_df, targets)
test_dataset = MovieDataset(test_df, targets)

# Training batches are reshuffled every epoch so the optimiser does not see
# the samples in the same file order each time; the seeded generator keeps
# that order reproducible across runs.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)
# Evaluation order does not affect the averaged loss, so it stays sequential.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [46]:
class MovieTransformerModel(nn.Module):
    """BERT encoder followed by a linear multi-target regression head.

    Args:
        num_targets: Number of values predicted per input.
        model_name: Checkpoint passed to ``BertModel.from_pretrained``.
        dropout: Dropout probability applied before the linear head.
    """

    def __init__(self, num_targets, model_name='bert-base-uncased', dropout=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_targets)

    def forward(self, input_ids, attention_mask):
        """Return one row of ``num_targets`` predictions per input sequence."""
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Index 1 of BertModel's output is the pooled output: the [CLS]
        # hidden state after BERT's own dense + tanh pooler layer, not the
        # raw [CLS] hidden state.
        pooled_output = self.dropout(outputs[1])
        return self.fc(pooled_output)

In [47]:
# One output unit per entry in `targets`.
num_targets = len(targets)

model = MovieTransformerModel(num_targets)
# Move the weights to the selected device. As the cell's last expression this
# call also displays the module structure shown in the output below.
model.to(device)

MovieTransformerModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [48]:
# Mean squared error between the predictions and the target columns.
criterion = nn.MSELoss()
# model.parameters() includes the BERT encoder, so the whole network is
# fine-tuned, not just the linear head.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [49]:
# CUDA diagnostics (torch is already imported in the first cell).
print("Is CUDA available:", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())
# current_device() / get_device_name() raise on machines without CUDA, which
# would stop a Run All; only query them when a GPU is actually present.
if torch.cuda.is_available():
    print("Current device:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

Is CUDA available: True
Device count: 1
Current device: 0
Device name: NVIDIA GeForce 940MX


In [50]:
# Reports PyTorch's torch.version.cuda value.
print("PyTorch CUDA version:", torch.version.cuda)

PyTorch CUDA version: 12.4


In [51]:
epochs = 20

for epoch in range(epochs):
    model.train()  # training mode: dropout is active
    batch_losses = []
    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, target); move all three
        # to the compute device.
        input_ids, attention_mask, target = (tensor.to(device) for tensor in batch)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        loss = criterion(model(input_ids, attention_mask), target)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Mean of the per-batch losses for this epoch.
    total_loss = sum(batch_losses)
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss:.4f}')

RuntimeError: CUDA error: the launch timed out and was terminated
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
model.eval()  # inference mode: dropout is disabled
total_loss = 0
with torch.no_grad():  # no gradients are needed for evaluation
    for batch in test_loader:
        # Each batch is (input_ids, attention_mask, target).
        input_ids, attention_mask, target = (tensor.to(device) for tensor in batch)
        total_loss += criterion(model(input_ids, attention_mask), target).item()

    # Mean of the per-batch losses over the test set.
    avg_loss = total_loss / len(test_loader)
    print(f'Test Loss: {avg_loss:.4f}')