In [190]:
# Predicting Movie Success Metrics Using a Transformer Model

# Import necessary libraries
import pandas as pd
import numpy as np
import ast
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Import transformer components
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AutoTokenizer, AutoModel

# For data preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import SubsetRandomSampler

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [191]:
credits = pd.read_csv('/content/raw-data/credits.csv', engine='python', on_bad_lines='skip')

In [192]:
keywords = pd.read_csv('/content/raw-data/keywords.csv')

In [193]:
# Load datasets from the 'raw-data' folder
movies_metadata = pd.read_csv('/content/raw-data/movies_metadata.csv', low_memory=False)

In [195]:
ratings = pd.read_csv('/content/ratings_average.csv')

In [196]:
def parse_json_column(df, column_name):
    """Safely parse JSON-like strings in a DataFrame column."""
    def parse_json(x):
        if pd.isna(x):
            return []
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError):
            return []
    df[column_name] = df[column_name].apply(parse_json)
    return df

# Parse columns
credits = parse_json_column(credits, 'cast')
credits = parse_json_column(credits, 'crew')
keywords = parse_json_column(keywords, 'keywords')

json_columns = ['genres', 'production_companies', 'production_countries', 'spoken_languages', 'belongs_to_collection']
for column in json_columns:
    movies_metadata = parse_json_column(movies_metadata, column)

In [197]:
# Extract main actors (top 3 billed actors)
def get_top_actors(cast_list, n=3):
    if isinstance(cast_list, list):
        cast_list = sorted(cast_list, key=lambda x: x.get('order', 999))
        actors = [member.get('name') for member in cast_list[:n] if member.get('name')]
        return actors
    return []

credits['main_actors'] = credits['cast'].apply(get_top_actors)

# Extract director
def get_director(crew_list):
    if isinstance(crew_list, list):
        for member in crew_list:
            if member.get('job') == 'Director':
                return member.get('name')
    return np.nan

credits['director'] = credits['crew'].apply(get_director)

In [198]:
def get_keywords(keywords_list):
    return [kw.get('name') for kw in keywords_list if kw.get('name')]

keywords['keyword_list'] = keywords['keywords'].apply(get_keywords)

In [199]:
# Drop weird values in movies_metadata['production_countries']

movies_metadata = movies_metadata[movies_metadata['production_countries'].apply(lambda x: isinstance(x, list))]

In [200]:
# Extract production countries
def get_production_countries(production_countries_list):
    return [country.get('iso_3166_1') for country in production_countries_list if country.get('iso_3166_1')]

movies_metadata['production_countries_list'] = movies_metadata['production_countries'].apply(get_production_countries)

# Extract original language
movies_metadata['original_language'] = movies_metadata['original_language'].fillna('Unknown')

# Determine if movie is part of a collection
def has_collection(collection):
    return 1 if isinstance(collection, dict) else 0

movies_metadata['has_collection'] = movies_metadata['belongs_to_collection'].apply(has_collection)

# Convert release_date to datetime
movies_metadata['release_date'] = pd.to_datetime(movies_metadata['release_date'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_metadata['production_countries_list'] = movies_metadata['production_countries'].apply(get_production_countries)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_metadata['original_language'] = movies_metadata['original_language'].fillna('Unknown')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a

In [201]:
# Ensure 'id' columns are numeric
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce')
credits['id'] = pd.to_numeric(credits['id'], errors='coerce')
keywords['id'] = pd.to_numeric(keywords['id'], errors='coerce')

# Merge datasets
df = movies_metadata.merge(credits[['id', 'main_actors', 'director']], on='id', how='left')
df = df.merge(keywords[['id', 'keyword_list']], on='id', how='left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce')


In [202]:
# Assuming you have already loaded your DataFrame 'df'

# Ensure 'release_date' is in datetime format
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Drop rows with missing 'release_date' if necessary
df = df.dropna(subset=['release_date'])

# Extract 'release_year', 'release_month', 'release_day'
df['release_year'] = df['release_date'].dt.year.astype(str)
df['release_month'] = df['release_date'].dt.month.astype(str)
df['release_day'] = df['release_date'].dt.day.astype(str)
# Fill missing lists with empty lists
list_columns = ['main_actors', 'keyword_list', 'production_countries_list']
for col in list_columns:
    df[col] = df[col].apply(lambda x: x if isinstance(x, list) else [])

# Fill missing director with 'Unknown'
df['director'] = df['director'].fillna('Unknown')

# Fill missing 'has_collection' with 0
df['has_collection'] = df['has_collection'].fillna(0)

# Convert budget and revenue to numeric
df['budget'] = pd.to_numeric(df['budget'], errors='coerce').fillna(0)
df['revenue'] = pd.to_numeric(df['revenue'], errors='coerce').fillna(0)
df['popularity'] = pd.to_numeric(df['popularity'], errors='coerce').fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['release_year'] = df['release_date'].dt.year.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['release_month'] = df['release_date'].dt.month.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['release_day'] = df['release_date'].dt.day.astype(str)
A value is trying to be s

In [203]:
# 2. Cleaning Budget and Revenue
def clean_numeric(value):
    try:
        val = int(value)
        if val == 0:
            return np.nan  # Treat 0 as missing value
        return val
    except (ValueError, TypeError):
        return np.nan

df['budget_cleaned'] = df['budget'].apply(clean_numeric)
df['revenue_cleaned'] = df['revenue'].apply(clean_numeric)

# Step 5: Target variable (revenue/budget ratio)
df['rev_budget_ratio'] = df['revenue_cleaned'] / df['budget_cleaned']

In [204]:
# Calculate the first quartile (Q1) and third quartile (Q3)
Q1 = df['rev_budget_ratio'].quantile(0.25)
Q3 = df['rev_budget_ratio'].quantile(0.75)

# Calculate the interquartile range (IQR)
IQR = Q3 - Q1

# Define the lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter the dataframe to remove outliers
df = df[(df['rev_budget_ratio'] >= lower_bound) &
                                    (df['rev_budget_ratio'] <= upper_bound)]

In [206]:
# Merge the two DataFrames on movieId and id
df['id'] = pd.to_numeric(df['id'], errors='coerce')  # Ensure 'id' is numeric to match 'movieId'
df = pd.merge(df, ratings, left_on='id', right_on='movieId', how='left')

# Function to calculate the weighted vote average
def update_vote_average(row):
    if pd.isna(row['vote_average']) or row['vote_average'] == 0:
        return row['average_rating']  # Replace with average_rating if vote_average is NaN or 0
    if pd.notna(row['average_rating']):  # If both are present, calculate weighted average
        vote_weight = row['vote_count'] if pd.notna(row['vote_count']) else 0
        rating_weight = row['rating_count']
        total_weight = vote_weight + rating_weight
        return ((row['vote_average'] * vote_weight) + (row['average_rating'] * 2 * rating_weight)) / total_weight
    return row['vote_average']  # If no update is needed, return the original vote_average

# Apply the update_vote_average function to each row
df['vote_average_updated'] = df.apply(update_vote_average, axis=1)
df['vote_count'] = df['vote_count'] + df['rating_count']

# Drop extra columns like movieId from the merged DataFrame
df.drop(columns=['movieId', 'average_rating', 'rating_count'], inplace=True)

df.dropna(subset = ["vote_average_updated", "vote_count", "rev_budget_ratio"], inplace=True)

In [207]:
# Ensure 'vote_average' and 'popularity' are numeric and handle missing values
df['rating'] = pd.to_numeric(df['vote_average'], errors='coerce').fillna(0)
df['popularity'] = pd.to_numeric(df['popularity'], errors='coerce').fillna(0)

# Define target variables
targets = ['rev_budget_ratio', 'rating', 'popularity']

# Ensure targets are float and handle infinities and NaNs
df[targets] = df[targets].astype(float)
df[targets] = np.nan_to_num(df[targets], nan=0.0, posinf=0.0, neginf=0.0)

In [208]:
def combine_text_features(row):
    features = []

    # Handle main_actors
    actors = row['main_actors']
    if isinstance(actors, list):
        features.extend([str(actor) for actor in actors if actor])
    elif pd.notnull(actors):
        features.append(str(actors))

    # Handle director
    director = row['director']
    if pd.notnull(director):
        features.append(str(director))
    else:
        features.append('Unknown')

    # Handle keyword_list
    keywords = row['keyword_list']
    if isinstance(keywords, list):
        features.extend([str(keyword) for keyword in keywords if keyword])
    elif pd.notnull(keywords):
        features.append(str(keywords))

    # Handle production_countries_list
    countries = row['production_countries_list']
    if isinstance(countries, list):
        features.extend([str(country) for country in countries if country])
    elif pd.notnull(countries):
        features.append(str(countries))

    # Handle original_language
    language = row['original_language']
    if pd.notnull(language):
        features.append(str(language))
    else:
        features.append('Unknown')

    # Handle has_collection
    has_collection = row['has_collection']
    if has_collection == 1:
        features.append('Collection')
    else:
        features.append('No Collection')

    # Include release date information
    release_year = row['release_year']
    release_month = row['release_month']
    release_day = row['release_day']
    features.append('ReleaseYear_' + release_year)
    features.append('ReleaseMonth_' + release_month)
    features.append('ReleaseDay_' + release_day)

    # Combine features into a single string
    return ' '.join(features)

df['text_input'] = df.apply(combine_text_features, axis=1)

In [209]:
# Using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [210]:
# Tokenize the text inputs
def tokenize_text(text):
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

df['tokenized'] = df['text_input'].apply(tokenize_text)

In [211]:
class MovieDataset(Dataset):
    def __init__(self, df, targets):
        self.texts = df['text_input'].tolist()
        self.targets = df[targets].values.astype(float)
        self.targets = torch.tensor(self.targets, dtype=torch.float)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_length = 128

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        # Tokenize on-the-fly
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze(0)  # Remove batch dimension
        attention_mask = encoding['attention_mask'].squeeze(0)
        targets = self.targets[idx]
        return input_ids, attention_mask, targets

In [212]:
# Split Data
# First, split the data into training+validation and test sets
train_full_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=False)

# Then, split the training data into training and validation sets
train_df, val_df = train_test_split(train_full_df, test_size=0.1, random_state=42, shuffle=False)

# # Truncate train_df to 500 rows for brevity
# train_df = train_df.head(500)

# Reset indices
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [213]:
# Normalize targets with MinMaxScaler

scaler = MinMaxScaler()

# Fit the scaler on the training targets
train_df[targets] = scaler.fit_transform(train_df[targets])

# Transform validation and test targets
val_df[targets] = scaler.transform(val_df[targets])
test_df[targets] = scaler.transform(test_df[targets])

# scaler2 = StandardScaler()
# train_df[targets] = scaler2.fit_transform(train_df[targets])

# scaler3 = MinMaxScaler()
#

In [214]:
# Create datasets
train_dataset = MovieDataset(train_df, targets)
val_dataset = MovieDataset(val_df, targets)
test_dataset = MovieDataset(test_df, targets)

# Define batch size
batch_size = 8

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



In [215]:
# Print train_loader, val_loader, test-loader sizes
print("Train Loader Size:", len(train_loader))
print("Validation Loader Size:", len(val_loader))
print("Test Loader Size:", len(test_loader))

Train Loader Size: 124
Validation Loader Size: 14
Test Loader Size: 35


In [216]:
# Model Definition
class MovieTransformerModel(nn.Module):
    def __init__(self, num_targets):
        super(MovieTransformerModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_targets)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Use [CLS] token representation
        cls_output = outputs[1]
        cls_output = self.dropout(cls_output)
        output = self.fc(cls_output)
        return output

In [217]:
# Initialize model
num_targets = len(targets)
model = MovieTransformerModel(num_targets)
model.to(device)

MovieTransformerModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [218]:
# Loss Function and Optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [219]:
# Early Stopping Implementation
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0.001):
        """
        Args:
            patience (int): How many epochs to wait after last time validation loss improved.
            min_delta (float): Minimum change in the monitored quantity to qualify as an improvement.
        """
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        self.best_state_dict = None

    def __call__(self, val_loss, model):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.best_state_dict = model.state_dict()
        elif val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.best_state_dict = model.state_dict()
            self.counter = 0  # Reset counter when improvement occurs
        else:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True

In [220]:
# Training Loop with Early Stopping
# Initialize early stopping
early_stopping = EarlyStopping(patience=5, min_delta=0.0001)

epochs = 1  # Increase the number of epochs to allow the model to see more data over time
batch_size = 4  # Adjust based on your computational resources

for epoch in range(epochs):
    # Sample 500 random indices from the training dataset
    num_samples = 100
    indices = np.random.choice(len(train_dataset), num_samples, replace=False)
    sampler = SubsetRandomSampler(indices)
    # Create a DataLoader with the sampler
    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)

    model.train()
    total_train_loss = 0
    iteration = 0
    for input_ids, attention_mask, target in train_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Iteration {iteration}, Loss: {loss.item()}")
        iteration += 1

    avg_train_loss = total_train_loss / len(train_loader)

    # Validation step remains the same
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for input_ids, attention_mask, target in val_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            target = target.to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, target)
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)

    print(f'Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

    # Early stopping
    early_stopping(avg_val_loss, model)

    if early_stopping.early_stop:
        print("Early stopping triggered. Stopping training.")
        # Load the best model weights
        model.load_state_dict(early_stopping.best_state_dict)
        break

Epoch 1/1, Iteration 0, Loss: 0.11718712002038956
Epoch 1/1, Iteration 1, Loss: 0.07155942916870117
Epoch 1/1, Iteration 2, Loss: 0.053454700857400894
Epoch 1/1, Iteration 3, Loss: 0.023946762084960938
Epoch 1/1, Iteration 4, Loss: 0.0246958639472723
Epoch 1/1, Iteration 5, Loss: 0.07407664507627487
Epoch 1/1, Iteration 6, Loss: 0.014067312702536583
Epoch 1/1, Iteration 7, Loss: 0.02000158280134201
Epoch 1/1, Iteration 8, Loss: 0.013367947190999985
Epoch 1/1, Iteration 9, Loss: 0.01789010316133499
Epoch 1/1, Iteration 10, Loss: 0.03334253281354904
Epoch 1/1, Iteration 11, Loss: 0.031421490013599396
Epoch 1/1, Iteration 12, Loss: 0.047140683978796005
Epoch 1/1, Iteration 13, Loss: 0.039095327258110046
Epoch 1/1, Iteration 14, Loss: 0.03222070261836052
Epoch 1/1, Iteration 15, Loss: 0.06271322071552277
Epoch 1/1, Iteration 16, Loss: 0.05057935416698456
Epoch 1/1, Iteration 17, Loss: 0.016565149649977684
Epoch 1/1, Iteration 18, Loss: 0.03681910037994385
Epoch 1/1, Iteration 19, Loss: 0.0

In [221]:
# Evaluation on test set
model.eval()
total_test_loss = 0

num_samples = 100
indices = np.random.choice(len(test_dataset), num_samples, replace=False)
sampler = SubsetRandomSampler(indices)
# Create a DataLoader with the sampler
test_loader = DataLoader(test_dataset, batch_size=batch_size, sampler=sampler, shuffle = False)
with torch.no_grad():
    for input_ids, attention_mask, target in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        target = target.to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, target)
        total_test_loss += loss.item()

    avg_test_loss = total_test_loss / len(test_loader)
    print(f'Test Loss: {avg_test_loss:.4f}')

# Inverse Transforming Predictions (Optional)
# Collect all predictions and targets
all_outputs = []
all_targets = []
with torch.no_grad():
    for input_ids, attention_mask, target in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        outputs = model(input_ids, attention_mask)
        all_outputs.append(outputs.cpu().numpy())
        all_targets.append(target.cpu().numpy())

# Concatenate the outputs and targets
all_outputs = np.concatenate(all_outputs, axis=0)
all_targets = np.concatenate(all_targets, axis=0)

# Inverse transform to original scale
outputs_original = scaler.inverse_transform(all_outputs)
targets_original = scaler.inverse_transform(all_targets)

# Compute metrics in original scale
from sklearn.metrics import mean_squared_error

for i, target_name in enumerate(targets):
    mse = mean_squared_error(targets_original[:, i], outputs_original[:, i])
    print(f'MSE for {target_name}: {mse:.4f}')

Test Loss: 0.0192
MSE for revenue: 55844221344546816.0000
MSE for rev_budget_ratio: 3.8730
MSE for rating: 0.8336
MSE for popularity: 48.8635
