# Trends Analysis

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
import ast
import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences

In [5]:
# Load data

# Load credits.csv
credits = pd.read_csv('./raw-data/credits.csv')

# Load keywords.csv
keywords = pd.read_csv('./raw-data/keywords.csv')

# Load links.csv
links = pd.read_csv('./raw-data/links.csv')

# Load movies_metadata.csv
movies_metadata = pd.read_csv('./raw-data/movies_metadata.csv', low_memory=False)

# Load ratings.csv
ratings = pd.read_csv('./raw-data/ratings.csv')

In [6]:
# JSON to Pandas DataFrames

# Function to safely parse JSON-like strings
def parse_json_column(df, column_name):
    def parse_json(x):
        if pd.isna(x):
            return []
        try:
            return ast.literal_eval(x)
        except:
            return []
    df[column_name] = df[column_name].apply(parse_json)
    return df

# Function to flatten list of dictionaries into a DataFrame
def flatten_column(df, column_name):
    flattened_data = df[column_name].apply(pd.Series).stack().reset_index(level=1, drop=True)
    flattened_df = pd.DataFrame(list(flattened_data))
    return flattened_df

In [7]:
# Parse 'cast' and 'crew' in credits
credits = parse_json_column(credits, 'cast')
credits = parse_json_column(credits, 'crew')

# Parse 'keywords' in keywords
keywords = parse_json_column(keywords, 'keywords')

# Parse relevant columns in movies_metadata
json_columns = ['genres', 'production_companies', 'production_countries', 'spoken_languages', 'belongs_to_collection']
for column in json_columns:
    movies_metadata = parse_json_column(movies_metadata, column)

In [8]:
def get_top_actors(cast_list, n=3):
    """Extract top N actor names from the cast list."""
    if isinstance(cast_list, list):
        # Sort cast by 'order' to get the billing order
        cast_list = sorted(cast_list, key=lambda x: x.get('order', 999))
        actors = [member.get('name') for member in cast_list[:n] if member.get('name')]
        return actors
    return []

credits['main_actors'] = credits['cast'].apply(get_top_actors)

In [9]:
def get_director(crew_list):
    """Extract the director's name from the crew list."""
    if isinstance(crew_list, list):
        for member in crew_list:
            if member.get('job') == 'Director':
                return member.get('name')
    return np.nan

credits['director'] = credits['crew'].apply(get_director)

In [10]:
def get_keywords(keywords_list):
    """Extract keyword names."""
    return [kw.get('name') for kw in keywords_list if kw.get('name')]

keywords['keyword_list'] = keywords['keywords'].apply(get_keywords)

In [11]:
def get_production_countries(production_countries_list):
    """Extract country codes from production_countries."""
    return [country.get('iso_3166_1') for country in production_countries_list if country.get('iso_3166_1')]

movies_metadata['production_countries_list'] = movies_metadata['production_countries'].apply(get_production_countries)

TypeError: 'float' object is not iterable

In [None]:
movies_metadata['original_language'] = movies_metadata['original_language'].fillna('Unknown')

In [None]:
def has_collection(collection):
    """Determine if a movie is part of a collection."""
    return 1 if isinstance(collection, dict) else 0

movies_metadata['has_collection'] = movies_metadata['belongs_to_collection'].apply(has_collection)

In [None]:
# Convert 'release_date' to datetime
movies_metadata['release_date'] = pd.to_datetime(movies_metadata['release_date'], errors='coerce')

In [None]:
# Convert 'id', 'budget', 'revenue', 'popularity' to numeric
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce')
movies_metadata['budget'] = pd.to_numeric(movies_metadata['budget'], errors='coerce')
movies_metadata['revenue'] = pd.to_numeric(movies_metadata['revenue'], errors='coerce')
movies_metadata['popularity'] = pd.to_numeric(movies_metadata['popularity'], errors='coerce')
movies_metadata['vote_average'] = pd.to_numeric(movies_metadata['vote_average'], errors='coerce')
movies_metadata['vote_count'] = pd.to_numeric(movies_metadata['vote_count'], errors='coerce')

In [None]:
# Convert 'movieId' in ratings and 'tmdbId' in links to numeric
ratings['movieId'] = pd.to_numeric(ratings['movieId'], errors='coerce')
links['movieId'] = pd.to_numeric(links['movieId'], errors='coerce')
links['tmdbId'] = pd.to_numeric(links['tmdbId'], errors='coerce')

In [None]:
# Merge ratings and links on 'movieId'
ratings_links = ratings.merge(links[['movieId', 'tmdbId']], on='movieId', how='left')

In [None]:
# Group by 'tmdbId' to compute average ratings and count of ratings
average_ratings = ratings_links.groupby('tmdbId')['rating'].agg(['mean', 'count']).reset_index()
average_ratings.rename(columns={'mean': 'average_rating', 'count': 'rating_count'}, inplace=True)

In [None]:
# Remove entries with missing 'tmdbId'
average_ratings = average_ratings[average_ratings['tmdbId'].notnull()]
average_ratings['tmdbId'] = average_ratings['tmdbId'].astype(int)

In [None]:
# Ensure 'id' columns are of the same type
credits['id'] = pd.to_numeric(credits['id'], errors='coerce')
keywords['id'] = pd.to_numeric(keywords['id'], errors='coerce')

# Merge credits and keywords with movies_metadata
df = movies_metadata.merge(credits[['id', 'main_actors', 'director']], on='id', how='left')
df = df.merge(keywords[['id', 'keyword_list']], on='id', how='left')

In [None]:
# Merge average ratings with df on 'id' and 'tmdbId'
df = df.merge(average_ratings, left_on='id', right_on='tmdbId', how='left')

In [None]:
# Select relevant features
df = df[['id', 'title', 'release_date', 'main_actors', 'director', 'keyword_list',
         'production_countries_list', 'original_language', 'has_collection',
         'budget', 'revenue', 'popularity', 'average_rating', 'rating_count', 'vote_average', 'vote_count']]

In [None]:
# Drop rows with missing release dates
df = df.dropna(subset=['release_date'])

# Fill missing values in lists with empty lists
df['main_actors'] = df['main_actors'].apply(lambda x: x if isinstance(x, list) else [])
df['keyword_list'] = df['keyword_list'].apply(lambda x: x if isinstance(x, list) else [])
df['production_countries_list'] = df['production_countries_list'].apply(lambda x: x if isinstance(x, list) else [])

# Fill missing numerical values with 0
numerical_columns = ['budget', 'revenue', 'popularity', 'average_rating', 'rating_count', 'vote_average', 'vote_count']
df[numerical_columns] = df[numerical_columns].fillna(0)

In [None]:
# Avoid division by zero
df['budget'] = df['budget'].replace(0, np.nan)
df['revenue_budget_ratio'] = df['revenue'] / df['budget']
df['revenue_budget_ratio'] = df['revenue_budget_ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
df['budget'] = df['budget'].fillna(0)

In [None]:
# Combine 'average_rating' and 'vote_average'
df['final_rating'] = df.apply(lambda x: x['average_rating'] if x['average_rating'] > 0 else x['vote_average'], axis=1)

In [None]:
df = df.sort_values(by='release_date').reset_index(drop=True)

In [None]:
# Initialize tokenizers for different features

# Initialize vocabularies
actor_vocab = {}
director_vocab = {}
keyword_vocab = {}
country_vocab = {}
language_vocab = {}

# Build actor vocabulary
actor_counter = 1  # Starting index
for actors in df['main_actors']:
    for actor in actors:
        if actor not in actor_vocab:
            actor_vocab[actor] = actor_counter
            actor_counter += 1
            
# Build director vocabulary
director_counter = 1
for director in df['director']:
    if director not in director_vocab:
        director_vocab[director] = director_counter
        director_counter += 1

# Build keyword vocabulary
keyword_counter = 1
for keywords in df['keyword_list']:
    for keyword in keywords:
        if keyword not in keyword_vocab:
            keyword_vocab[keyword] = keyword_counter
            keyword_counter += 1
            
# Build country vocabulary
country_counter = 1
for countries in df['production_countries_list']:
    for country in countries:
        if country not in country_vocab:
            country_vocab[country] = country_counter
            country_counter += 1
            
# Build language vocabulary
language_counter = 1
for lang in df['original_language']:
    if lang not in language_vocab:
        language_vocab[lang] = language_counter
        language_counter += 1
        
        


In [None]:
# Map actors to sequences
df['actor_seq'] = df['main_actors'].apply(lambda actors: [actor_vocab[actor] for actor in actors])

# Map director to sequence
df['director_seq'] = df['director'].apply(lambda director: [director_vocab[director]])

# Map keywords to sequences
df['keyword_seq'] = df['keyword_list'].apply(lambda keywords: [keyword_vocab[keyword] for keyword in keywords])


# Map countries to sequences
df['country_seq'] = df['production_countries_list'].apply(lambda countries: [country_vocab[country] for country in countries])


# Map language to sequence
df['language_seq'] = df['original_language'].apply(lambda lang: [language_vocab[lang]])


In [None]:
def pad_sequence(seq, maxlen):
    seq = seq[:maxlen]
    seq = seq + [0] * (maxlen - len(seq))
    return seq

In [None]:
# Define maximum lengths for each sequence
max_actor_length = 3  # Since we have top 3 actors
max_director_length = 1
max_keyword_length = 10  # Adjust based on data
max_country_length = 3
max_language_length = 1

# Pad actor sequences
df['actor_seq_padded'] = df['actor_seq'].apply(lambda seq: pad_sequence(seq, max_actor_length))

# Pad director sequences
df['director_seq_padded'] = df['director_seq'].apply(lambda seq: pad_sequence(seq, max_director_length))

# Pad keyword sequences
df['keyword_seq_padded'] = df['keyword_seq'].apply(lambda seq: pad_sequence(seq, max_keyword_length))

# Pad country sequences
df['country_seq_padded'] = df['country_seq'].apply(lambda seq: pad_sequence(seq, max_country_length))

# Pad language sequences
df['language_seq_padded'] = df['language_seq'].apply(lambda seq: pad_sequence(seq, max_language_length))

In [None]:
def combine_sequences(row):
    sequences = []
    sequences.extend(row['actor_seq_padded'])
    sequences.extend(row['director_seq_padded'])
    sequences.extend(row['keyword_seq_padded'])
    sequences.extend(row['country_seq_padded'])
    sequences.extend(row['language_seq_padded'])
    sequences.append(row['has_collection'])
    return sequences

df['input_sequence'] = df.apply(combine_sequences, axis=1)

In [None]:
# Convert input sequences to NumPy array
X = np.array(df['input_sequence'].tolist())

# Extract target variables
y_revenue = df['revenue'].values
y_rev_budget_ratio = df['revenue_budget_ratio'].values
y_rating = df['final_rating'].values
y_popularity = df['popularity'].values

# Combine target variables into a single array
y_targets = np.vstack((y_revenue, y_rev_budget_ratio, y_rating, y_popularity)).T

In [None]:
# Scale target variables using StandardScaler
scaler = StandardScaler()
y_targets_scaled = scaler.fit_transform(y_targets)

In [None]:
# Define weights for each metric (adjust as needed)
weights = np.array([0.4, 0.3, 0.2, 0.1])

# Compute the combined target variable
combined_target = y_targets_scaled @ weights

In [None]:
# Append combined target as the last column
y_targets_combined = np.hstack((y_targets_scaled, combined_target.reshape(-1, 1)))

In [None]:
class MovieDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.LongTensor(X)
        self.y = torch.FloatTensor(y)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y_targets_combined, test_size=0.2, shuffle=False)

In [None]:
train_dataset = MovieDataset(X_train, y_train)
test_dataset = MovieDataset(X_test, y_test)

batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
class MovieLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=1):
        super(MovieLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        embeds = self.embedding(x)
        # Assuming sequence length is small, we can get the last output
        lstm_out, _ = self.lstm(embeds)
        # Use the last output of the LSTM
        out = lstm_out[:, -1, :]
        out = self.fc(out)
        return out

In [None]:
# Parameters
# Calculate total vocabulary size
vocab_size = max(
    max(actor_vocab.values(), default=0),
    max(director_vocab.values(), default=0),
    max(keyword_vocab.values(), default=0),
    max(country_vocab.values(), default=0),
    max(language_vocab.values(), default=0)
) + 1  # Plus one for padding (index 0)

embedding_dim = 128
hidden_dim = 64
output_dim = y_targets_combined.shape[1]  # Number of target variables (including combined target)

model = MovieLSTM(vocab_size, embedding_dim, hidden_dim, output_dim)

In [None]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(train_dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss:.4f}')

In [None]:
model.eval()
test_loss = 0.0
with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        test_loss += loss.item() * inputs.size(0)
    
    test_loss /= len(test_dataset)
    print(f'Test Loss: {test_loss:.4f}')