In [106]:
import pandas as pd
import numpy as np
import joblib

In [107]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch_optimizer as optim2
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.model_selection import train_test_split

In [108]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [109]:
data = joblib.load(filename='data.joblib')
data

Unnamed: 0,encoded_summary,encoded_tags,Action,Adventure,Animation,Comedy,Crime,Drama,Fantasy,Horror,Musical,Mystery,Sci-Fi,Thriller,Others
0,"[tensor(0.0349), tensor(0.0235), tensor(-0.025...","[tensor(0.0476), tensor(-0.0124), tensor(0.024...",0,1,0,1,0,1,0,0,0,0,0,0,0
1,"[tensor(-0.0059), tensor(-0.0665), tensor(-0.0...","[tensor(0.0418), tensor(-0.0673), tensor(0.039...",0,0,0,0,0,1,0,0,0,0,0,0,0
2,"[tensor(0.0332), tensor(-0.0196), tensor(-0.00...","[tensor(0.0014), tensor(0.0178), tensor(-0.040...",0,0,0,0,0,1,0,0,0,0,0,0,0
3,"[tensor(-0.0526), tensor(0.0546), tensor(-0.06...","[tensor(0.0443), tensor(-0.1015), tensor(-0.01...",0,0,0,0,0,0,0,1,0,0,0,0,0
4,"[tensor(-0.0492), tensor(0.0136), tensor(-0.14...","[tensor(0.0294), tensor(-0.0400), tensor(-0.10...",1,0,0,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7733,"[tensor(0.0092), tensor(0.0664), tensor(-0.163...","[tensor(0.0636), tensor(-0.0622), tensor(-0.08...",0,0,0,1,0,1,1,0,0,0,0,0,0
7734,"[tensor(0.0011), tensor(0.0388), tensor(-0.100...","[tensor(-0.0060), tensor(-0.0640), tensor(-0.0...",0,0,0,0,0,1,0,0,0,1,0,1,0
7735,"[tensor(-0.0298), tensor(-0.0026), tensor(-0.0...","[tensor(0.0337), tensor(-0.0882), tensor(0.001...",0,0,0,0,0,1,0,0,0,0,0,0,0
7736,"[tensor(-0.0506), tensor(-0.0171), tensor(0.04...","[tensor(0.0048), tensor(-0.0420), tensor(0.012...",0,0,0,0,0,0,0,0,0,0,0,0,1


In [112]:
print(f'Length of each entry in "encoded summary" column is {data['encoded_summary'][0].shape}')
print(f'Length of each entry in "encoded tags" column is {data['encoded_tags'][0].shape}')

Length of each entry in "encoded summary" column is torch.Size([384])
Length of each entry in "encoded tags" column is torch.Size([384])


Each entry in the encoded summary column is a 1D tensor of size 384 <br>
Each entry in the encoded tag column is a list, that consists of 20 tensors, each of size 384

In [113]:
encoded_summaries = data['encoded_summary']
x=[]
for encodings in encoded_summaries:
    x.append(encodings)
encoded_summaries = torch.stack(x)

In [114]:
encoded_tags = data['encoded_tags']
x=[]
for encodings in encoded_tags:
    x.append(encodings)
encoded_tags = torch.stack(x)

In [131]:
y = data.drop(['encoded_summary','encoded_tags'],axis=1)
y = y.values
y = torch.tensor(y, dtype=torch.float32)

In [132]:
print(f'The size of my input : encoded_summary is {encoded_summaries.shape}')
print(f'The size of my input : encoded_tags is {encoded_tags.shape}')
print(f'The size of my input : y (genre_labels) is {y.shape}')

The size of my input : encoded_summary is torch.Size([7738, 384])
The size of my input : encoded_tags is torch.Size([7738, 384])
The size of my input : y (genre_labels) is torch.Size([7738, 13])


In [133]:
X_train_summary, X_test_summary, X_train_tags, X_test_tags, y_train, y_test = train_test_split(
    encoded_summaries, encoded_tags, y, test_size=0.1, random_state=42)

In [134]:
print(f'The size of my input : X_train_summary is {X_train_summary.shape}')
print(f'The size of my input : X_train_tags is {X_train_tags.shape}')
print(f'The size of my input : y_train (genre_labels) is {y_train.shape}')
print(f'The size of my input : X_test_summary is {X_test_summary.shape}')
print(f'The size of my input : X_test_tags is {X_test_tags.shape}')
print(f'The size of my input : y_test (genre_labels) is {y_test.shape}')

The size of my input : X_train_summary is torch.Size([6964, 384])
The size of my input : X_train_tags is torch.Size([6964, 384])
The size of my input : y_train (genre_labels) is torch.Size([6964, 13])
The size of my input : X_test_summary is torch.Size([774, 384])
The size of my input : X_test_tags is torch.Size([774, 384])
The size of my input : y_test (genre_labels) is torch.Size([774, 13])


In [135]:
train_dataset = TensorDataset(X_train_summary, X_train_tags, y_train)
test_dataset = TensorDataset(X_test_summary, X_test_tags, y_test)

In [136]:
class MovieDataset(Dataset):
    def __init__(self, summaries, tags, labels):
        self.summaries = summaries
        self.tags = tags
        self.labels = labels

    def __len__(self):
        return len(self.summaries)

    def __getitem__(self, idx):
        summary = self.summaries[idx]
        tags = self.tags[idx]
        label = self.labels[idx]
        return summary, tags, label

In [126]:
class MovieGenreClassifier(nn.Module):
    def __init__(self):
        super(MovieGenreClassifier, self).__init__()
        self.summary_fc = nn.Linear(384, 256)
        self.tags_fc = nn.Linear(384, 256)
        self.combined_fc1 = nn.Linear(512, 256)
        self.combined_fc2 = nn.Linear(256, 128)
        self.combined_fc3 = nn.Linear(128, 64)
        self.combined_fc4 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 13)

    def forward(self, summary=None, tags=None):
        if summary is not None:
            summary_out = torch.relu(self.summary_fc(summary))
        else:
            summary_out = torch.zeros((tags.size(0), 256)).to(tags.device)

        if tags is not None:
            tags_out = torch.relu(self.tags_fc(tags))
        else:
            tags_out = torch.zeros((summary.size(0), 256)).to(summary.device)
        
        combined = torch.cat((summary_out, tags_out), dim=1)
        combined = torch.relu(self.combined_fc1(combined))
        combined = torch.relu(self.combined_fc2(combined))
        combined = torch.relu(self.combined_fc3(combined))
        combined = torch.relu(self.combined_fc4(combined))
        output = torch.sigmoid(self.output(combined))
        return output

In [137]:
batch_size = 32
num_epochs = 300

In [138]:
dataset = MovieDataset(encoded_summaries, encoded_tags, y)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [145]:
model = MovieGenreClassifier()
criterion =  nn.BCEWithLogitsLoss()
optimizer = optim2.RangerVA(model.parameters(), lr=0.001)

In [148]:
model.to(device)
prev_loss = float('inf')
patience = 10 
counter = 0
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    train_loss = 0.0
    for batch in train_dataloader:
        summaries, tags, labels = batch
        summaries, tags, labels = summaries.to(device), tags.to(device), labels.to(device)
        outputs = model(summary=summaries, tags=tags)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * summaries.size(0)
    train_loss = train_loss / len(train_dataloader.dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.16f}")
    
    # Early stopping criteria
    if train_loss >= prev_loss:
        counter += 1
    else:
        counter = 0
        prev_loss = train_loss
    
    if counter >= patience:
        print(f'Early stopping after epoch {epoch+1} as loss has not improved.')
        break

Epoch [1/300], Train Loss: 0.6931601892183185
Epoch [2/300], Train Loss: 0.6931600917830404
Epoch [3/300], Train Loss: 0.6931599997570295
Epoch [4/300], Train Loss: 0.6931599024244590
Epoch [5/300], Train Loss: 0.6931598138905064
Epoch [6/300], Train Loss: 0.6931597186805598
Epoch [7/300], Train Loss: 0.6931596323377027
Epoch [8/300], Train Loss: 0.6931595410991167
Epoch [9/300], Train Loss: 0.6931594507164273
Epoch [10/300], Train Loss: 0.6931593751921042
Epoch [11/300], Train Loss: 0.6931592878221712
Epoch [12/300], Train Loss: 0.6931591979872558
Epoch [13/300], Train Loss: 0.6931591191420536
Epoch [14/300], Train Loss: 0.6931590353668866
Epoch [15/300], Train Loss: 0.6931589544332966
Epoch [16/300], Train Loss: 0.6931588752457357
Epoch [17/300], Train Loss: 0.6931587955446369
Epoch [18/300], Train Loss: 0.6931587232727213
Epoch [19/300], Train Loss: 0.6931586432977356
Epoch [20/300], Train Loss: 0.6931585649318356
Epoch [21/300], Train Loss: 0.6931584876272474
Epoch [22/300], Train 

In [149]:
all_predictions = []
true_labels = []
model.eval()
test_loss = 0.0

with torch.no_grad():
    for batch in test_dataloader:
        summaries, tags, labels = batch
        summaries, tags, labels = summaries.to(device), tags.to(device), labels.to(device)
        outputs = model(summary=summaries, tags=tags)
        loss = criterion(outputs, labels)
        probabilities = torch.sigmoid(outputs)
        test_loss += loss.item() * summaries.size(0)
        all_predictions.append(probabilities.cpu().numpy())
        true_labels.append(labels.cpu().numpy())


test_loss = test_loss / len(test_dataloader.dataset)
all_predictions = np.concatenate(all_predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)
print(f"Test Loss: {test_loss:.4f}")

Test Loss: 0.6932


In [150]:
print("Example Predictions:")
print(all_predictions[:10])
print("Example True Labels:")
print(true_labels[:10])

Example Predictions:
[[0.50000215 0.50000226 0.5000006  0.5000042  0.50000215 0.5000178
  0.500002   0.50000423 0.5000013  0.50000185 0.50000244 0.500003
  0.5000093 ]
 [0.5000018  0.5000019  0.5000005  0.5000035  0.5000018  0.5000154
  0.50000167 0.5000036  0.50000113 0.50000155 0.500002   0.5000025
  0.500008  ]
 [0.5000018  0.50000197 0.5000005  0.5000036  0.5000018  0.5000157
  0.5000017  0.50000364 0.50000113 0.50000155 0.5000021  0.50000256
  0.5000081 ]
 [0.50000185 0.50000197 0.50000054 0.50000364 0.50000185 0.5000159
  0.5000017  0.50000376 0.50000113 0.5000016  0.50000215 0.50000256
  0.5000082 ]
 [0.50000197 0.50000215 0.50000054 0.50000393 0.50000197 0.5000169
  0.50000185 0.500004   0.50000125 0.5000017  0.50000226 0.5000028
  0.5000088 ]
 [0.5000021  0.50000226 0.5000006  0.5000041  0.5000021  0.50001764
  0.50000197 0.50000423 0.5000013  0.5000018  0.5000024  0.5000029
  0.5000092 ]
 [0.50000215 0.50000226 0.5000006  0.5000042  0.50000215 0.5000179
  0.500002   0.5000042