# Pipeline:
After importing the necessary functions, we load the data from the 'movie_labels.csv' file. From it we build movie_labels, which holds a 0/1 label for every movie and every genre. Additionally, for every genre, we make the lists of all movies, the movies of that genre, and the movies not belonging to that genre. Then, for every genre, we use these lists and the number of movies in that genre to create the training, validation, and test splits of the data.

In *Creating my own way to use and load the data* we use these splits and get the labels of every movie for every genre. Then, these are used to create the dataloaders. Now we have everything we need to start training the models. We create the train_model and test_model functions. We also load a resnet18 model, pretrained on ImageNet, as the starting model. Then, using the train_model function, we train a classifier for every genre.

In *New training code, for using the profiles in the validation*, we start working on a different classifier for every genre. We want to take a look at what happens if we use profile-level validation instead of the single-image-level validation that we used for the first kind of models. To do this, we have to load the profiles from 'user_profiles.csv'. Then, we can get the labels of all the profiles, based on the fraction of movies in the profile that belong to the genre we're looking at: a profile is labelled positive when at least 25% of its movies have that genre. Now, we again make the lists of positive elements and negative elements for every genre. Before, these lists contained single movies; here they contain profiles. Then, we create the training, validation, and test splits, for which we get the labels and create the dataloaders.

With the newly created new_train_model function, we train these other classifiers. We use the same training set as before, but we use the profiles for validation. After training these models as well, we use the test_model function. With this function we can see how well all models perform on the single-image level test split.

In [None]:
%matplotlib inline

In [None]:
# License: BSD
# Author: Sasank Chilamkurthy
from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy

import csv
import random
import math

from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

plt.ion()   # interactive mode

# Connect to drive and unzip the images


In [None]:
# Mount Google Drive at /content/drive; later cells read their inputs from
# the relative path drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Extract the image archive from Drive into the current working directory.
# NOTE(review): the Dataset class reads 'movies_data_split/<id>.jpg', so the
# archive presumably unpacks into a movies_data_split/ folder - confirm.
!unzip 'drive/MyDrive/movies_data_split.zip'

# Load the data and create the train/val/test splits

In [None]:
def updateDict(movies, genre, movie_id):
  """Append ``movie_id`` to the list stored under ``genre`` and return that list.

  If ``genre`` is already a key of ``movies``, its list is extended in place
  and returned. Otherwise a new one-element list is returned; the mapping
  itself is never assigned to, so the caller has to store the result back.
  """
  bucket = movies[genre] if genre in movies else []
  bucket.append(movie_id)
  return bucket

In [None]:
# Position of every genre inside the 0/1 genre array of a movie.
genre_dict = {'Action':0, 'Adventure':1, 'Animation_Children\'s':2, 'Comedy':3, 'Crime':4,
              'Documentary':5, 'Drama':6, 'Fantasy_Sci-Fi':7, 'Film-Noir':8, 'Horror_Thriller':9,
              'Musical':10, 'Mystery':11, 'Romance':12, 'War':13, 'Western':14}
row_names = ['movie_id', 'genre_array']

# count[genre]      -> number of movies carrying the genre
# all_movies[genre] -> every movie id, in file order
# movies[genre]     -> ids of the movies that carry the genre
# non_movies[genre] -> ids of the movies that do not carry the genre
# test_movies       -> declared here but not filled in this cell
count = {}
all_movies = {}
movies = {}
non_movies = {}
test_movies = {}

# movie_labels[movie_id][genre] -> 1 or 0; used later when loading the user profiles.
movie_labels = {}

# Deletes '[', ' ' and ']' so that "[0, 1, ...]" becomes "0,1,...".
my_table = str.maketrans('', '', '[ ]')

with open('drive/MyDrive/movie_labels.csv', 'r', encoding = "ISO-8859-1") as f:
  reader = csv.DictReader(f, fieldnames=row_names, delimiter=',')
  for row in reader:
    movie_id = int(row['movie_id'])
    genre_array = [int(flag) for flag in row['genre_array'].translate(my_table).split(',')]

    labels_of_movie = {}
    movie_labels[movie_id] = labels_of_movie

    for genre, column in genre_dict.items():
      # Every movie is listed under every genre in all_movies.
      all_movies[genre] = updateDict(all_movies, genre, movie_id)

      if genre_array[column] == 1:
        count[genre] = count.get(genre, 0) + 1
        movies[genre] = updateDict(movies, genre, movie_id)
        labels_of_movie[genre] = 1
      else:
        non_movies[genre] = updateDict(non_movies, genre, movie_id)
        labels_of_movie[genre] = 0

print("Done")

In [None]:
# Print the nested dict movie_labels[movie_id][genre] (values 0 or 1) to
# inspect how it is structured.
print(movie_labels)

In [None]:
# Per-genre split sizes, counted in positive movies. Every split receives the
# same number of negative movies, so each split is balanced.
train_N = {}
val_N   = {}
test_N  = {}

# Per-genre lists of movie ids: positives first, then negatives.
train_split = {}
val_split   = {}
test_split  = {}

for genre in genre_dict.keys():
  n_positive = count[genre]

  val_N[genre] = math.floor(n_positive*0.1)
  test_N[genre] = math.floor(n_positive*0.1)
  # Give the remainder to training so the three sizes add up to n_positive.
  # With ceil(0.8 * n) the sizes could add up to n - 1, which left one extra
  # positive (and no matching negative) in the test split.
  train_N[genre] = n_positive - val_N[genre] - test_N[genre]

  val_start = train_N[genre]
  test_start = val_start + val_N[genre]
  test_end = test_start + test_N[genre]

  positives = movies[genre]
  negatives = non_movies[genre]

  train_split[genre] = positives[:val_start] + negatives[:val_start]
  val_split[genre]   = positives[val_start:test_start] + negatives[val_start:test_start]
  # Both slices are bounded so the test split stays balanced as well.
  test_split[genre]  = positives[test_start:test_end] + negatives[test_start:test_end]

print(train_N)
print(val_N)
print(test_N)

# Creating my own way to use and load the data

## Creating the dataloaders

In [None]:
# partition[genre][phase]    -> list of movie ids in that split
# <phase>_labels[genre][id]  -> LongTensor([1]) if the movie has the genre, else LongTensor([0])
partition = {}
train_labels = {}
val_labels   = {}
test_labels  = {}

for genre in genre_dict.keys():
  partition[genre] = {'train':train_split[genre], 'val':val_split[genre], 'test':test_split[genre]}

  # Look positives up in a set: testing membership in the movies[genre] list
  # for every movie made this loop quadratic in the number of movies.
  positive_ids = set(movies[genre])

  train_labels[genre] = {movie: torch.LongTensor([1 if movie in positive_ids else 0])
                         for movie in train_split[genre]}
  val_labels[genre]   = {movie: torch.LongTensor([1 if movie in positive_ids else 0])
                         for movie in val_split[genre]}
  test_labels[genre]  = {movie: torch.LongTensor([1 if movie in positive_ids else 0])
                         for movie in test_split[genre]}

In [None]:
class Dataset(torch.utils.data.Dataset):
  """Map-style PyTorch dataset that loads movie images from disk by movie id.

  Args:
    list_IDs: sequence of ids; sample ``i`` is the image of ``list_IDs[i]``.
    labels: mapping from id to an indexable label container; its element 0
      is returned as the target.
    YOUR_TRANSFORM: callable applied to the RGB PIL image; its result is
      returned as the sample.
    image_dir: directory that holds the ``<id>.jpg`` files.
  """

  def __init__(self, list_IDs, labels, YOUR_TRANSFORM, image_dir='movies_data_split'):
    """Store the ids, labels, transform and image directory."""
    self.labels = labels
    self.list_IDs = list_IDs
    self.YOUR_TRANSFORM = YOUR_TRANSFORM
    self.image_dir = image_dir

  def __len__(self):
    """Return the total number of samples."""
    return len(self.list_IDs)

  def __getitem__(self, index):
    """Return ``(transformed image, label)`` for the sample at ``index``."""
    ID = self.list_IDs[index]

    path = os.path.join(self.image_dir, str(ID) + '.jpg')
    # The context manager releases the file handle even when decoding fails.
    with Image.open(path) as image:
      rgb_image = image.convert('RGB')

    X = self.YOUR_TRANSFORM(rgb_image)
    y = self.labels[ID][0]

    return X, y

In [None]:
# Random crop + flip augmentation for training; a fixed resize + centre crop
# for validation and test. All three normalise with the same channel statistics.

# ZL: maybe start with this setup. If it does not work, we can discuss how to improve.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),#ZL: think about your own case
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std)
    ]),
}
# 'val' and 'test' use the same deterministic steps, built once per phase.
for eval_phase in ('val', 'test'):
  data_transforms[eval_phase] = transforms.Compose([
      transforms.Resize(256),
      transforms.CenterCrop(224),
      transforms.ToTensor(),
      transforms.Normalize(channel_mean, channel_std)
  ])

dataloaders = {}
dataset_sizes = {}

for genre in genre_dict:
  splits = partition[genre]

  train_set = Dataset(splits['train'], train_labels[genre], data_transforms['train'])
  val_set = Dataset(splits['val'], val_labels[genre], data_transforms['val'])
  test_set = Dataset(splits['test'], test_labels[genre], data_transforms['test'])

  # One loader per split, all with the same batch size, shuffling and worker count.
  train_generator, val_generator, test_generator = (
      torch.utils.data.DataLoader(split_set, batch_size=64, shuffle=True, num_workers=2)
      for split_set in (train_set, val_set, test_set)
  )

  dataset_sizes[genre] = {'train': len(splits['train']),
                          'val': len(splits['val']),
                          'test': len(splits['test'])}

  # class_names
  class_names=[0,1]

  dataloaders[genre] = dict(train=train_generator, val=val_generator, test=test_generator)

# Use the first GPU when one is available, the CPU otherwise.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Train_model function

In [None]:
def train_model(model, criterion, optimizer, scheduler, genre, num_epochs=25):
    """Train ``model`` for one genre and return it with its best validation weights.

    Each epoch runs a training pass over ``dataloaders[genre]['train']`` and
    then an evaluation pass over ``dataloaders[genre]['val']``. The scheduler
    is stepped once per epoch, after the training pass. The weights of the
    epoch with the highest validation accuracy are loaded back into ``model``
    before it is returned.

    Reads the module-level ``dataloaders``, ``dataset_sizes`` and ``device``.
    """
    started = time.time()

    best_state = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train', 'val']:
            is_training = phase == 'train'
            if is_training:
                model.train()
            else:
                model.eval()

            loss_total = 0.0
            correct_total = 0

            for inputs, labels in dataloaders[genre][phase]:
                inputs, labels = inputs.to(device), labels.to(device)

                optimizer.zero_grad()

                # Gradients are only tracked during the training pass.
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    preds = torch.max(outputs, 1)[1]
                    loss = criterion(outputs, labels)

                    if is_training:
                        loss.backward()
                        optimizer.step()

                # loss is a batch mean, so weight it by the batch size.
                loss_total += loss.item() * inputs.size(0)
                correct_total += torch.sum(preds == labels.data)

            if is_training:
                scheduler.step()

            epoch_loss = loss_total / dataset_sizes[genre][phase]
            epoch_acc = correct_total.double() / dataset_sizes[genre][phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Remember the weights of the best validation epoch so far.
            if not is_training and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_state = copy.deepcopy(model.state_dict())

        print()

    minutes, seconds = divmod(time.time() - started, 60)
    print('Training complete in {:.0f}m {:.0f}s'.format(
        minutes, seconds))
    print('Best val Acc: {:4f}'.format(best_acc))

    model.load_state_dict(best_state)
    return model

## Test_model function

In [None]:
def test_model(model, genre):
  """Evaluate ``model`` on the single-image test split of ``genre``.

  Prints the accuracy over all batches of ``dataloaders[genre]['test']``.

  Args:
    model: classifier whose output has one score per class for each input.
    genre: key into the module-level ``dataloaders``.

  Returns:
    The predicted class indices of the last batch (unchanged from the
    original behaviour), or an empty LongTensor when the loader yields no
    batches.
  """
  model.eval()   # Set model to evaluate mode

  # Run on the device the model already lives on. Choosing the device
  # independently of the model fails when a CPU-loaded model meets a CUDA
  # runtime, which the former per-genre CPU list worked around.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  n_correct = 0
  n_seen = 0
  preds = torch.empty(0, dtype=torch.long)

  # No gradients are needed for evaluation; this avoids building the graph.
  with torch.no_grad():
    for inputs, labels in dataloaders[genre]['test']:
      inputs = inputs.to(device)
      labels = labels.to(device)

      outputs = model(inputs)
      _, preds = torch.max(outputs, 1)

      n_correct += int(torch.sum(preds == labels))
      n_seen += labels.size(0)

  # The loader covers every sample once, so n_seen is the test split size.
  acc = n_correct / n_seen if n_seen else 0.0
  print('test Acc: {:.4f}'.format(acc))

  return preds

## Load a pretrained model and reset final fully connected layer.




In [None]:
# Pretrained ResNet-18 whose final fully connected layer is replaced by a
# fresh two-output layer (genre absent / genre present).
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
# The output size is fixed to 2 here; nn.Linear(num_ftrs, len(class_names))
# would be the general form.
model_ft.fc = nn.Linear(in_features=num_ftrs, out_features=2)

model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# All parameters of the network are optimised, not only the new layer.
optimizer_ft = optim.SGD(params=model_ft.parameters(), lr=0.001, momentum=0.9)

# Multiply the learning rate by 0.1 every 7 scheduler steps.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer=optimizer_ft, step_size=7, gamma=0.1)

## Train the models




In [None]:
# Maps each genre name to the model returned by train_model for that genre.
trained_models = {}

In [None]:
# Train one independent classifier per genre and save each to disk.
for genre in genre_dict.keys():
  print(genre)

  # Every genre starts from its own copy of the pretrained network, with its
  # own optimizer and LR schedule (same hyper-parameters as the setup cell).
  # Passing model_ft / optimizer_ft / exp_lr_scheduler directly would keep
  # training the previous genre's weights with an already-decayed learning
  # rate, and every trained_models entry would be the same module object.
  genre_model = copy.deepcopy(model_ft)
  genre_optimizer = optim.SGD(genre_model.parameters(), lr=0.001, momentum=0.9)
  genre_scheduler = lr_scheduler.StepLR(genre_optimizer, step_size=7, gamma=0.1)

  trained_models[genre] = train_model(genre_model, criterion, genre_optimizer, genre_scheduler,
                          genre, num_epochs=25)
  torch.save(trained_models[genre], '{}_model'.format(genre))

# New training code, for using the profiles in the validation.

## Load the profiles

In [None]:
row_names = ['user', 'movies']

# user_labels[user][genre][movie_id] -> 0/1 label of that movie for that genre.
user_labels = {}

with open('drive/MyDrive/user_profiles.csv', 'r', encoding = "ISO-8859-1") as f:
    reader = csv.DictReader(f, fieldnames=row_names, delimiter=',')
    for row in reader:
      user = row['user']

      # Strip '[', ' ' and ']' so that "[1, 2, 3]" becomes "1,2,3".
      my_table = row['movies'].maketrans('', '', '[ ]')
      movie_string_array = row['movies'].translate(my_table).split(',')

      # Stored under its own name: rebinding the global `movies` here would
      # replace the per-genre dict that the single-movie label cell reads.
      # Movie 1107 is left out of every profile, so it is filtered once here
      # instead of once per genre.
      # NOTE(review): the reason for excluding 1107 is not stated in this file.
      profile_movies = [int(m) for m in movie_string_array]
      profile_movies = [movie for movie in profile_movies if movie != 1107]

      user_labels[user] = {
          genre: {movie: movie_labels[movie][genre] for movie in profile_movies}
          for genre in genre_dict.keys()
      }

## The true labels of the profiles


In [None]:
# true_labels[user][genre] -> 1 when at least 25% of the movies in the
# user's profile carry the genre, else 0.
true_labels = {}

for user in user_labels.keys():
  true_labels[user] = {}
  for genre in genre_dict.keys():
    # Local names on purpose: assigning to `movies` or `count` here would
    # overwrite the module-level containers that other cells rely on.
    genre_flags = user_labels[user][genre]
    n_positive = sum(genre_flags.values())

    # An empty profile has no positive movies; treat it as negative rather
    # than dividing by zero.
    fraction = n_positive/len(genre_flags) if genre_flags else 0.0
    true_labels[user][genre] = 1 if fraction >= 0.25 else 0

## The list of sampled users

In [None]:
# Get the count to make train, val, test splits.
count = {}
users = {}
non_users = {}

for genre in genre_dict.keys():
  count[genre] = 0
  users[genre] = []
  non_users[genre] = []

  for u in true_labels.keys():
    if true_labels[u][genre] == 1:
      users[genre].append(u)
      count[genre] +=1
    else:
      non_users[genre].append(u)
  
  # The count should be the amount of the smallest side (1's or 0's)
  # to be able to get an equal amount of 1's and 0's in every split.
  if count[genre] > len(true_labels)/2: # Comedy, Drama
    count[genre] = len(true_labels) - count[genre]
  print(genre, count[genre])

## The train/val/test splits

In [None]:
# Per-genre split sizes, counted per class: each split holds that many
# positive profiles and the same number of negative profiles.
train_N = {}
val_N   = {}
test_N  = {}

# Per-genre lists of users: positives first, then negatives.
train_split = {}
val_split   = {}
test_split  = {}

for genre in genre_dict.keys():
  n_per_class = count[genre]

  val_N[genre] = math.floor(n_per_class*0.1)
  test_N[genre] = math.floor(n_per_class*0.1)

  if val_N[genre] == 0: # There are not a lot of profiles with documentary as 1.
    val_N[genre] = 1
    test_N[genre] = 1

  # Never negative: a negative size would turn the slices below into
  # "everything but the last k" slices.
  train_N[genre] = max(n_per_class - val_N[genre] - test_N[genre], 0)

  val_start = train_N[genre]
  test_start = val_start + val_N[genre]
  test_end = test_start + test_N[genre]

  positives = users[genre]
  negatives = non_users[genre]

  train_split[genre] = positives[:val_start] + negatives[:val_start]
  val_split[genre]   = positives[val_start:test_start] + negatives[val_start:test_start]
  # Bound the positive slice too. count[genre] is the size of the smaller
  # class, so an open-ended slice put every remaining positive profile into
  # the test split whenever positives were the majority.
  test_split[genre]  = positives[test_start:test_end] + negatives[test_start:test_end]

print(train_N)
print(val_N)
print(test_N)

## Labels for dataloaders

In [None]:
# labels[genre][phase][movie]       -> LongTensor([0 or 1]) label of the movie
# movies[genre][phase]              -> de-duplicated movies of all profiles in the split
# movie_index[genre][phase][movie]  -> position of the movie in movies[genre][phase]
# profile_labels[phase][genre][user]-> 1 when >= 25% of the profile has the genre, else 0
labels = {}
movies = {}
movie_index = {}
profile_labels = {'train':{},
                  'val':{},
                  'test':{}}

splits_by_phase = {'train': train_split, 'val': val_split, 'test': test_split}

for genre in genre_dict:
  labels[genre] = {}
  movies[genre] = {}
  movie_index[genre] = {}

  for phase, split in splits_by_phase.items():
    movies_in_split = []
    labels_of_profiles = {}

    for user in split[genre]:
      profile = user_labels[user][genre]
      # Collect the profile's movies and count how many carry the genre.
      movies_in_split.extend(profile.keys())
      n_positive = sum(movie_labels[movie][genre] for movie in profile)

      fraction = n_positive / len(profile)
      labels_of_profiles[user] = 1 if fraction >= 0.25 else 0

    profile_labels[phase][genre] = labels_of_profiles

    # A movie can occur in several profiles; keep one entry per movie and
    # remember its position, together with its single-movie label.
    unique_movies = list(set(movies_in_split))
    movies[genre][phase] = unique_movies
    movie_index[genre][phase] = {movie: position
                                 for position, movie in enumerate(unique_movies)}
    labels[genre][phase] = {movie: torch.LongTensor([movie_labels[movie][genre]])
                            for movie in unique_movies}

## Make dataloaders

In [None]:
# profile_dataset_sizes[phase][user] -> number of movies in the user's profile
# profile_dataloaders[phase][genre]  -> unshuffled loader over movies[genre][phase]
profile_dataset_sizes = {}
profile_dataloaders = {}

# A profile holds the same movies under every genre key of user_labels, so
# its size is read from one explicit genre instead of from whatever value a
# loop variable happens to hold after the genre loop has finished.
reference_genre = next(iter(genre_dict))
movies_per_profile = {user: len(per_genre[reference_genre])
                      for user, per_genre in user_labels.items()}

for phase in ['train', 'val', 'test']:
  profile_dataset_sizes[phase] = dict(movies_per_profile)
  profile_dataloaders[phase]   = {}

  for genre in genre_dict.keys():
    dataset = Dataset(movies[genre][phase], labels[genre][phase], data_transforms[phase])
    # shuffle=False keeps the batch order equal to movies[genre][phase], the
    # order in which movie_index numbers the movies.
    profile_generator = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=False, num_workers=2)

    profile_dataloaders[phase][genre] = profile_generator

# New_train_model function

In [None]:
def _count_correct_profiles(predictions, profiles, genre, true_labels, movie_index, threshold=0.25):
    """Return how many profiles get the correct genre label from per-movie predictions.

    A profile is predicted positive when at least ``threshold`` of its movies
    (the keys of the module-level ``user_labels[user][genre]``) were predicted
    positive. ``predictions`` is indexed through ``movie_index[movie]``.
    """
    n_correct = 0
    for user in profiles:
        profile_movies = user_labels[user][genre]

        # Number of movies in the profile that were classified as positive.
        n_positive = sum(predictions[movie_index[int(movie)]] for movie in profile_movies)
        # An empty profile has no positive movies; avoid dividing by zero.
        fraction = n_positive / len(profile_movies) if profile_movies else 0.0

        profile_pred = 1 if fraction >= threshold else 0
        if profile_pred == true_labels[genre][user]:
            n_correct += 1
    return n_correct


def new_train_model(model, criterion, optimizer, scheduler, genre, profiles, true_labels, movie_index, num_epochs=25):
    """Train on single images, select the best epoch by profile-level validation accuracy.

    Training uses ``dataloaders[genre]['train']``. For validation, every movie
    of ``profile_dataloaders['val'][genre]`` is classified, the per-movie
    predictions are aggregated per profile, and the accuracy is the share of
    ``profiles`` whose aggregated label equals ``true_labels[genre][user]``.

    Args:
        model, criterion, optimizer, scheduler: as for ``train_model``.
        genre: genre to train for.
        profiles: users that make up the validation split.
        true_labels: mapping ``genre -> user -> 0/1`` profile label.
        movie_index: mapping ``movie id -> position`` of that movie in the
            validation loader's (unshuffled) output.
        num_epochs: number of epochs to run.

    Returns:
        ``model`` with the weights of the epoch that had the highest
        profile-level validation accuracy.

    Reads the module-level ``dataloaders``, ``profile_dataloaders``,
    ``dataset_sizes``, ``user_labels`` and ``device``.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            running_loss = 0.0

            if phase == 'train':
                model.train()  # Set model to training mode
                running_corrects = 0

                for inputs, labels in dataloaders[genre][phase]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    loss.backward()
                    optimizer.step()

                    # loss is a batch mean, so weight it by the batch size.
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)

                scheduler.step()

                epoch_loss = running_loss / dataset_sizes[genre][phase]
                epoch_acc = running_corrects.double() / dataset_sizes[genre][phase]

            else:  # phase == 'val'
                model.eval()   # Set model to evaluate mode
                predictions = []

                # Validation needs no gradients; without this the autograd
                # graph of every batch would be built and kept alive.
                with torch.no_grad():
                    for inputs, labels in profile_dataloaders['val'][genre]:
                        inputs = inputs.to(device)
                        labels = labels.to(device)

                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)

                        predictions.extend(preds.tolist())
                        running_loss += criterion(outputs, labels).item() * inputs.size(0)

                running_corrects = _count_correct_profiles(
                    predictions, profiles, genre, true_labels, movie_index)

                # The loss was summed over the profile images, so average it
                # over those images and not over the single-image val split.
                epoch_loss = running_loss / len(predictions) if predictions else 0.0
                n_profiles = len(profiles)
                epoch_acc = running_corrects / n_profiles if n_profiles else 0.0

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

## Load a pretrained model and reset final fully connected layer

In [None]:
# Second pretrained ResNet-18, again with a fresh two-output final layer,
# used for the models that are validated on profiles.
new_model_ft = models.resnet18(pretrained=True)
num_ftrs = new_model_ft.fc.in_features
# The output size is fixed to 2 here; nn.Linear(num_ftrs, len(class_names))
# would be the general form.
new_model_ft.fc = nn.Linear(in_features=num_ftrs, out_features=2)

new_model_ft = new_model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# All parameters of the network are optimised, not only the new layer.
optimizer_ft = optim.SGD(params=new_model_ft.parameters(), lr=0.01, momentum=0.9)

# Multiply the learning rate by 0.1 every 7 scheduler steps.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer=optimizer_ft, step_size=7, gamma=0.1)

## Train the models

In [None]:
# Maps each genre name to the model returned by new_train_model for that genre.
new_trained_models = {}

In [None]:
# Train one independent, profile-validated classifier per genre and save each to disk.
for genre in genre_dict.keys():
  print(genre)

  # Every genre starts from its own copy of the pretrained network, with its
  # own optimizer and LR schedule (same hyper-parameters as the setup cell).
  # Passing new_model_ft / optimizer_ft / exp_lr_scheduler directly would keep
  # training the previous genre's weights with an already-decayed learning
  # rate, and every new_trained_models entry would be the same module object.
  genre_model = copy.deepcopy(new_model_ft)
  genre_optimizer = optim.SGD(genre_model.parameters(), lr=0.01, momentum=0.9)
  genre_scheduler = lr_scheduler.StepLR(genre_optimizer, step_size=7, gamma=0.1)

  new_trained_models[genre] = new_train_model(genre_model, criterion, genre_optimizer, genre_scheduler,
                          genre, val_split[genre], profile_labels['val'], movie_index[genre]['val'], num_epochs=10)
  torch.save(new_trained_models[genre], 'new_{}_model'.format(genre))

# Test the models

In [None]:
# Run this cell when the models were already trained and saved: it reloads
# both sets of per-genre models from Drive onto the CPU.
# NOTE: torch.load unpickles the file, so only load model files you trust.
cpu_device = torch.device('cpu')

trained_models, new_trained_models = {}, {}
for genre in genre_dict:
  trained_models[genre] = torch.load(
      'drive/MyDrive/models/{}_model'.format(genre), map_location=cpu_device)
  new_trained_models[genre] = torch.load(
      'drive/MyDrive/models/new_{}_model'.format(genre), map_location=cpu_device)

In [None]:
# For every genre, report the single-image test accuracy of the
# image-validated model first and of the profile-validated model second.
for genre in genre_dict:
  print(genre)
  for models_by_genre in (trained_models, new_trained_models):
    test_model(models_by_genre[genre], genre)