# Pipeline
After importing the necessary functions, we load the data from 'movie_labels.csv' to get all the movies and their labels for every genre. We also count the number of movies per genre for the visualisation at the end.

Then, we create the use_model function and do the preparatory work for creating the dataloaders. We also load the trained classifiers (both single-image validation and profile validation).

Now, we load the data from 'user_profiles.csv' to get the movies (and the labels for these movies) for every profile. Afterwards, we create a dataloader with all the movies. We do this because, for every genre, we want to make a single prediction per movie and then reuse that prediction for every profile containing that movie. This ensures that the only difference lies in the profiles and not in the classification of the same movie across different profiles.

With this preparatory work, we can finally start getting the predictions for all the movies. These predictions are then written to the 'movie_dictionary.csv' file, which means we don't have to redo all this work multiple times. Assuming this worked, we can now load the results from 'movie_dictionary.csv' into the variable 'old_movie_dictionary' and the results from 'new_movie_dictionary.csv' into the variable 'new_movie_dictionary'.

With these predictions we can start testing to see the results. The testing is done using profiles instead of single images, so we first have to take a test set of the profiles. This is done by taking the true labels of the profiles and splitting them up into training, validation, and test sets, of which we only use the test set. Then, we get the true labels of the profiles in the test set, as well as the predicted labels of the profiles in the test set. These can be used to compute the accuracies, which can then be visualised together with the (relative) number of movies for every genre.




# Imports and connecting to Google drive

In [None]:
%matplotlib inline
# ZL: render plots inline within the notebook.
# On Colab: open the Runtime menu, change the runtime type and select GPU.

In [None]:
# License: BSD
# Author: Sasank Chilamkurthy
# Zhuoran and I changed some things

from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
from sklearn import metrics
import time
import os
import copy

import csv
import random
import math

from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

plt.ion()   # interactive mode

In [None]:
# Mount Google Drive at /content/drive; the other cells read their CSV files
# and models through the relative path 'drive/MyDrive/...'.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Shell command: extract the image archive stored on Drive into the working directory.
!unzip 'drive/MyDrive/movies_data_split.zip'

In [None]:
def updateDict(dictionary, genre, element):
    """Return the list stored under ``genre`` with ``element`` appended.

    If ``genre`` is already a key, its list is extended in place and returned.
    If it is not, a fresh one-element list is returned and ``dictionary`` is
    left untouched, so the caller has to store the result back itself.
    """
    if genre in dictionary:
        bucket = dictionary[genre]
    else:
        bucket = []

    bucket.append(element)
    return bucket

# Get all the movies and make lists per genre

In [None]:
# Genre name -> position of that genre inside each movie's 15-element label vector.
genre_dict = {
    'Action': 0, 'Adventure': 1, "Animation_Children's": 2, 'Comedy': 3, 'Crime': 4,
    'Documentary': 5, 'Drama': 6, 'Fantasy_Sci-Fi': 7, 'Film-Noir': 8, 'Horror_Thriller': 9,
    'Musical': 10, 'Mystery': 11, 'Romance': 12, 'War': 13, 'Western': 14,
}
row_names = ['movie_id', 'genre_array']

# Per-genre bookkeeping:
#   count[genre]      - number of movies that carry the genre
#   all_movies[genre] - every movie id, in file order (same list for every genre)
#   movies[genre]     - ids of movies that carry the genre
#   non_movies[genre] - ids of movies that do not carry the genre
count = {}
all_movies = {}
movies = {}
non_movies = {}

# movie_labels[movie_id][genre] is 1 or 0; used later when loading the users' movies.
movie_labels = {}

# Running total of movies per genre, one slot per entry of genre_dict.
genre_count = np.array([0] * 15)

with open('drive/MyDrive/movie_labels.csv', 'r', encoding="ISO-8859-1") as f:
    for row in csv.DictReader(f, fieldnames=row_names, delimiter=','):
        movie_id = int(row['movie_id'])

        # The label vector is stored as text such as "[0, 1, ...]":
        # drop brackets and spaces, then split on commas.
        cleaned = row['genre_array'].translate(str.maketrans('', '', '[ ]'))
        genre_array = [int(token) for token in cleaned.split(',')]

        movie_labels[movie_id] = {}
        genre_count += genre_array

        for genre, column in genre_dict.items():
            has_genre = genre_array[column] == 1
            all_movies[genre] = updateDict(all_movies, genre, movie_id)

            if has_genre:
                count[genre] = count.get(genre, 0) + 1
                movies[genre] = updateDict(movies, genre, movie_id)
                movie_labels[movie_id][genre] = 1
            else:
                non_movies[genre] = updateDict(non_movies, genre, movie_id)
                movie_labels[movie_id][genre] = 0

print(genre_count)
print("Done")

# Creating my own way to use and load the data

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset serving movie images from ``movies_data_split/``.

    Sample ``i`` is the image ``movies_data_split/<list_IDs[i]>.jpg`` converted
    to RGB and passed through the transform, paired with element 0 of that
    ID's entry in ``labels``.
    """

    def __init__(self, list_IDs, labels, YOUR_TRANSFORM):
        """Keep references to the sample IDs, the label lookup and the transform.

        Args:
            list_IDs: Indexable collection of IDs; position ``i`` is sample ``i``.
            labels: Mapping from ID to an indexable label; only element 0 is read.
            YOUR_TRANSFORM: Callable applied to the RGB PIL image.
        """
        self.list_IDs = list_IDs
        self.labels = labels
        self.YOUR_TRANSFORM = YOUR_TRANSFORM

    def __len__(self):
        """Return the number of samples, i.e. the number of IDs."""
        return len(self.list_IDs)

    def __getitem__(self, index):
        """Load, convert and transform the image at ``index``; return ``(X, y)``."""
        sample_id = self.list_IDs[index]

        image_path = 'movies_data_split/' + str(sample_id) + '.jpg'
        rgb_image = Image.open(image_path).convert('RGB')
        features = self.YOUR_TRANSFORM(rgb_image)

        target = self.labels[sample_id][0]
        return features, target

In [None]:
# Image pre-processing: resize, take the central 224x224 crop, convert to a
# tensor and normalise each channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics - confirm.
data_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
def use_model(model, genre):
    """Predict a class index for every sample in ``movie_dataloaders[genre]``.

    Args:
        model: Trained classifier. It is called on every input batch and must
            return one score per class along dimension 1.
        genre: Key selecting which loader of ``movie_dataloaders`` to run.

    Returns:
        Flat list of predicted class indices (ints), in the order in which the
        loader yields the samples. An empty loader yields an empty list.
    """
    model.eval()  # Set model to evaluate mode

    # Send the inputs to wherever the model's weights live, so a model loaded
    # onto the CPU still works when the global `device` is a GPU (and vice
    # versa). Only parameter-less models fall back to the global `device`.
    first_param = next(model.parameters(), None)
    run_device = device if first_param is None else first_param.device

    predictions = []
    # No gradients are needed for inference; skipping them saves memory.
    with torch.no_grad():
        for inputs, _labels in movie_dataloaders[genre]:
            outputs = model(inputs.to(run_device))
            _, preds = torch.max(outputs, 1)
            predictions.append(preds)

    # torch.cat raises on an empty list, so handle an empty loader explicitly.
    if not predictions:
        return []

    return torch.cat(predictions).tolist()

# Load the trained models





In [None]:
# Load two saved classifiers per genre from Drive:
#   trained_models[genre]     <- 'drive/MyDrive/models/<genre>_model'
#   new_trained_models[genre] <- 'drive/MyDrive/models/new_<genre>_model'
# map_location=device places every model on the device selected earlier
# (GPU when available, otherwise CPU), so the same line works on both kinds of
# runtime without commenting code in and out.
# NOTE: torch.load unpickles the file contents; only load model files you trust.
trained_models = {}
new_trained_models = {}
for genre in genre_dict:
    trained_models[genre] = torch.load(
        'drive/MyDrive/models/{}_model'.format(genre), map_location=device)
    new_trained_models[genre] = torch.load(
        'drive/MyDrive/models/new_{}_model'.format(genre), map_location=device)

# Set-based classification:


## Get the true labels of all movies in 'user_labels' per profile

In [None]:
row_names = ['user_id', 'movies']
user_genre_count = {}
user_profiles = {}
user_labels = {}      # user_labels[user][genre][movie] -> LongTensor([0 or 1])

count_amount_movies = {}  # user -> number of profile movies that have labels
missers = []

movie_dict = {}       # user -> every movie id listed in the profile

with open('drive/MyDrive/user_profiles.csv', 'r', encoding="ISO-8859-1") as f:
    reader = csv.DictReader(f, fieldnames=row_names, delimiter=',')
    for row in reader:
        user = row['user_id']

        # The movie list is stored as text such as "[12, 57, ...]":
        # drop brackets and spaces, then split on commas.
        cleaned = row['movies'].translate(str.maketrans('', '', '[ ]'))
        # A dedicated name keeps the per-genre `movies` dictionary built while
        # loading the movie labels from being overwritten by this list.
        profile_movies = [int(s) for s in cleaned.split(',')]
        movie_dict[user] = profile_movies

        user_labels[user] = {genre_name: {} for genre_name in genre_dict}

        # For every movie of this user, add the true label per genre.
        for movie in profile_movies:
            if movie not in movie_labels:
                # Movies without known labels are left out of the profile.
                continue

            # Count each labelled movie once (not once per genre).
            count_amount_movies[user] = count_amount_movies.get(user, 0) + 1

            for genre in genre_dict:
                label = torch.LongTensor([movie_labels[movie][genre]])
                user_labels[user][genre][movie] = label

## Get all movies into DataLoaders

In [None]:
row_names = ['movie_id', 'labels']

movie_dataset_sizes = {}
movie_dataloaders = {}

# One loader per genre over ALL movies, labelled with that genre's 0/1 label.
for genre in genre_dict:
    labels = {
        movie: torch.LongTensor([per_genre[genre]])
        for movie, per_genre in movie_labels.items()
    }
    movie_dataset = Dataset(all_movies[genre], labels, data_transforms)

    # shuffle must stay False: the predictions are later matched back to movie
    # ids purely by position (results[genre][i] <-> all_movies[genre][i]), so
    # the loader has to yield the movies in the order of all_movies[genre].
    movie_generator = torch.utils.data.DataLoader(
        movie_dataset, batch_size=4, shuffle=False, num_workers=2)

    movie_dataset_sizes[genre] = len(all_movies[genre])
    movie_dataloaders[genre] = movie_generator

## Get the predictions for every movie

### The following 3 blocks are used for the classification and writing it to a csv file. They are only necessary in the beginning (or to reset the predictions)

In [None]:
# Predict every movie once per genre.
# Run this cell twice: once as written (trained_models) and once with the
# commented line enabled instead (new_trained_models).
results = {}
for genre in genre_dict:
    genre_predictions = use_model(trained_models[genre], genre)
    # genre_predictions = use_model(new_trained_models[genre], genre)
    results[genre] = genre_predictions

    print(genre)
    print(genre_predictions)

In [None]:
# movie_dictionary[movie_id] -> predicted labels, one per genre in genre_dict order.
movie_dictionary = {}

genre_names = list(genre_dict)

# Any genre works as the reference: all_movies holds the same movie ids, in the
# same order, for every genre. Naming it explicitly avoids depending on a loop
# variable left over from an inner loop.
reference_ids = all_movies[genre_names[0]]

for position, movie_id in enumerate(reference_ids):
    movie_dictionary[movie_id] = [results[name][position] for name in genre_names]

print(movie_dictionary)
print(len(movie_dictionary))

In [None]:
# Append the predictions of trained_models to 'movie_dictionary.csv', one row
# per movie: movie id, then the list of per-genre predictions.
# The file is opened once instead of once per row.
# For the predictions of new_trained_models, write to
# 'new_movie_dictionary.csv' instead.
# NOTE(review): this writes to the working directory, while the loading cells
# read from 'drive/MyDrive/...' - confirm the file is moved there in between.
with open('movie_dictionary.csv', 'a', newline='') as out_csv:
    writer = csv.writer(out_csv, delimiter=',')
    for movie, predicted_labels in movie_dictionary.items():
        writer.writerow([movie, predicted_labels])

### The following blocks are to load the predictions from the 'movie_dictionary.csv' file. This can be used if you already have predictions in the file

In [None]:
# Load the stored predictions of the single-image-validation models:
# old_movie_dictionary[movie_id] -> list of per-genre predicted labels.
rownames = ['movie_id', 'labels']
old_movie_dictionary = {}

with open('drive/MyDrive/movie_dictionary.csv', 'r', encoding="ISO-8859-1") as f:
    # Pass the column names defined in this cell, so the cell does not depend
    # on whichever other cell last assigned a similarly named variable.
    reader = csv.DictReader(f, fieldnames=rownames, delimiter=',')
    strip_brackets = str.maketrans('', '', '[ ]')
    for row in reader:
        movie_id = int(row['movie_id'])

        # Labels are stored as text such as "[0, 1, ...]".
        label_string_array = row['labels'].translate(strip_brackets).split(',')
        labels = [int(s) for s in label_string_array]

        old_movie_dictionary[movie_id] = labels

print(old_movie_dictionary)
print("Done")

In [None]:
# Load the stored predictions of the profile-validation models:
# new_movie_dictionary[movie_id] -> list of per-genre predicted labels.
rownames = ['movie_id', 'labels']
new_movie_dictionary = {}

with open('drive/MyDrive/new_movie_dictionary.csv', 'r', encoding="ISO-8859-1") as f:
    # Pass the column names defined in this cell, so the cell does not depend
    # on whichever other cell last assigned a similarly named variable.
    reader = csv.DictReader(f, fieldnames=rownames, delimiter=',')
    strip_brackets = str.maketrans('', '', '[ ]')
    for row in reader:
        movie_id = int(row['movie_id'])

        # Labels are stored as text such as "[0, 1, ...]".
        label_string_array = row['labels'].translate(strip_brackets).split(',')
        labels = [int(s) for s in label_string_array]

        new_movie_dictionary[movie_id] = labels

print(new_movie_dictionary)
print("Done")

# Testing and accuracy

## Make the test split

In [None]:
# true_labels[user][genre] is 1 when at least 25% of the user's labelled
# movies carry that genre, otherwise 0.
true_labels = {}

for user, labels_by_genre in user_labels.items():
    profile_truth = {}
    true_labels[user] = profile_truth

    for genre in genre_dict:
        movie_flags = labels_by_genre[genre]
        positives = sum(movie_flags.values())
        share = positives / len(movie_flags)
        profile_truth[genre] = 1 if share >= 0.25 else 0

In [None]:
# Per genre, separate the profiles into positives (users) and negatives
# (non_users), and store in count[genre] the size of the smaller group. That
# size is what the train/val/test split sizes are derived from, so every split
# can hold an equal number of 1's and 0's.
count = {}
users = {}
non_users = {}

for genre in genre_dict:
    users[genre] = []
    non_users[genre] = []

    for profile, per_genre in true_labels.items():
        bucket = users if per_genre[genre] == 1 else non_users
        bucket[genre].append(profile)

    n_positive = len(users[genre])
    count[genre] = min(n_positive, len(true_labels) - n_positive)
    print(genre, count[genre])

In [None]:
# Per-genre split sizes, derived from count[genre] (size of the smaller class).
train_N = {}
val_N   = {}
test_N  = {}

# Only the test split is materialised; the training and validation splits
# would be the [0:train_N] and [train_N:train_N+val_N] slices of the same lists.
test_split  = {}

for genre in genre_dict:
    # 10% each for validation and test, but at least one profile
    # (there are not a lot of profiles with documentary as 1).
    held_out = math.floor(count[genre] * 0.1)
    if held_out == 0:
        held_out = 1

    val_N[genre] = held_out
    test_N[genre] = held_out
    train_N[genre] = count[genre] - 2 * held_out

    test_start = train_N[genre] + val_N[genre]
    test_end = test_start + test_N[genre]

    # Bound BOTH slices, so the test split holds the same number of positive
    # and negative profiles even when the positives are the larger class.
    test_split[genre] = (users[genre][test_start:test_end]
                         + non_users[genre][test_start:test_end])

print(train_N)
print(val_N)
print(test_N)

## Get the true labels for the users in the test split.

In [None]:
# profile_true_labels[genre][user] for the test-split profiles: 1 when at
# least 25% of the user's labelled movies carry the genre, otherwise 0.
profile_true_labels = {}

for genre in genre_dict:
    truth_for_genre = {}
    profile_true_labels[genre] = truth_for_genre

    for user in test_split[genre]:
        movie_flags = user_labels[user][genre]
        positives = sum(movie_flags.values())
        share = positives / len(movie_flags)
        truth_for_genre[user] = 1 if share >= 0.25 else 0

In [None]:
# Inspect the true profile labels of the test split (genre -> user -> 0/1).
print(profile_true_labels)

## Get the predicted labels for the users

In [None]:
# Predicted profile labels for the test split, per model set: a profile is
# predicted positive for a genre when at least 25% of its movies were
# predicted to carry that genre.
old_profile_predicted_labels = {}
new_profile_predicted_labels = {}

for genre, column in genre_dict.items():
    old_for_genre = old_profile_predicted_labels[genre] = {}
    new_for_genre = new_profile_predicted_labels[genre] = {}

    for user in test_split[genre]:
        profile_movies = list(user_labels[user][genre])

        # Number of the profile's movies predicted positive by each model set.
        old_hits = 0
        new_hits = 0
        for movie in profile_movies:
            old_hits += old_movie_dictionary[movie][column]
            new_hits += new_movie_dictionary[movie][column]

        n_movies = len(profile_movies)
        old_for_genre[user] = int(old_hits / n_movies >= 0.25)
        new_for_genre[user] = int(new_hits / n_movies >= 0.25)

In [None]:
# Inspect the predicted profile labels of both model sets (genre -> user -> 0/1).
print(old_profile_predicted_labels)
print(new_profile_predicted_labels)

# Get the accuracy for the genres (profiles)

In [None]:
# Accuracy per genre on the test-split profiles, in genre_dict order:
# the fraction of profiles whose predicted label equals the true label.
old_profiles_acc = []
new_profiles_acc = []

for genre in genre_dict:
    truth = profile_true_labels[genre]
    old_preds = old_profile_predicted_labels[genre]
    new_preds = new_profile_predicted_labels[genre]
    test_users = test_split[genre]

    old_correct = sum(old_preds[u] == truth[u] for u in test_users)
    new_correct = sum(new_preds[u] == truth[u] for u in test_users)

    old_profiles_acc.append(old_correct / len(old_preds))
    new_profiles_acc.append(new_correct / len(new_preds))

print('Done')

# Visualise the resulting accuracies together with the amount of movies per genre

In [None]:
# Per-genre accuracies (in genre_dict order) for both model sets.
print(old_profiles_acc)
print(new_profiles_acc)

In [None]:
# Grouped bar chart per genre: share of movies carrying the genre next to the
# profile accuracy of both model sets.
# Short display names, in the same order as genre_dict.
genres = ['Action', 'Adventure', 'Anim/Child', 'Comedy', 'Crime',
          'Documentary', 'Drama', 'Fant/SF', 'Film-Noir', 'Hor/Thri',
          'Musical', 'Mystery', 'Romance', 'War', 'Western']

x = np.arange(len(genres))
width = 0.3

fig, ax = plt.subplots(figsize=(16, 8))

# Express the genre counts as a fraction of all movies, so they share the
# 0-1 scale of the accuracies.
total_movies = len(movie_labels)
genre_count_normalized = [g / total_movies for g in genre_count]

# (horizontal offset, bar heights, legend entry) for the three bars of each group.
bar_series = [
    (-width, genre_count_normalized, 'Percentage of movies per genre'),
    (0, old_profiles_acc, 'Accuracy of set-based classifier with single-image validation'),
    (width, new_profiles_acc, 'Accuracy of set-based classifier with profile validation'),
]
for offset, heights, legend_label in bar_series:
    ax.bar(x + offset, heights, width, label=legend_label)

ax.set_title('Accuracy on test set of profiles and percentage of movie count per genre')
ax.set_ylabel('Accuracy, percentages')
ax.set_xlabel('Genres')
ax.set_xticks(x)
ax.set_xticklabels(genres)
ax.legend()

plt.show()