In [1]:
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torchvision import transforms
from torch.autograd import Variable

import numpy as np
import pandas as pd
import difflib
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

from torch.utils.data import Dataset, DataLoader

from IPython import display
import string
import dataloader
from dataloader import MoviesDataset
#load data
movies = pd.read_csv('tmdb_5000_movies.csv')
movies = movies.iloc[:-3]
features = ['title', 'genres', 'keywords', 'original_language', 'vote_average']

# make dataset
dataset = MoviesDataset(movies)

#clean null values
for feature in features:
    movies[feature] = movies[feature].fillna('Unknown')

scaler = StandardScaler()

weight_title = 2.0  # Increase the weight of title
weight_genre = 1.5  # Increase the weight of genre

vectorizer_title = TfidfVectorizer()
vectorizer_genre = TfidfVectorizer()
vectorizer_keywords = TfidfVectorizer()
vectorizer_language = TfidfVectorizer()

title_features = movies['title']
genre_features = movies['genres']
keywords_features = movies['keywords']
language_features = movies['original_language']

title_vec = vectorizer_title.fit_transform(title_features)
genre_vec = vectorizer_genre.fit_transform(genre_features)
keywords_vec = vectorizer_keywords.fit_transform(keywords_features)
language_vec = vectorizer_language.fit_transform(language_features)

title_vec_scaled = scaler.fit_transform(title_vec.toarray())
genre_vec_scaled = scaler.fit_transform(genre_vec.toarray())
keywords_vec_scaled = scaler.fit_transform(keywords_vec.toarray())
language_vec_scaled = scaler.fit_transform(language_vec.toarray())
vote_avg = movies['vote_average'].values.reshape(-1, 1)


feat_vectors = np.concatenate([
    weight_title * title_vec_scaled,
    weight_genre * genre_vec_scaled,
    keywords_vec_scaled,
    language_vec_scaled
], axis=1)
feat_vectors = np.concatenate([feat_vectors, vote_avg], axis=1)

svd = TruncatedSVD(n_components=32) 
reduced_features = svd.fit_transform(feat_vectors).astype(np.float32)

similarity = cosine_similarity(reduced_features)
print(similarity.shape)

(4800, 4800)


In [2]:
from network import Reccomender

def init_weights(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1 or classname.find('BatchNorm') != -1:
        m.weight.data.normal_(0.00, 0.02)
        
#set up rec
batch_size = 128
rec = Reccomender(batch_size)
rec.apply(init_weights)

#load model if possible
loadModel = False
if os.path.exists('./model.pth') and loadModel:
    rec.load_state_dict(torch.load('./model.pth'))

if torch.cuda.is_available():
    rec.cuda()
    
optimizer = Adam(rec.parameters(), lr=1e-3, weight_decay=1e-5)

In [3]:
#training functions
loss = nn.L1Loss()
movie_titles = movies['title'].tolist()

def train_network(optimizer, results, real_data, n_batch):
    optimizer.zero_grad()

    gt_similar =  similarity[n_batch]
    most_similar_idxs = np.argsort(gt_similar)[::-1]
    similar_movs = [movies.iloc[most_similar_idxs[1]]['title'], 
                    movies.iloc[most_similar_idxs[2]]['title'], 
                    movies.iloc[most_similar_idxs[3]]['title']]
    print(f"Ground-Truth Similarity | `{movies.iloc[most_similar_idxs[0]]['title']}` : `{similar_movs}`")
    #output of neural network should be an integer close to MSM
    sample = results[0].detach().cpu().numpy()
    sample = sample.reshape(1, -1)
    sim = torch.tensor(cosine_similarity(sample, reduced_features))
    closest_index = torch.argmax(sim).item()
        
    print(f"Reccomendation | {movies.iloc[closest_index]['title']} : {dataset.GetSimilarItems(sample, top_n=3)}")
    error = loss(real_data, results.unsqueeze(1))
    error.backward()
    optimizer.step()
    
    prediction = rec(real_data)
    return error, prediction

In [4]:
num_batches = len(dataset)//batch_size
num_epochs = 300

train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
#training loop
for epoch in range(num_epochs):
    for n_batch, batch in enumerate(train_loader):
        real_data = Variable(batch)
        if torch.cuda.is_available(): 
            real_data = real_data.cuda()
        real_data = real_data.unsqueeze(1)
        print(real_data.shape)
        result = rec(real_data)
        print("NETWORK FORWARDED")
        # print(f"GT{real_data[0]} \n RESULT{result[0]}")
        # print(f"result shape - {result.shape}")
        err, pred = train_network(optimizer, result, real_data, n_batch)
        result = result.detach()
        #show progress
        print(f"iter: {n_batch}/{num_batches} of epoch {epoch}/{num_epochs}")
        print(f"err: {err:.6f}\n")
    display.clear_output(True)
torch.save(rec.state_dict(), './reccomender_model.pth')

torch.Size([128, 1, 32])
NETWORK FORWARDED
Ground-Truth Similarity | `avatar` : `['tomorrowland', 'the time machine', 'insurgent']`
Reccomendation | the invasion : ['planet of the apes', 'battle for the planet of the apes', 'fantastic four']
iter: 0/37 of epoch 299/300
err: 1.454302

torch.Size([128, 1, 32])
NETWORK FORWARDED
Ground-Truth Similarity | `pirates of the caribbean at worlds end` : `['pirates of the caribbean on stranger tides', 'pirates of the caribbean dead mans chest', 'pirates of the caribbean the curse of the black pearl']`
Reccomendation | rango : ['angels  demons', 'cradle 2 the grave', 'the devils tomb']
iter: 1/37 of epoch 299/300
err: 1.727018

torch.Size([128, 1, 32])
NETWORK FORWARDED
Ground-Truth Similarity | `spectre` : `['the art of war', 'kickass 2', 'the prince']`
Reccomendation | the invasion : ['dragonball evolution', 'planet of the apes', 'mutant world']
iter: 2/37 of epoch 299/300
err: 1.378442

torch.Size([128, 1, 32])
NETWORK FORWARDED
Ground-Truth Si