In [1]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

from collections import deque

from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.sparse import coo_matrix
from scipy.sparse import vstack

import torch

In [2]:
# Read the movie catalogue; the file carries no header row, so the column names are supplied here
movie_titles = pd.read_csv(
    './netflix-prize-data/movie_titles.csv',
    names=['Id', 'Year', 'Name'],
    header=None,
    encoding='ISO-8859-1',
)
# Use the movie id as the row label
movie_titles = movie_titles.set_index('Id')

print('Shape Movie-Titles:\t{}'.format(movie_titles.shape))
movie_titles.sample(5)

Shape Movie-Titles:	(17770, 2)


Unnamed: 0_level_0,Year,Name
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
10429,2003.0,Final Destination 2
4149,2004.0,Scooby-Doo 2: Monsters Unleashed
11423,1998.0,Little Dieter Needs to Fly
4201,1996.0,Walking and Talking
14705,1996.0,Tai Chi 2


In [3]:
# Load the movie metadata and keep only title (as index), overview and vote count;
# rows with a missing value in any of these columns are discarded
movie_metadata = pd.read_csv('./the-movies-dataset/movies_metadata.csv', low_memory=False)
movie_metadata = movie_metadata[['original_title', 'overview', 'vote_count']]
movie_metadata = movie_metadata.set_index('original_title').dropna()

# Remove the long tail of rarely rated movies, then discard the helper column
has_enough_votes = movie_metadata['vote_count'] > 10
movie_metadata = movie_metadata[has_enough_votes].drop('vote_count', axis=1)
del has_enough_votes

print('Shape Movie-Metadata:\t{}'.format(movie_metadata.shape))
movie_metadata.sample(5)

Shape Movie-Metadata:	(21604, 1)


Unnamed: 0_level_0,overview
original_title,Unnamed: 1_level_1
G.O.R.A.,A slick young Turk kidnapped by extraterrestri...
"3 ½ Minutes, 10 Bullets","Black Friday, the day after Thanksgiving Novem..."
"Nachts, wenn Dracula erwacht",Jess Franco's version of the Bram Stoker class...
The Wild Geese,A British multinational company seeks to overt...
Start the Revolution Without Me,An account of the adventures of two sets of id...


In [4]:
# Load single data-file
df_raw = pd.read_csv('./netflix-prize-data/combined_data_1.txt',
                     header=None,
                     names=['User', 'Rating', 'Date'],
                     usecols=[0, 1, 2])


# Rows without a rating are the per-movie header rows: their first field is the
# movie id followed by one trailing character, which is stripped before int()
header_rows = df_raw.loc[df_raw['Rating'].isna(), 'User']
block_starts = header_rows.index.tolist()
block_movie_ids = [int(label[:-1]) for label in header_rows]

# Every block ends right before the next header; None marks the last block of the file
block_ends = block_starts[1:] + [None]


# Gather one dataframe per movie
user_data = []
for start, end, movie_id in zip(block_starts, block_ends, block_movie_ids):

    if end is None:
        # Last movie in the file: take everything after its header
        block = df_raw.loc[start+1:].copy()
    else:
        # .loc slicing is label based and inclusive on both ends
        block = df_raw.loc[start+1:end-1].copy()

    # Tag every rating of the block with its movie id
    block['Movie'] = movie_id
    user_data.append(block)

# Combine all dataframes
df = pd.concat(user_data)
del user_data, df_raw, header_rows, block_starts, block_ends, block_movie_ids, start, end, movie_id, block
print('Shape User-Ratings:\t{}'.format(df.shape))
df.sample(5)

Shape User-Ratings:	(24053764, 4)


Unnamed: 0,User,Rating,Date,Movie
11866712,499477,5.0,2005-07-01,2290
7119825,708701,4.0,2004-09-06,1428
17263046,1769985,5.0,2005-06-06,3320
4702585,101930,4.0,2005-05-18,937
5141950,1122900,2.0,2005-11-07,1046


In [5]:
# Number of movies per release year, ordered by year
data = movie_titles['Year'].value_counts().sort_index()

# Line trace of the yearly counts
trace = go.Scatter(x=data.index, y=data.values, marker={'color': '#db0000'})

# Title and axis labels
layout = {
    'title': '{} Movies Grouped By Year Of Release'.format(movie_titles.shape[0]),
    'xaxis': {'title': 'Release Year'},
    'yaxis': {'title': 'Movies'},
}

# Render inline
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [6]:
# Number of ratings per calendar day, ordered chronologically
data = df['Date'].value_counts()
data.index = pd.to_datetime(data.index)
data = data.sort_index()

# Line trace of the daily counts
trace = go.Scatter(x=data.index, y=data.values, marker={'color': '#db0000'})

# Title and axis labels
layout = {
    'title': '{} Movie-Ratings Grouped By Day'.format(df.shape[0]),
    'xaxis': {'title': 'Date'},
    'yaxis': {'title': 'Ratings'},
}

# Render inline
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [7]:
##### Ratings Per Movie / Ratings Per User #####
# Both histograms share the same recipe and differ only in the values listed here:
# (group column, clip value, last bin edge, bin width, plot title, x-axis label)
histogram_specs = [
    ('Movie', 9999, 10000, 100,
     'Distribution Of Ratings Per Movie (Clipped at 9999)', 'Ratings Per Movie'),
    ('User', 199, 200, 2,
     'Distribution Of Ratings Per User (Clipped at 199)', 'Ratings Per User'),
]

for group_col, clip_at, bin_end, bin_size, plot_title, x_label in histogram_specs:
    # Count ratings per group; clipping collects the long tail in the last bin
    data = df.groupby(group_col)['Rating'].count().clip(upper=clip_at)

    # Histogram trace with fixed bin edges
    trace = go.Histogram(x=data.values,
                         name='Ratings',
                         xbins={'start': 0, 'end': bin_end, 'size': bin_size},
                         marker={'color': '#db0000'})

    # Title and axis labels
    layout = go.Layout(title=plot_title,
                       xaxis={'title': x_label},
                       yaxis={'title': 'Count'},
                       bargap=0.2)

    # Render inline
    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)

del histogram_specs, group_col, clip_at, bin_end, bin_size, plot_title, x_label

In [8]:
# Movies that received more than `min_movie_ratings` ratings
min_movie_ratings = 10000
movie_counts = df['Movie'].value_counts()
filter_movies = movie_counts[movie_counts > min_movie_ratings].index.tolist()

# Users that gave more than `min_user_ratings` ratings
min_user_ratings = 200
user_counts = df['User'].value_counts()
filter_users = user_counts[user_counts > min_user_ratings].index.tolist()

# Keep only ratings where both the movie and the user pass their threshold
keep_movie = df['Movie'].isin(filter_movies)
keep_user = df['User'].isin(filter_users)
df_filterd = df[keep_movie & keep_user]
del filter_movies, filter_users, min_movie_ratings, min_user_ratings
del movie_counts, user_counts, keep_movie, keep_user
print('Shape User-Ratings unfiltered:\t{}'.format(df.shape))
print('Shape User-Ratings filtered:\t{}'.format(df_filterd.shape))

Shape User-Ratings unfiltered:	(24053764, 4)
Shape User-Ratings filtered:	(4178032, 4)


In [9]:
# Shuffle DataFrame.
# - random_state makes the shuffle (and therefore the train/test split) reproducible
#   after a kernel restart.
# - errors='ignore' keeps the cell re-runnable: on a second run 'Date' is already gone
#   and an unconditional drop would raise a KeyError.
SPLIT_SEED = 42
df_filterd = (df_filterd
              .drop('Date', axis=1, errors='ignore')
              .sample(frac=1, random_state=SPLIT_SEED)
              .reset_index(drop=True))

# Number of ratings held out for testing
n = 100000

# Split train- & testset: the last n shuffled rows are the test set, the rest is training data
df_train = df_filterd[:-n]
df_test = df_filterd[-n:]

In [10]:
# Build the user x movie rating matrix: one row per user, one column per movie.
# Combinations without a rating in the training data stay empty (NaN).
df_p = df_train.pivot_table(
    values='Rating',
    index='User',
    columns='Movie',
)
print('Shape User-Movie-Matrix:\t{}'.format(df_p.shape))
df_p.sample(3)

Shape User-Movie-Matrix:	(20828, 491)


Movie,8,18,28,30,58,77,83,97,108,111,...,4392,4393,4402,4418,4420,4432,4472,4479,4488,4490
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1076126,,,2.0,4.0,,,3.0,,,,...,,,1.0,,,4.0,2.0,,2.0,3.0
921528,,,4.0,,,,,,,,...,,3.0,,,,,,,,
1362105,,,,,3.0,,,,,4.0,...,,3.0,,,,,5.0,,,


In [11]:
class DotDict(dict):
    """
    a dictionary that supports dot notation 
    as well as dictionary access notation 
    usage: d = DotDict() or d = DotDict({'val1':'first'})
    set attributes: d.val2 = 'second' or d['val2'] = 'second'
    get attributes: d.val2 or d['val2']

    Nested mappings (anything exposing ``keys``) are converted to DotDict
    recursively, so ``d.outer.inner`` works as well.

    Missing keys raise ``KeyError`` with item access (``d['x']``) and
    ``AttributeError`` with attribute access (``d.x``). The latter is what
    ``hasattr`` and ``getattr(obj, name, default)`` rely on.
    """

    def __init__(self, dct=None):
        """Copy the items of ``dct`` (optional mapping) into the new instance."""
        super().__init__()
        if dct is None:
            # Supports the documented no-argument form `DotDict()`
            return
        for key, value in dct.items():
            if hasattr(value, 'keys'):
                value = DotDict(value)
            self[key] = value

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails, so dict methods keep working.
        # A missing key is translated to AttributeError, as the attribute protocol expects.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    __setattr__ = dict.__setitem__

    def __delattr__(self, name):
        # Mirror __getattr__: report a missing key as a missing attribute
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None


# Create model

In [12]:
# Create user- & movie-id mapping (raw id -> contiguous 0-based id usable as embedding index)
user_id_mapping = {id:i for i, id in enumerate(df['User'].unique())}
movie_id_mapping = {id:i for i, id in enumerate(df['Movie'].unique())}

# Use mapping to get better ids
# NOTE: this rewrites `df` in place, so the cell is meant to run once per fresh kernel
df['User'] = df['User'].map(user_id_mapping)
df['Movie'] = df['Movie'].map(movie_id_mapping)


##### Combine both datasets to get movies with metadata
# Preprocess metadata: lower-cased titles are the join key
tmp_metadata = movie_metadata.copy()
tmp_metadata.index = tmp_metadata.index.str.lower()

# Preprocess titles: index by lower-cased name, keep the raw movie id as a column
tmp_titles = movie_titles.drop('Year', axis=1).copy()
tmp_titles = tmp_titles.reset_index().set_index('Name')
tmp_titles.index = tmp_titles.index.str.lower()

# Combine titles and metadata (index: raw movie id from movie_titles)
df_id_descriptions = tmp_titles.join(tmp_metadata).dropna().set_index('Id')
df_id_descriptions['overview'] = df_id_descriptions['overview'].str.lower()
del tmp_metadata,tmp_titles

# The ratings now carry the contiguous movie ids, so the descriptions have to be
# re-keyed through the same mapping. Joining on the raw ids would attach each
# overview to a different movie.
df_id_descriptions = df_id_descriptions[df_id_descriptions.index.isin(list(movie_id_mapping))].copy()
df_id_descriptions.index = df_id_descriptions.index.map(movie_id_mapping)

# A title can match several metadata rows; keep one description per movie so the
# join below cannot duplicate rating rows
df_id_descriptions = df_id_descriptions[~df_id_descriptions.index.duplicated(keep='first')]


# Filter all ratings with metadata
df_hybrid = (df.drop('Date', axis=1)
               .set_index('Movie')
               .join(df_id_descriptions)
               .dropna()
               .drop('overview', axis=1)
               .reset_index()
               .rename({'index':'Movie'}, axis=1))


# Split train- & testset
n = 100000
df_hybrid = df_hybrid.sample(frac=1).reset_index(drop=True)
# At most 1.5M training rows, but never reaching into the last n rows used for testing
n_train = min(1500000, max(len(df_hybrid) - n, 0))
df_hybrid_train = df_hybrid[:n_train]
df_hybrid_test = df_hybrid[-n:]
del n_train

In [13]:
import pytorch_lightning as pl
import transformers
from transformers.modeling_distilbert import *
from transformers.tokenization_distilbert import *

In [14]:
# Load the pretrained uncased DistilBERT encoder together with its matching tokenizer
model = DistilBertModel.from_pretrained(
    'distilbert-base-uncased'
)
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased'
)

In [15]:
# Move the DistilBERT model's parameters to the default CUDA device
model = model.cuda()

In [16]:
from keras_preprocessing.sequence import pad_sequences

def get_sentence_vector(input_texts):
    """Encode a batch of texts into one fixed-size vector per text.

    Each text is tokenized with special tokens, the batch is right-padded to
    the longest sequence and run through the global `model` on the GPU without
    gradient tracking. The hidden state at position 0 (the first token emitted
    by the tokenizer) is returned as the sentence vector.

    Parameters
    ----------
    input_texts : list of str
        Texts to encode.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(input_texts), hidden_size).
    """
    token_ids = [
        tokenizer.encode(input_text, add_special_tokens=True)
        for input_text in input_texts
    ]

    # pad_sequences pads at the FRONT by default, which would push the first token
    # away from position 0 for every text shorter than the longest one in the batch.
    # Pad at the end instead so position 0 is always the first real token.
    padded = pad_sequences(token_ids, padding='post', value=tokenizer.pad_token_id)
    input_ids = torch.LongTensor(padded).cuda()

    # 1 for real tokens, 0 for padding, so the padding does not take part in attention
    lengths = torch.LongTensor([len(ids) for ids in token_ids])
    positions = torch.arange(padded.shape[1])
    attention_mask = (positions.unsqueeze(0) < lengths.unsqueeze(1)).long().cuda()

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    return last_hidden_states[:,0,:].cpu().numpy()

test_vector = get_sentence_vector(['Hello world!', 'it is'])
print(test_vector.shape)

(2, 768)


In [17]:
# Movie id (index label of df_id_descriptions) -> row position in df_id_descriptions;
# if a label occurs more than once, the last position wins
mapping = dict(zip(df_id_descriptions.index, range(len(df_id_descriptions.index))))

In [18]:
from tqdm.auto import tqdm, trange

batch_size=1000

# Plain overview strings in row order, so a position from `mapping` indexes straight
# into this array. (Calling str() on a pandas row yields its truncated display repr
# including "Name: ..., dtype: object" instead of the description text.)
overview_texts = df_id_descriptions['overview'].astype(str).values

# Despite the variable name these are the sentence vectors from get_sentence_vector,
# one per training rating, in the row order of df_hybrid_train
train_tfidf = []
values = df_hybrid_train['Movie'].values
# Iterate over all movie-ids in batches and save the description vector of each
for start_idx in tqdm(range(0, len(values), batch_size)):
    indices = [mapping[movie_id] for movie_id in values[start_idx:start_idx+batch_size]]
    sentences = [overview_texts[position] for position in indices]
    train_tfidf.extend(get_sentence_vector(sentences))

HBox(children=(IntProgress(value=0, max=1500), HTML(value='')))




In [19]:
# Plain overview strings in row order, so a position from `mapping` indexes straight
# into this array. (Calling str() on a pandas row yields its truncated display repr
# including "Name: ..., dtype: object" instead of the description text.)
overview_texts = df_id_descriptions['overview'].astype(str).values

# Despite the variable name these are the sentence vectors from get_sentence_vector,
# one per test rating, in the row order of df_hybrid_test
test_tfidf = []
values = df_hybrid_test['Movie'].values
# Iterate over all movie-ids in batches and save the description vector of each
for start_idx in tqdm(range(0, len(values), batch_size)):
    indices = [mapping[movie_id] for movie_id in values[start_idx:start_idx+batch_size]]
    sentences = [overview_texts[position] for position in indices]
    test_tfidf.extend(get_sentence_vector(sentences))

HBox(children=(IntProgress(value=0), HTML(value='')))




In [20]:
# The sentence vectors are computed, so drop the notebook's reference to the
# DistilBERT model and ask PyTorch to release its cached GPU memory
del model
torch.cuda.empty_cache()

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import json

import torch.utils.data as data_utils

import pytorch_lightning as pl


# Model hyper-parameters; the embedding table sizes follow from the id mappings
DEFAULT_CONFIG = DotDict(dict(
    users=len(user_id_mapping),
    movies=len(movie_id_mapping),
    user_embedding_size=128,
    movie_embedding_size=128,
    metadata_size=128,
    hidden_size=512,
))

# Store the configuration as JSON in the file 'config'
with open('config', 'w') as f:
    f.write(json.dumps(DEFAULT_CONFIG))

class DeepRecommender(pl.LightningModule):
    """Hybrid rating regressor.

    A user embedding, a movie embedding and a linearly projected description
    vector are concatenated and mapped to a single predicted rating. Training
    and validation both minimise the mean squared error against the true rating.

    The data loaders read the notebook globals `df_hybrid_train` /
    `df_hybrid_test` and `train_tfidf` / `test_tfidf`.
    """

    # Width of the description vectors fed to `dim_reduction`
    # (the sentence-vector helper in this notebook returns 768-wide vectors)
    METADATA_INPUT_SIZE = 768

    def __init__(self, config):
        """Build the layers.

        `config` must provide `users`, `movies`, `user_embedding_size`,
        `movie_embedding_size`, `metadata_size` and `hidden_size` as attributes.
        """
        super(DeepRecommender, self).__init__()

        self.user_emb = nn.Embedding(config.users, config.user_embedding_size)
        self.movie_emb = nn.Embedding(config.movies, config.movie_embedding_size)

        self.dim_reduction = nn.Linear(self.METADATA_INPUT_SIZE, config.metadata_size)

        concat_size = (config.user_embedding_size
                       + config.movie_embedding_size
                       + config.metadata_size)
        # NOTE(review): there is no non-linearity between the two Linear layers, so the
        # head is an affine map (plus dropout while training). Left unchanged because
        # adding a module would shift the state_dict keys of saved checkpoints.
        self.output = nn.ModuleList([
            nn.Linear(concat_size, config.hidden_size),
            nn.Dropout(.2),
            nn.Linear(config.hidden_size, 1)
        ])

    def forward(self, user, movie, metadata):
        """Predict ratings.

        `user` and `movie` are integer index tensors for the embedding tables;
        `metadata` holds the description vectors (last dimension
        METADATA_INPUT_SIZE). Returns a tensor with a trailing dimension of size 1.
        """
        user_emb = self.user_emb(user)
        movie_emb = self.movie_emb(movie)
        metadata = self.dim_reduction(metadata)
        x = torch.cat([user_emb, movie_emb, metadata], dim=-1)

        for module in self.output:
            x = module(x)

        return x

    def _batch_loss(self, batch):
        """Return the MSE loss of one (user, movie, metadata, score) batch."""
        # Follow the device the parameters live on instead of hard-coding CUDA:
        # identical when the model sits on the GPU, and it also works on the CPU.
        device = self.user_emb.weight.device
        user, movie, metadata, score = (tensor.to(device) for tensor in batch)
        y_hat = self.forward(user, movie, metadata)
        # score is 1-D; add a trailing axis to match the (batch, 1) prediction
        return F.mse_loss(y_hat, score.unsqueeze(1))

    def training_step(self, batch, batch_nb):
        """Compute the training loss and expose it for logging as 'train_loss'."""
        loss = self._batch_loss(batch)
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_nb):
        """Compute the validation loss of one batch."""
        # OPTIONAL
        return {'val_loss': self._batch_loss(batch)}

    def validation_end(self, outputs):
        """Average the per-batch validation losses collected in `outputs`."""
        # OPTIONAL
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss}
        return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        """Use Adam with its default settings for all parameters."""
        # REQUIRED
        # can return multiple optimizers and learning_rate schedulers
        # (LBFGS it is automatically supported, no need for closure function)
        return torch.optim.Adam(self.parameters())

    @staticmethod
    def _make_dataset(ratings, vectors):
        """Bundle users, movies, description vectors and ratings into a TensorDataset."""
        return data_utils.TensorDataset(
            torch.LongTensor(ratings['User'].values),
            torch.LongTensor(ratings['Movie'].values),
            # Stack once in numpy; building a tensor directly from a long Python
            # list of arrays is far slower
            torch.from_numpy(np.asarray(vectors, dtype=np.float32)),
            torch.FloatTensor(ratings['Rating'].values)
        )

    @pl.data_loader
    def train_dataloader(self):
        """Shuffled batches of 32 over the training ratings."""
        dataset = self._make_dataset(df_hybrid_train, train_tfidf)
        return data_utils.DataLoader(dataset, batch_size=32, shuffle=True)

    @pl.data_loader
    def val_dataloader(self):
        """Batches of 32 over the held-out ratings, in fixed order."""
        dataset = self._make_dataset(df_hybrid_test, test_tfidf)
        return data_utils.DataLoader(dataset, batch_size=32)

In [24]:
from pytorch_lightning import Trainer

# Build the recommender from the default configuration and move it to the GPU
recommender_model = DeepRecommender(DEFAULT_CONFIG)
recommender_model = recommender_model.cuda()

# Train for at most 100 epochs; the model supplies its own data loaders
trainer = Trainer(max_nb_epochs=100)
trainer.fit(recommender_model)

INFO:root:            Name        Type Params
0       user_emb   Embedding   60 M
1      movie_emb   Embedding  575 K
2  dim_reduction      Linear   98 K
3         output  ModuleList  197 K
4       output.0      Linear  197 K
5       output.1     Dropout    0  
6       output.2      Linear  513  
Epoch 1:  94%|█████████▍| 46875/50000 [10:34<00:51, 60.32batch/s, batch_nb=46874, loss=1.116, v_nb=21]
Epoch 1:  94%|█████████▍| 47018/50000 [10:34<00:35, 84.65batch/s, batch_nb=46874, loss=1.116, v_nb=21]
Epoch 1:  94%|█████████▍| 47197/50000 [10:34<00:23, 118.51batch/s, batch_nb=46874, loss=1.116, v_nb=21]
Epoch 1:  95%|█████████▍| 47379/50000 [10:34<00:15, 164.70batch/s, batch_nb=46874, loss=1.116, v_nb=21]
Epoch 1:  95%|█████████▌| 47561/50000 [10:34<00:10, 226.46batch/s, batch_nb=46874, loss=1.116, v_nb=21]
Epoch 1:  95%|█████████▌| 47742/50000 [10:34<00:07, 306.97batch/s, batch_nb=46874, loss=1.116, v_nb=21]
Epoch 1:  96%|█████████▌| 47924/50000 [10:34<00:05, 408.93batch/s, batch_nb=4687

KeyboardInterrupt: 

In [26]:
# Persist only the learned parameters (the state_dict) of the recommender to 'model.pt'
torch.save(recommender_model.state_dict(), 'model.pt')