In [17]:
%matplotlib inline
import os
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
DATA_SET_NAME = 'ml-20m'
DATA_PATH = './data'

In [18]:
from urllib.request import urlretrieve
from tqdm import tqdm
import zipfile

class DLProgress(tqdm):
    """
    Handle Progress Bar while Downloading
    """
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        """
        A hook function that will be called once on establishment of the network connection and
        once after each block read thereafter.
        """
        self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num

        
def download_extract():
    """
    Download and extract database
    """
    url = 'http://files.grouplens.org/datasets/movielens/' + DATA_SET_NAME + '.zip'
    
    if not os.path.exists(DATA_PATH):
        os.makedirs(DATA_PATH)
    
    file_path = os.path.join(DATA_PATH, DATA_SET_NAME + '.zip')
    
    # download data:
    if not os.path.exists(file_path):
        with DLProgress(unit='B', unit_scale=True, miniters=1, desc='Downloading ml-20m.zip') as pbar:
            urlretrieve(
                url,
                file_path,
                pbar.hook)
    else:
        return
    
    print('Extracting data...')
    with zipfile.ZipFile(file_path) as zf:
        zf.extractall(DATA_PATH)

    print('Done.')

In [19]:
print('movies.csv: ')
movies = pd.read_csv(os.path.join(DATA_PATH, DATA_SET_NAME,'movies.csv'),index_col=None)
movies.describe()
movies.head(5)

movies.csv: 


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [20]:
print('ratings.csv: ')
ratings = pd.read_csv(os.path.join(DATA_PATH, DATA_SET_NAME,'ratings.csv'),index_col=None)
ratings.describe()
ratings.head(5)

ratings.csv: 


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [21]:
print('tags.csv: ')
tags = pd.read_csv(os.path.join(DATA_PATH, DATA_SET_NAME,'tags.csv'),index_col=None)
tags.describe()
tags.head(5)

tags.csv: 


Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


In [22]:
print('genome-tags.csv: ')
genome_tags = pd.read_csv(os.path.join(DATA_PATH, DATA_SET_NAME,'genome-tags.csv'),index_col=None)
genome_tags.describe()
genome_tags.head(5)

genome-tags.csv: 


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [23]:
print('genome-scores.csv: ')
genome_scores = pd.read_csv(os.path.join(DATA_PATH, DATA_SET_NAME,'genome-scores.csv'),index_col=None)
genome_scores.describe()
genome_scores.head(5)

genome-scores.csv: 


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


    # Statistique simple sur les données

In [24]:
print('The number of movies: {}'.format(movies.count()['movieId']))
print('The number of ratings: {}'.format(ratings.count()['movieId']))

print('')
print('min value of rating: {}'.format(ratings['rating'].min()))
print('max value of rating: {}'.format(ratings['rating'].max()))

print('')
ra = ratings.groupby(ratings['userId']).count()
print('The number of user in ratings.csv: {}'.format(ra.count()[0]))
print('The minimum number of ratings per user in ratings.csv: {}'.format(ra['movieId'].min()))
print('The maximun number of ratings per user in ratings.csv: {}'.format(ra['movieId'].max()))

print('')
ra = ratings.groupby(ratings['movieId']).count()
print('The number of movies in ratings.csv: {}'.format(ra.count()[0]))
print('The minimum number of ratings per movie in ratings.csv: {}'.format(ra['userId'].min()))
print('The maximun number of ratings per movie in ratings.csv: {}'.format(ra['userId'].max()))

The number of movies: 27278
The number of ratings: 20000263

min value of rating: 0.5
max value of rating: 5.0

The number of user in ratings.csv: 138493
The minimum number of ratings per user in ratings.csv: 20
The maximun number of ratings per user in ratings.csv: 9254

The number of movies in ratings.csv: 26744
The minimum number of ratings per movie in ratings.csv: 1
The maximun number of ratings per movie in ratings.csv: 67310


In [25]:
print('The number of tags in tags.csv: {}'.format(tags.count()['userId']))
print('The number of tags in genome-tags.csv: {}'.format(genome_tags.count()['tagId']))

print('')
ra = tags.groupby(tags['userId']).count()
print('The number of user in tags.csv: {}'.format(ra.count()[0]))
print('The minimum number of tags per user in tags.csv: {}'.format(ra['movieId'].min()))
print('The maximun number of tags per user in tags.csv: {}'.format(ra['movieId'].max()))

print('')
ra = tags.groupby(tags['movieId']).count()
print('The number of movies in tags.csv: {}'.format(ra.count()[0]))
print('The minimum number of tags per movie in tags.csv: {}'.format(ra['userId'].min()))
print('The maximun number of tags per movie in tags.csv: {}'.format(ra['userId'].max()))

print('')
tags_mer = pd.merge(tags, genome_tags, how='left', left_on='tag', right_on='tag')
print('The number of tags in tags.csv but not in genome-tags.csv: {}'.format(tags_mer[(tags_mer['tagId'].isnull())].count()[0]))

The number of tags in tags.csv: 465564
The number of tags in genome-tags.csv: 1128

The number of user in tags.csv: 7801
The minimum number of tags per user in tags.csv: 1
The maximun number of tags per user in tags.csv: 20356

The number of movies in tags.csv: 19545
The minimum number of tags per movie in tags.csv: 1
The maximun number of tags per movie in tags.csv: 1994

The number of tags in tags.csv but not in genome-tags.csv: 247993


In [26]:
print('The length of genome_scores.csv: {}'.format(genome_scores.count()['movieId']))
print('max value of relevance from genome_scores.csv: {}'.format(genome_scores['relevance'].max()))
print('min value of relevance from genome_scores.csv: {}'.format(genome_scores['relevance'].min()))

print('')
ra = genome_scores.groupby(genome_scores['movieId']).count()
print('The number of movies in genome_scores.csv: {}'.format(ra.count()[0]))
print('The minimum number of tags per movie in genome_scores.csv: {}'.format(ra['tagId'].min()))
print('The maximun number of tags per movie in genome_scores.csv: {}'.format(ra['tagId'].max()))

The length of genome_scores.csv: 11709768
max value of relevance from genome_scores.csv: 1.0
min value of relevance from genome_scores.csv: 0.00024999999999997247

The number of movies in genome_scores.csv: 10381
The minimum number of tags per movie in genome_scores.csv: 1128
The maximun number of tags per movie in genome_scores.csv: 1128


In [11]:
# Analysis the relevant data of movies in both genome_scores.csv and ratings.csv:

genome_scores_group = genome_scores.groupby(genome_scores['movieId']).mean()
ratings_group = ratings.groupby(ratings['movieId']).mean()
rat_ge_merge = pd.merge(ratings_group, genome_scores_group, how='inner', left_on='movieId', right_on='movieId')
number = rat_ge_merge.count()[0]
print('Number of movies in both genome_scores.csv and ratings.csv: {}. Take up {}% of ratings.csv'\
      .format(number, round(number/19545*100)))

ratings_genome_merge = pd.merge(ratings, genome_scores_group, how='inner', left_on='movieId', right_on='movieId')
number = ratings_genome_merge.count()[0]
print('Number of ratings where its movieId in genome_scores.csv: {}. Take up {}% of ratings.csv'\
      .format(number, round(number/20000263*100)))

print('')
ra = ratings_genome_merge.groupby(ratings_genome_merge['userId']).count()
number = ra.count()[0]
print('{} users rate the movies appearing in both genome_scores.csv and ratings.csv. Take up {}% of ratings.csv'\
      .format(number, round(number/138493*100)))
print('Minimum number of ratings per user for the movies appearing in both genome_scores.csv and ratings.csv: {}'.format(ra['movieId'].min()))

Number of movies in both genome_scores.csv and ratings.csv: 10370. Take up 53.0% of ratings.csv
Number of ratings where its movieId in genome_scores.csv: 19800443. Take up 99.0% of ratings.csv

138493 users rate the movies appearing in both genome_scores.csv and ratings.csv. Take up 100.0% of ratings.csv
Minimum number of ratings per user for the movies appearing in both genome_scores.csv and ratings.csv: 13


# Hypotheses et model de l'apprentissage automatique

## Hypothèse

1-Les balises de genome-tags.csv constituent l'ensemble complet des espaces vectoriels de balises. Les autres balises ne figurant pas dans génome-tags.csv sont des combinaisons linéaires de balises dans le génome-tags.csv.
2-La fonctionnalité des films peut être parfaitement représentée par des balises dans genome-tags.csv, telles que le vecteur de pertinence dans genome_scores.csv.
Le vecteur de pertinence dans genome_scores.csv est correct et peut représenter la fonctionnalité de films.
3- Ignorer la qualité des films.
4-Nous ne pouvons pas obtenir d'autres informations sur les films en dehors de l'ensemble de données. Donc, nous n'utilisons pas links.csv.
5- La durée de sortie des films n’affecte pas.
6- L'horodatage dans ratings.csv: et tags.csv n'affecte pas.

# Creation de données d'entrainement 

In [28]:
# Preprocess data (You should run last code cell to get 'ratings_genome_merge')
# The first column of features is userId, the next is movieId.
# The only one column of target is rating.

remove_fields = ['timestamp','tagId','relevance','rating']
target = ratings_genome_merge['rating']
feature = ratings_genome_merge.drop(remove_fields, axis=1)
features = feature.values
target = target.values

genome_scores_dict = {}
for i in range(10381):
    m_id = -1
    vec = []
    for j in range(1128):
        index = j + i * 1128
        if m_id < 0:
            m_id = genome_scores['movieId'][index]
        assert genome_scores['movieId'][index] == m_id
        assert genome_scores['tagId'][index] == j + 1
        vec.append(genome_scores['relevance'][index])
    genome_scores_dict[str(m_id)] = vec

In [29]:
# Actually, using train_test_split in here is not best. 
# The better method should split the data according the userId, which make sure every user is in the test set.
# But here, let us make it easier and quickly ( We have already include 99.86% users).
from sklearn.model_selection import train_test_split
train_features,test_features, train_target, test_target = train_test_split(features,  
                                                           target,  
                                                           test_size = 0.2,  
                                                           random_state = 0)

dict_t = {}
dict_t['userId'] = test_features[:,0]
dict_t['movieId'] = test_features[:,1]
pd_data = pd.DataFrame.from_dict(dict_t)
user_test = pd_data.groupby(pd_data['userId']).count().count()[0]

print('{}% users in test set ({} users)'.format(round(user_test/138493*100, 2), user_test ))

dict_t = {}
dict_t['userId'] = train_features[:,0]
dict_t['movieId'] = train_features[:,1]
pd_data = pd.DataFrame.from_dict(dict_t)
user_train = pd_data.groupby(pd_data['userId']).count().count()[0]

print('{}% users in training set ({} users)'.format(round(user_train/138493*100, 2), user_train ))

99.86% users in test set (138294 users)
100.0% users in training set (138493 users)


In [30]:
# Save preprocess data to './data/verify_assumption.data'
pickle.dump((train_features, test_features, train_target, test_target, genome_scores_dict), open('./data/verify_assumption.data', 'wb'))

In [31]:
# Load preprocess data from './data/verify_assumption.data'
train_features, test_features, train_target, test_target, genome_scores_dict = pickle.load(open('./data/verify_assumption.data', mode='rb'))

# les paramettres du model


In [32]:
batch_size = 768  # batch size "taille du lot"
lr = 1e-3         # learning rate "taux d'apprentissage"
feature_dim = 512 # Dimension of movie or user feature vector 'Dimension du film ou du vecteur de fonctionnalités utilisateur'
Epoch = 6         # train epoch "époque du train"

# Model

In [33]:
class Verify_Assumption_Model(nn.Module):
    """The whole model"""
    def __init__(self):
        super(Verify_Assumption_Model, self).__init__()
        self.emb_user = nn.Embedding(138493 + 1, 512, # use ratings['userId'].max()+1 instead of 138493+1 is better
                            padding_idx=0)
        
        self.movie_transfrom = nn.Sequential(
            nn.Linear(1128, 512),
            nn.Tanh(), # activation function can not be the final layer of Sequential. But it can be the first one.
            nn.Linear(512, 512)
        )
    
    def forward(self, userId, movieVector):
        v_user  = self.emb_user(userId)
        v_movie = self.movie_transfrom(movieVector)
        v_user.unsqueeze_(1)
        v_movie.unsqueeze_(2)
        return torch.bmm(v_user,v_movie)

In [34]:
len_train_features = len(train_features)
index = 0
model = Verify_Assumption_Model()
model.cuda()

loss_fn = torch.nn.MSELoss(reduce=False, size_average=False)
opt = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                               lr=lr, weight_decay=0)
losses = {'train':[], 'test':[]}

for epoch_i in range(Epoch):
    index = 0
    while index <= len_train_features:
        index_end = index + batch_size
        if index_end >= len_train_features:
            batch_train = train_features[index:len_train_features]
            batch_train_target = train_target[index:len_train_features]
        else:
            batch_train = train_features[index:index_end]
            batch_train_target = train_target[index:index_end]

        #assert len(batch_train) == len(batch_train_target)

        userId = batch_train[:,0]
        movieId = batch_train[:,1]
        movie_vec = []
        for i in range(len(movieId)):
            movie_vec.append(genome_scores_dict[str(movieId[i])])


        rating = model(torch.tensor(userId, requires_grad = False).cuda(),torch.tensor(movie_vec, requires_grad = False).cuda())
        rating = rating.squeeze_(1).squeeze_(1)
        loss = sum(loss_fn(rating,torch.tensor(batch_train_target,dtype=torch.float32,requires_grad = False).cuda()))

        opt.zero_grad()
        loss.backward()
        losses['train'].append(loss.detach().cpu().numpy())
        opt.step()
        if len(losses['train']) % 500 == 0:
            print('Epoch {:>3} Batch {:>4}/15840354   train_loss = {:.3f}'.format(
                        epoch_i,
                        index,
                        losses['train'][len(losses['train'])-1]))
        index += batch_size
        
    #############################test#############################
    
    len_test_features = len(test_features)
    index = 0

    while index <= len_test_features:
        index_end = index + batch_size
        if index_end >= len_train_features:
            batch_train = test_features[index:len_train_features]
            batch_train_target = test_target[index:len_train_features]
        else:
            batch_train = test_features[index:index_end]
            batch_train_target = test_target[index:index_end]

        #assert len(batch_train) == len(batch_train_target)

        userId = batch_train[:,0]
        movieId = batch_train[:,1]
        movie_vec = []
        for i in range(len(movieId)):
            movie_vec.append(genome_scores_dict[str(movieId[i])])


        rating = model(torch.tensor(userId, requires_grad = False).cuda(),torch.tensor(movie_vec, requires_grad = False).cuda())
        rating = rating.squeeze_(1).squeeze_(1)
        loss = sum(loss_fn(rating,torch.tensor(batch_train_target,dtype=torch.float32,requires_grad = False).cuda()))

        losses['test'].append(loss.detach().cpu().numpy())
        if len(losses['test']) % 500 == 0:
            print('Epoch {:>3} Batch {:>4}/3960089   test_loss = {:.3f}'.format(
                        epoch_i,
                        index,
                        losses['test'][len(losses['test'])-1]))
        index += batch_size

RuntimeError: cuda runtime error (38) : no CUDA-capable device is detected at /opt/conda/conda-bld/pytorch_1556653183467/work/aten/src/THC/THCGeneral.cpp:51