## Experiment script on SVD (Singular value decomposition algorithm)
1. In the normalisation step, SVD consumes a lot of memory to process a table of only 100k rows
2. The evaluation step is quite hard to conduct, since the data is outdated and we don't have a benchmark to compare against


In [1]:
import numpy as np
import pandas as pd


In [3]:
# 1. Load datasets
# Each file is read with '::' as the field separator and explicit column
# names. pandas treats a multi-character separator as a regular expression,
# which is why the python parsing engine is requested.
rating_data = pd.read_csv(
    '../sample_dataset/ratings.dat',
    sep='::',
    engine='python',
    names=['user_id', 'movie_id', 'rating', 'time'],
)
movie_data = pd.read_csv(
    '../sample_dataset/movies.dat',
    sep='::',
    engine='python',
    names=['movie_id', 'title', 'genre'],
)
user_data = pd.read_csv(
    '../sample_dataset/users.dat',
    sep='::',
    engine='python',
    names=['user_id', 'twitter_id'],
)

## 1.1 Check the results: print dtypes, non-null counts and memory usage
for frame in (rating_data, movie_data, user_data):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
user_id     100000 non-null int64
movie_id    100000 non-null int64
rating      100000 non-null int64
time        100000 non-null int64
dtypes: int64(4)
memory usage: 3.1 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 3 columns):
movie_id    10506 non-null object
title       0 non-null float64
genre       0 non-null float64
dtypes: float64(2), object(1)
memory usage: 246.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16554 entries, 0 to 16553
Data columns (total 2 columns):
user_id       16554 non-null int64
twitter_id    16554 non-null int64
dtypes: int64(2)
memory usage: 258.7 KB


In [7]:
# 2. Reshape datasets for SVD ingestion

## 2.1 Relationship matrix with rows as movies, columns as users, and rating as cells
# Ids are used as 1-based positions (hence the `- 1` below), so the largest id
# seen on each axis determines the matrix shape.
n_movies = np.max(rating_data.movie_id.values)
n_users = np.max(rating_data.user_id.values)

# np.zeros instead of np.ndarray: constructing np.ndarray directly leaves the
# buffer uninitialised, so every (movie, user) pair without a rating would hold
# arbitrary bytes and corrupt the row means and the decomposition that follow.
rating_mat = np.zeros(shape=(n_movies, n_users), dtype=np.uint8)

# Scatter each rating into its (movie, user) cell; unrated cells stay 0.
rating_mat[rating_data.movie_id.values - 1, rating_data.user_id.values - 1] = rating_data.rating.values


In [9]:
## 2.2 Normalise the matrix
# Mean of each movie row, taken over every user column of rating_mat.
tmp = np.mean(rating_mat, 1)
# Reshape the 1-D means into a column vector so the subtraction below
# broadcasts one mean across each row. This line previously read
# rating_mat_mean on its own right-hand side before it had been assigned,
# which raises NameError; the row means are held in tmp.
rating_mat_mean = np.asarray([tmp]).T
# Centre every row on its mean (the result is a float array).
normalised_mat = rating_mat - rating_mat_mean


In [None]:
# 3. Compute SVD
# Transpose so rows are users and columns are movies, then scale by
# sqrt(number of movie rows - 1). The divisor previously referenced the
# undefined name `ratings_mat` (NameError); the matrix is called rating_mat.
A = normalised_mat.T / np.sqrt(rating_mat.shape[0] - 1)
# np.linalg.svd returns the right singular vectors as the ROWS of its third
# output, so V here is already transposed and has one column per movie.
# NOTE(review): the default full_matrices=True also builds a square U with one
# row and column per user; passing full_matrices=False would likely cut memory
# use considerably — confirm nothing relies on the full U before changing it.
U, S, V = np.linalg.svd(A)


In [None]:
## 3.1 Similarity func
def top_cosine_similarity(data, movie_id, top_n=10):
    """Rank the rows of *data* by cosine similarity to one movie's row.

    Parameters
    ----------
    data : 2-D array
        One row per movie; the row for movie ``m`` is at position ``m - 1``.
    movie_id : int
        1-based id of the query movie.
    top_n : int, optional
        How many row positions to return (default 10).

    Returns
    -------
    1-D integer array
        0-based row positions of the ``top_n`` most similar rows, most
        similar first. The query row is compared with itself as well, so it
        is normally the first entry. Rows whose norm is zero produce ``nan``
        similarities.
    """
    row = movie_id - 1  # ids start at 1, array positions at 0
    # Euclidean norm of every row: per-row sum of squares, then square root.
    norms = np.sqrt(np.einsum('ij, ij -> i', data, data))
    # Dot product of the query row with every row (itself included).
    dots = np.dot(data[row, :], data.T)
    cosine = dots / (norms[row] * norms)
    # Negating the scores turns argsort's ascending order into descending.
    ranked = np.argsort(-cosine)
    return ranked[:top_n]

## 3.2 Print results
def print_similar_movies(movie_data, movie_id, top_indexes):
    """Print the query movie's title followed by the recommended titles.

    Parameters
    ----------
    movie_data : DataFrame
        Must expose ``movie_id`` and ``title`` columns.
    movie_id : int
        Id of the movie the recommendations are for.
    top_indexes : numpy array of int
        0-based positions of the recommended movies; 1 is added to each to
        recover the movie id.

    Raises
    ------
    IndexError
        If an id has no matching row in ``movie_data``.
    """
    def _title_of(wanted_id):
        # First title among the rows whose movie_id equals wanted_id.
        matches = movie_data[movie_data.movie_id == wanted_id]
        return matches.title.values[0]

    print('Recommendations for {0}: \n'.format(_title_of(movie_id)))
    # Shift the 0-based positions back to 1-based movie ids.
    for similar_id in top_indexes + 1:
        print(_title_of(similar_id))


In [None]:
# 4. Execute
## 4.1 Set constant variables
top_n = 10     # how many similar movies to list
movie_id = 10  # (getting an id from movies.dat)
k = 50         # number of leading components kept for each movie

## 4.2 Build the reduced representation and rank movies against movie_id
# The first k rows of V, transposed: one row per movie, k columns.
sliced = V[:k, :].T  # representative data
indexes = top_cosine_similarity(sliced, movie_id, top_n=top_n)

## 4.3 Call main func
print_similar_movies(movie_data, movie_id, top_indexes=indexes)
