In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from scipy import linalg

In [2]:
ratings = pd.read_csv("../data/ratings_small.csv")

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
movies_metadata = pd.read_csv("../data/movies_metadata.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [6]:
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [7]:
literal_eval(movies_metadata['genres'].loc[0])

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [8]:
def extract_genres(x):
    x = literal_eval(x)
    if isinstance(x, list):
        genre_names = [i['name'] for i in x]
        if len(genre_names) > 3:
            genre_names= genre_names[:3]
        return genre_names
    return []

In [9]:
movies_metadata['genres']= movies_metadata['genres'].apply(extract_genres)

In [10]:
movies_metadata['genres']

0         [Animation, Comedy, Family]
1        [Adventure, Fantasy, Family]
2                   [Romance, Comedy]
3            [Comedy, Drama, Romance]
4                            [Comedy]
                     ...             
45461                 [Drama, Family]
45462                         [Drama]
45463       [Action, Drama, Thriller]
45464                              []
45465                              []
Name: genres, Length: 45466, dtype: object

In [11]:
movies = movies_metadata.drop(columns=['adult', 'belongs_to_collection', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'video',
       'vote_average', 'vote_count'])

In [12]:
movies.head(2)

Unnamed: 0,genres,id,title
0,"[Animation, Comedy, Family]",862,Toy Story
1,"[Adventure, Fantasy, Family]",8844,Jumanji


In [13]:
movies = movies.rename(columns={'id':'movieId'})

In [14]:
movies.head(3)

Unnamed: 0,genres,movieId,title
0,"[Animation, Comedy, Family]",862,Toy Story
1,"[Adventure, Fantasy, Family]",8844,Jumanji
2,"[Romance, Comedy]",15602,Grumpier Old Men


In [15]:
movies.shape

(45466, 3)

In [16]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


In [17]:
ratings.shape

(100004, 4)

In [18]:
ratings_matrix = np.ndarray(
    shape=(np.max(ratings.movieId.values), np.max(ratings.userId.values)),
    dtype=np.uint8)
ratings_matrix[ratings.movieId.values-1, ratings.userId.values-1] = ratings.rating.values

In [19]:
ratings_matrix

array([[0, 0, 0, ..., 0, 4, 5],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [20]:
ratings_matrix.shape

(163949, 671)

In [21]:
normalised_mat = ratings_matrix - np.asarray([(np.mean(ratings_matrix, 1))]).T
normalised_mat

array([[-1.38897168, -1.38897168, -1.38897168, ..., -1.38897168,
         2.61102832,  3.61102832],
       [-0.5290611 , -0.5290611 , -0.5290611 , ..., -0.5290611 ,
        -0.5290611 , -0.5290611 ],
       [-0.27123696, -0.27123696, -0.27123696, ..., -0.27123696,
        -0.27123696, -0.27123696],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00745156, -0.00745156, -0.00745156, ..., -0.00745156,
        -0.00745156, -0.00745156]])

In [22]:
normalised_mat.shape

(163949, 671)

In [23]:
A = normalised_mat.T / np.sqrt(ratings_matrix.shape[0] - 1)

In [24]:
U, S, V = linalg.svd(A, full_matrices=False)

array([[-0.02995889,  0.01304345,  0.01532281, ..., -0.01214882,
         0.00446713,  0.03860457],
       [-0.01449773,  0.02353186, -0.05329651, ..., -0.01078177,
        -0.04822652,  0.03860457],
       [-0.01786897, -0.00365745,  0.00323437, ...,  0.00127791,
         0.00211293,  0.03860457],
       ...,
       [-0.02412314,  0.0098662 ,  0.01326295, ...,  0.00391374,
        -0.00112149,  0.03860457],
       [-0.01963928,  0.00971444,  0.00273814, ...,  0.00333273,
        -0.00738326,  0.03860457],
       [ 0.01609025, -0.01900362,  0.00573239, ..., -0.00010602,
        -0.0029862 ,  0.03860457]])