In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy movie catalogue in column-oriented form: position i of every list
# describes the same movie. Each genre string is a '|'-separated list of labels.
# NOTE: "Intersteller" is spelled exactly as in the original data and kept verbatim.
data = dict(
    movie_id=[1, 2, 3, 4, 5],
    title=[
        "Inception",
        "Intersteller",
        "Avatar",
        "Titanic",
        "The Dark Knight",
    ],
    genre=[
        "Sci-Fi|Action|Thriller",
        "Drama|Sci-Fi",
        "Action|Sci-Fi|Adventure",
        "Romance|Drama",
        "Action|Drama|Crime",
    ],
)

In [3]:
# Build the movies table from the column dict; the dict keys become the column names.
movies = pd.DataFrame(data)

In [4]:
# Display the full movies table (small enough to show in its entirety).
movies

Unnamed: 0,movie_id,title,genre
0,1,Inception,Sci-Fi|Action|Thriller
1,2,Intersteller,Drama|Sci-Fi
2,3,Avatar,Action|Sci-Fi|Adventure
3,4,Titanic,Romance|Drama
4,5,The Dark Knight,Action|Drama|Crime


In [5]:
# Toy ratings in column-oriented form: position i of the three lists together
# forms one (user_id, movie_id, rating) record. movie_id refers to the ids used
# in the movies catalogue.
ratings_data = dict(
    user_id=[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4],
    movie_id=[2, 3, 4, 1, 1, 2, 5, 4, 3, 4, 2, 5, 4, 1, 2, 5],
    rating=[4, 5, 3, 2, 2, 5, 2, 4, 5, 3, 1, 3, 4, 3, 5, 4],
)

In [6]:
# Build the ratings table: one row per (user_id, movie_id, rating) record.
ratings = pd.DataFrame(ratings_data)

In [7]:
# Display the ratings table.
ratings

Unnamed: 0,user_id,movie_id,rating
0,1,2,4
1,1,3,5
2,1,4,3
3,1,1,2
4,2,1,2
5,2,2,5
6,2,5,2
7,2,4,4
8,3,3,5
9,3,4,3


In [8]:
# Content-Based Filtering

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Genre strings are pipe-delimited ("Sci-Fi|Action"), so tokenize by splitting
# on '|' instead of relying on the default word regex (which would break up
# hyphenated labels such as "Sci-Fi").
# token_pattern=None states explicitly that the regex is unused; without it
# scikit-learn emits a UserWarning that 'token_pattern' will not be used since
# 'tokenizer' is not None.
vectorizer = CountVectorizer(
    tokenizer=lambda x: x.split('|'),
    token_pattern=None,
)

In [11]:
# Learn the genre vocabulary from the 'genre' column and encode every movie as
# a row of per-genre counts (returned as a sparse matrix).
genre_matrix = vectorizer.fit_transform(movies['genre'])



In [12]:
# Inspect the sparse matrix summary (dtype, stored elements, shape).
genre_matrix

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 13 stored elements and shape (5, 7)>

In [13]:
# Pairwise cosine similarity between the rows (movies) of genre_matrix.
# Entry [i, j] compares movie i with movie j, in the row order of `movies`.
cos_sim = cosine_similarity(genre_matrix)

In [14]:
# Show the raw movie-by-movie similarity array.
cos_sim

array([[1.        , 0.40824829, 0.66666667, 0.        , 0.33333333],
       [0.40824829, 1.        , 0.40824829, 0.5       , 0.40824829],
       [0.66666667, 0.40824829, 1.        , 0.        , 0.33333333],
       [0.        , 0.5       , 0.        , 1.        , 0.40824829],
       [0.33333333, 0.40824829, 0.33333333, 0.40824829, 1.        ]])

In [15]:
# Dense view of the genre counts; acceptable here because the matrix is tiny.
genre_matrix.toarray()

array([[1, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 1, 0, 1, 0],
       [1, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 1, 0, 0],
       [1, 0, 1, 1, 0, 0, 0]])

In [16]:
# Genre tokens learned by the vectorizer, in the column order of genre_matrix.
vectorizer.get_feature_names_out()

array(['action', 'adventure', 'crime', 'drama', 'romance', 'sci-fi',
       'thriller'], dtype=object)

In [17]:
# Labelled view of the genre counts: one row per movie title, one column per
# genre token learned by the vectorizer.
genre = pd.DataFrame(
    data=genre_matrix.toarray(),
    index=movies['title'],
    columns=vectorizer.get_feature_names_out(),
)

In [18]:
# Display the title-by-genre count table.
genre

Unnamed: 0_level_0,action,adventure,crime,drama,romance,sci-fi,thriller
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Inception,1,0,0,0,0,1,1
Intersteller,0,0,0,1,0,1,0
Avatar,1,1,0,0,0,1,0
Titanic,0,0,0,1,1,0,0
The Dark Knight,1,0,1,1,0,0,0


In [19]:
# Labelled view of the similarity array: movie titles on both axes, so that
# cos.loc[a, b] is the genre similarity between titles a and b.
cos = pd.DataFrame(data=cos_sim, index=movies['title'], columns=movies['title'])

In [20]:
# Display the title-by-title similarity table.
cos

title,Inception,Intersteller,Avatar,Titanic,The Dark Knight
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Inception,1.0,0.408248,0.666667,0.0,0.333333
Intersteller,0.408248,1.0,0.408248,0.5,0.408248
Avatar,0.666667,0.408248,1.0,0.0,0.333333
Titanic,0.0,0.5,0.0,1.0,0.408248
The Dark Knight,0.333333,0.408248,0.333333,0.408248,1.0


In [21]:
def recommend_by_genre(movie_title , top_n = 2):
    """Return the movies whose genres are most similar to ``movie_title``.

    Relies on two notebook-level objects: ``movies`` (a DataFrame with
    ``title`` and ``genre`` columns) and ``cos_sim`` (a square similarity
    array whose row/column order matches the row order of ``movies``).

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``movies['title']``. If the title occurs
        more than once, the first occurrence is used.
    top_n : int, default 2
        Maximum number of recommendations to return. Values <= 0 yield an
        empty result.

    Returns
    -------
    pandas.DataFrame
        The ``title`` and ``genre`` columns of the recommended movies, ordered
        from most to least similar. Ties keep the original row order.

    Raises
    ------
    IndexError
        If ``movie_title`` is not present in ``movies['title']``.
    """
    # Work with the row *position*, not the index label: cos_sim is a plain
    # array, so a label from a non-default DataFrame index would be wrong.
    matches = np.flatnonzero((movies['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie title not found: {movie_title!r}")
    idx = int(matches[0])

    scores = cos_sim[idx]
    # Exclude the queried movie explicitly. Dropping "the first sorted entry"
    # is wrong when another movie has identical genres (similarity 1.0) and
    # appears earlier: the query itself would then be recommended.
    candidates = [pos for pos in range(len(scores)) if pos != idx]
    # sorted() is stable, so equally similar movies keep their catalogue order.
    ranked = sorted(candidates, key=lambda pos: scores[pos], reverse=True)
    return movies.iloc[ranked[:max(top_n, 0)]][['title', 'genre']]

In [22]:
# Example: the two movies (default top_n) whose genres are closest to Avatar.
recommend_by_genre('Avatar')

Unnamed: 0,title,genre
0,Inception,Sci-Fi|Action|Thriller
1,Intersteller,Drama|Sci-Fi


### Collaborative Filtering

In [23]:
!python -m pip install --upgrade pip setuptools wheel
!pip install scikit-surprise

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'


  error: subprocess-exited-with-error
  
  Getting requirements to build wheel did not run successfully.
  exit code: 1
  
  [44 lines of output]
  
  Error compiling Cython file:
  ------------------------------------------------------------
  ...
          self.avg_cltr_i = avg_cltr_i
          self.avg_cocltr = avg_cocltr
  
          return self
  
      def compute_averages(self, np.ndarray[np.int_t] cltr_u,
                                               ^
  ------------------------------------------------------------
  surprise\prediction_algorithms\co_clustering.pyx:157:45: Invalid type.
  Compiling surprise/similarities.pyx because it changed.
  Compiling surprise/prediction_algorithms/matrix_factorization.pyx because it changed.
  Compiling surprise/prediction_algorithms/optimize_baselines.pyx because it changed.
  Compiling surprise/prediction_algorithms/slope_one.pyx because it changed.
  Compiling surprise/prediction_algorithms/co_clustering.pyx because it changed.
  [1/5] 

In [24]:
from surprise import Dataset 

ModuleNotFoundError: No module named 'surprise'