In [None]:
!wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip ml-latest-small.zip
!ls

In [None]:
! pip install scikit-surprise

In [3]:
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from collections import defaultdict
from operator import itemgetter
import heapq
import os
import csv

In [4]:
import pandas as pd

In [5]:
rating_df = pd.read_csv('ml-latest-small/ratings.csv', encoding='ISO-8859-1')
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
movies_df = pd.read_csv('ml-latest-small/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
movieID_to_name = (movies_df
      .loc[:, ['movieId', 'title']]
      .drop_duplicates()
      .set_index('movieId')
      .to_dict()['title'])

In [8]:
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
ratings_dataset = Dataset.load_from_file('ml-latest-small/ratings.csv', reader=reader)

In [9]:
trainset = ratings_dataset.build_full_trainset()

In [10]:
similarity_matrix = KNNBasic(sim_options={
        'name': 'cosine',
        'user_based': False
        })\
        .fit(trainset)\
        .compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [11]:
def getMovieName(movieID):
  if int(movieID) in movieID_to_name:
    return movieID_to_name[int(movieID)]
  else:
      return ""

In [16]:
def suggest_recommendations(test_subject: int):
  test_subject = str(test_subject)
  k = 20
  test_subject_iid = trainset.to_inner_uid(test_subject)
  test_subject_ratings = trainset.ur[test_subject_iid]
  k_neighbors = heapq.nlargest(k, test_subject_ratings, key=lambda t: t[1])
  candidates = defaultdict(float)

  for itemID, rating in k_neighbors:
      try:
        similaritities = similarity_matrix[itemID]
        for innerID, score in enumerate(similaritities):
            candidates[innerID] += score * (rating / 5.0)
      except:
        continue

  watched = {}
  for itemID, rating in trainset.ur[test_subject_iid]:
    watched[itemID] = 1

  recommendations = []

  position = 0
  for itemID, rating_sum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
      recommendations.append(getMovieName(trainset.to_raw_iid(itemID)))
      position += 1
      if (position > 10): break

  for rec in recommendations:
    print("Movie: ", rec)

In [17]:
suggest_recommendations(1)

[(3, 5.0), (4, 5.0), (6, 5.0), (8, 5.0), (9, 5.0), (10, 5.0), (11, 5.0), (13, 5.0), (15, 5.0), (18, 5.0), (21, 5.0), (25, 5.0), (28, 5.0), (31, 5.0), (35, 5.0), (36, 5.0), (38, 5.0), (44, 5.0), (45, 5.0), (46, 5.0)]
Movie:  Hang 'Em High (1968)
Movie:  Moonlight Mile (2002)
Movie:  Meet John Doe (1941)
Movie:  Audition (Ôdishon) (1999)
Movie:  Other Boleyn Girl, The (2008)
Movie:  John Adams (2008)
Movie:  Barbarian Invasions, The (Les invasions barbares) (2003)
Movie:  Wind That Shakes the Barley, The (2006)
Movie:  Spider (2002)
Movie:  Man Without a Past, The (Mies vailla menneisyyttä) (2002)
Movie:  Forgotten, The (2004)
