# Recommender system

In [1]:
import csv
import numpy as np
from scipy.sparse import lil_matrix
from tqdm import tqdm
import math

### Data loader

Loading the data and representing it as a sparce utility matrix (review_data).

Note that indeces in the dataset are not sequential. That is why we create a separate functiona for translating dataset indeces into unitily matrix (review_data) indeces and back.

In [2]:
users_index = []
movies_index = []

num_users_processed = 0
num_movies_processed = 0

def get_user_index(dataset_user_id):
    global num_users_processed
    global users_index
    if dataset_user_id not in users_index:
        users_index.append(dataset_user_id)
        num_users_processed += 1
    return users_index.index(dataset_user_id)

def get_movie_index(dataset_movie_id):
    global num_movies_processed
    global movies_index
    if dataset_movie_id not in movies_index:
        movies_index.append(dataset_movie_id)
        num_movies_processed += 1
    return movies_index.index(dataset_movie_id)

In [3]:
num_movies = 9125
num_users = 671

review_data = lil_matrix((num_users, num_movies))

data_file = 'ratings.csv' 

with open(data_file) as ratings_file:
    rating_reader = csv.reader(ratings_file, delimiter=',')
    next(rating_reader)
    iter = 0

    for record in rating_reader:
        user_id, movie_id, rating, _ = record
        user_id = get_user_index(int(user_id))
        movie_id = get_movie_index(int(movie_id))
        rating = float(rating)
        review_data[user_id, movie_id] = rating
        iter += 1
        if iter % 10000 == 0: print('Processed ', iter, 'records out of 100k.')

Processed  10000 records out of 100k.
Processed  20000 records out of 100k.
Processed  30000 records out of 100k.
Processed  40000 records out of 100k.
Processed  50000 records out of 100k.
Processed  60000 records out of 100k.
Processed  70000 records out of 100k.
Processed  80000 records out of 100k.
Processed  90000 records out of 100k.
Processed  100000 records out of 100k.


In [4]:
movies = {}

data_file = 'movies.csv' 

with open(data_file) as movie_file:
    movie_reader = csv.reader(movie_file, delimiter=',')
    next(movie_reader)
    for record in movie_reader:
        movie_id, title, _ = record
        movies[int(movie_id)] = title

In [5]:
def get_movie_name_by_id(id):
    dataset_movie_id = movies_index[id]
    return movies[dataset_movie_id]

def get_movie_id_by_name(movie_name):
    movie_data_id = int(list(movies.keys())[list(movies.values()).index(movie_name)])
    return movies_index.index(movie_data_id)

In [6]:
print(get_movie_id_by_name('Matrix, The (1999)'))
print(get_movie_name_by_id(402))

402
Matrix, The (1999)


### Item-item collaborative filtering

Please compute item-item collaborative filtering from the utility matrix (review_data). The output of your computation should be pairwise similarities between all movies.

As the order of the pair does not change the similarity metric, you can represent it as a dictionary with use [frozensets](https://docs.python.org/2.4/lib/types-set.html) as its keys. For example:

similarity = {}

similarity[frozenset(movie1_id, movie2_id)] = 0.67

In [7]:
print(review_data)

  (0, 0)	2.5
  (0, 1)	3.0
  (0, 2)	3.0
  (0, 3)	2.0
  (0, 4)	4.0
  (0, 5)	2.0
  (0, 6)	2.0
  (0, 7)	2.0
  (0, 8)	3.5
  (0, 9)	2.0
  (0, 10)	2.5
  (0, 11)	1.0
  (0, 12)	4.0
  (0, 13)	4.0
  (0, 14)	3.0
  (0, 15)	2.0
  (0, 16)	2.0
  (0, 17)	2.5
  (0, 18)	1.0
  (0, 19)	3.0
  (1, 20)	4.0
  (1, 21)	5.0
  (1, 22)	5.0
  (1, 23)	4.0
  (1, 24)	4.0
  (1, 25)	3.0
  (1, 26)	3.0
  (1, 27)	4.0
  (1, 28)	3.0
  (1, 29)	5.0
  (1, 30)	4.0
  (1, 31)	3.0
  (1, 32)	3.0
  (1, 33)	3.0
  (1, 34)	3.0
  (1, 35)	3.0
  (1, 36)	3.0
  (1, 37)	5.0
  (1, 38)	1.0
  (1, 39)	3.0
  (1, 40)	3.0
  (1, 41)	3.0
  (1, 42)	4.0
  (1, 43)	4.0
  (1, 44)	5.0
  (1, 45)	5.0
  (1, 46)	3.0
  (1, 47)	4.0
  (1, 48)	3.0
  (1, 49)	4.0
  (1, 50)	3.0
  (1, 51)	4.0
  (1, 52)	2.0
  (1, 53)	1.0
  (1, 54)	3.0
  (1, 55)	4.0
  (1, 56)	4.0
  (1, 57)	3.0
  (1, 58)	3.0
  (1, 59)	3.0
  (1, 60)	3.0
  (1, 61)	2.0
  (1, 62)	3.0
  (1, 63)	3.0
  (1, 64)	3.0
  (1, 65)	3.0
  (1, 66)	2.0
  (1, 67)	3.0
  (1, 68)	4.0
  (1, 69)	3.0
  (1, 70)	4.0
  (1, 71)	2.0
  

In [8]:
adjusted_review_data = lil_matrix((num_users, num_movies))

# --------------- YOUR CODE HERE ---------------
mean_vote = sum([np.sum(np.sum(el.data)) for el in review_data])/review_data.count_nonzero()

for el, z in enumerate(review_data):
    for place in z.nonzero()[1]:
        adjusted_review_data[el, place] = review_data[el, place] - mean_vote

def do_similarity(data):
    sim = {}
    
    product = data.T.dot(data.toarray())
    normalize = np.array([np.sqrt(np.diagonal(product))])
    output = product/ normalize.T /normalize
    
    for idx in tqdm(range(len(output))):
        for movie_indx in range(len(output[idx])):
            if output[idx][movie_indx] and math.isnan(output[idx][movie_indx])!=True:
                sim[frozenset([idx, movie_indx])] = output[idx][movie_indx]
    return sim
# ---------------------------------------------



In [9]:
print(get_movie_name_by_id(0))

Dangerous Minds (1995)


### Finding most similar movies

Using your item-item similarity, find 5 movies you would recommend the someone who likes the following:
- Matrix, The (1999)
- Toy Story (1995)
- From Dusk Till Dawn (1996)
- Gone with the Wind (1939)
- Iron Man (2008)

In other words, find 5 most similar movies to each of the above using your similarity metric. You may find useful functions get_movie_name_by_id() and get_movie_id_by_name() here.

In [12]:
# --------------- YOUR CODE HERE ---------------
to_find_movies = ["Matrix, The (1999)", "Toy Story (1995)", "From Dusk Till Dawn (1996)", "Gone with the Wind (1939)", "Iron Man (2008)"]

    
def retrieve_top_5(initial_movies, sim):
    for in_movie in initial_movies:
        lst_results = list()
        numb = get_movie_id_by_name(in_movie)
        for el in sim.keys():
            if numb in list(el):
                lst_results.append([el, sim[el]])

        lst_results.sort(key=lambda x: x[1], reverse=True)
        top = lst_results[:6]

        print("Most similar movies to "+ '"'+get_movie_name_by_id(numb)+'"' + " :")
        for mv in top:
            if len(list(mv[0])) == 2:
                print(get_movie_name_by_id([x for x in list(mv[0]) if x != numb][0])  )
        print()

# ----------------------------------------------

# Here are two attitudes of recommender system design

1. Recomender System based on general similarity (general):
https://ashokharnal.wordpress.com/2014/12/18/worked-out-example-item-based-collaborative-filtering-for-recommenmder-engine/

2. Recomender System based on adjusted rating similarity (custom):
https://medium.com/@tomar.ankur287/item-item-collaborative-filtering-recommender-system-in-python-cf3c945fae1e

The main difference is that in the second example I substract the mean value from each score in the score matrix at the very beginning.

# 1. Recomender System based on general similarity

In [13]:
similarity = do_similarity(review_data)
retrieve_top_5(to_find_movies, similarity)

  from ipykernel import kernelapp as app
100%|██████████| 9125/9125 [00:59<00:00, 152.76it/s]


Most similar movies to "Matrix, The (1999)" :
Lord of the Rings: The Fellowship of the Ring, The (2001)
Lord of the Rings: The Two Towers, The (2002)
Fight Club (1999)
Back to the Future (1985)
Lord of the Rings: The Return of the King, The (2003)

Most similar movies to "Toy Story (1995)" :
Toy Story 2 (1999)
Star Wars: Episode IV - A New Hope (1977)
Forrest Gump (1994)
Independence Day (a.k.a. ID4) (1996)
Groundhog Day (1993)

Most similar movies to "From Dusk Till Dawn (1996)" :
Nightmare on Elm Street, A (1984)
Sleepy Hollow (1999)
Batman Returns (1992)
Candyman (1992)
Alien³ (a.k.a. Alien 3) (1992)

Most similar movies to "Gone with the Wind (1939)" :
Casablanca (1942)
It's a Wonderful Life (1946)
Wizard of Oz, The (1939)
African Queen, The (1951)
North by Northwest (1959)

Most similar movies to "Iron Man (2008)" :
Dark Knight, The (2008)
Star Trek (2009)
Batman Begins (2005)
Avatar (2009)
Avengers, The (2012)



# 2. Recomender System based on adjusted rating similarity

In [15]:
similarity = do_similarity(adjusted_review_data)
retrieve_top_5(to_find_movies, similarity)

  from ipykernel import kernelapp as app
100%|██████████| 9125/9125 [02:18<00:00, 65.84it/s]  


Most similar movies to "Matrix, The (1999)" :
Lord of the Rings: The Fellowship of the Ring, The (2001)
Fight Club (1999)
Lord of the Rings: The Two Towers, The (2002)
Star Wars: Episode V - The Empire Strikes Back (1980)
Star Wars: Episode IV - A New Hope (1977)

Most similar movies to "Toy Story (1995)" :
Toy Story 2 (1999)
Bug's Life, A (1998)
Monsters, Inc. (2001)
Lion King, The (1994)
Toy Story 3 (2010)

Most similar movies to "From Dusk Till Dawn (1996)" :
Half Baked (1998)
U Turn (1997)
8 Heads in a Duffel Bag (1997)
Never Been Kissed (1999)
Skulls, The (2000)

Most similar movies to "Gone with the Wind (1939)" :
All About Eve (1950)
Sophie's Choice (1982)
Five Easy Pieces (1970)
French Connection, The (1971)
Last Picture Show, The (1971)

Most similar movies to "Iron Man (2008)" :
Dark Knight, The (2008)
Avengers, The (2012)
Guardians of the Galaxy (2014)
How to Train Your Dragon (2010)
Captain America: The Winter Soldier (2014)

