<a href="https://colab.research.google.com/github/mcucii/ML-projects/blob/main/Movie%20Recommender%20Systems/Movie_Recommender_Systems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [92]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import MultiLabelBinarizer


# https://www.kaggle.com/code/rounakbanik/movie-recommender-systems/notebook

In [93]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [94]:
movie_dir = '/content/drive/MyDrive/ML_projects/Movie Recommender Systems/movie_dataset'

In [95]:
os.listdir(movie_dir)

['credits.csv',
 'links_small.csv',
 'keywords.csv',
 'movies_metadata.csv',
 'links.csv',
 'ratings.csv',
 'ratings_small.csv']



## SIMPLE RECOMMENDER
- generalized recommendations based on popularity
- not customized



In [96]:
md = pd.read_csv(os.path.join(movie_dir, 'movies_metadata.csv'))

  md = pd.read_csv(os.path.join(movie_dir, 'movies_metadata.csv'))


In [97]:
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [98]:
from ast import literal_eval
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x : [i['name'] for i in x] if isinstance(x, list) else [])

In [99]:
md['genres'].head()

Unnamed: 0,genres
0,"[Animation, Comedy, Family]"
1,"[Adventure, Fantasy, Family]"
2,"[Romance, Comedy]"
3,"[Comedy, Drama, Romance]"
4,[Comedy]



*IMDB formula for calculating top movies:*

$ WR = \frac{v}{m+v} * R + \frac{m}{v+m} * C$

- R = average rating for the movie (mean)
- C = the mean vote across the whole report
- v = number of votes for the movie
- m = min. votes required to be listed on top 250 -> more votes than 95% of the movies




In [100]:
# taking out nulls
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

np.float64(5.244896612406511)

In [101]:
m = vote_counts.quantile(0.95)   # 95 percentile
m

np.float64(434.0)

In [102]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x : str(x).split('-')[0] if x != np.nan else np.nan)

In [103]:
qualified = md[(md['vote_count'] != np.nan) & (md['vote_count'] >= m) & (md['vote_average'] != np.nan)][['title', 'year', 'vote_count', 'vote_average', 'genres', 'popularity']]
qualified['vote_count']= qualified['vote_count'].astype('int')
qualified['vote_average']= qualified['vote_average'].astype('int')


In [104]:
qualified.shape

(2274, 6)

In [105]:
def imdb_rating(x):
  v = x['vote_count']
  R = x['vote_average']
  return R*(v/(v+m)) + C*(m/(v+m))

In [106]:
qualified[qualified['title'] == 'Fight Club']

Unnamed: 0,title,year,vote_count,vote_average,genres,popularity
2843,Fight Club,1999,9678,8,[Drama],63.869599


In [107]:
qualified['imdb_rating'] = qualified.apply(imdb_rating, axis=1)

In [108]:
qualified[qualified['title'] == 'Fight Club']

Unnamed: 0,title,year,vote_count,vote_average,genres,popularity,imdb_rating
2843,Fight Club,1999,9678,8,[Drama],63.869599,7.881753


In [109]:
qualified = qualified.sort_values('imdb_rating', ascending=False).head(250)

In [110]:
# TOP MOVIES:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,genres,popularity,imdb_rating
15480,Inception,2010,14075,8,"[Action, Thriller, Science Fiction, Mystery, A...",29.108149,7.917588
12481,The Dark Knight,2008,12269,8,"[Drama, Action, Crime, Thriller]",123.167259,7.905871
22879,Interstellar,2014,11187,8,"[Adventure, Drama, Science Fiction]",32.213481,7.897107
2843,Fight Club,1999,9678,8,[Drama],63.869599,7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,"[Adventure, Fantasy, Action]",32.070725,7.871787
292,Pulp Fiction,1994,8670,8,"[Thriller, Crime]",140.950236,7.86866
314,The Shawshank Redemption,1994,8358,8,"[Drama, Crime]",51.645403,7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,"[Adventure, Fantasy, Action]",29.324358,7.861927
351,Forrest Gump,1994,8147,8,"[Comedy, Drama, Romance]",48.307194,7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,"[Adventure, Fantasy, Action]",29.423537,7.851924


- Now, let's build charts for each genre

In [111]:
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name='genre'

In [112]:
s.head()

Unnamed: 0,genre
0,Animation
0,Comedy
0,Family
1,Adventure
1,Fantasy


In [113]:
g_md = md.drop('genres', axis=1).join(s)
#g_md.head()

In [114]:
def chart_by_genre(genre, percentile=0.85):
  df = g_md[g_md['genre'] == genre]
  vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
  vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
  C = vote_averages.mean()
  m = vote_counts.quantile(percentile)

  qualified = df[(df['vote_average'].notnull()) & (df['vote_count'].notnull()) & (df['vote_count']>=m)][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
  qualified['vote_count'] = qualified['vote_count'].astype('int')
  qualified['vote_average'] = qualified['vote_average'].astype('int')

  qualified['rating'] = qualified.apply(lambda x : ((x['vote_count']/(x['vote_count'] + m))*x['vote_average'] + (m/(x['vote_count']+m))*C), axis=1)
  qualified = qualified.sort_values('rating', ascending=False).head(250)

  return qualified

In [115]:
top_romance_movies = chart_by_genre('Romance')
top_romance_movies.head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,rating
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457024,8.565285
351,Forrest Gump,1994,8147,8,48.307194,7.971357
876,Vertigo,1958,1162,8,18.20822,7.811667
40251,Your Name.,2016,1030,8,34.461252,7.789489
883,Some Like It Hot,1959,835,8,11.845107,7.745154
1132,Cinema Paradiso,1988,834,8,14.177005,7.744878
19901,Paperman,2012,734,8,7.198633,7.713951
37863,Sing Street,2016,669,8,10.672862,7.689483
882,The Apartment,1960,498,8,11.994281,7.599317
38718,The Handmaiden,2016,453,8,16.727405,7.566166


## CONTENT BASED RECOMMENDER

In [116]:
# using smaller dataset because of the limited computer power
sm = pd.read_csv(os.path.join(movie_dir, 'links_small.csv'))
sm.shape

(9125, 3)

In [117]:
sm.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [118]:
sm = sm[sm['tmdbId'].notnull()]['tmdbId']
sm.head()

Unnamed: 0,tmdbId
0,862.0
1,8844.0
2,15602.0
3,31357.0
4,11862.0


In [119]:
md.shape

(45466, 25)

In [120]:
md = md[md['id'].apply(lambda x: str(x).isdigit())].copy()
md['id'] = md['id'].astype(int)
md.shape

(45463, 25)

In [121]:
md['id'].head()

Unnamed: 0,id
0,862
1,8844
2,15602
3,31357
4,11862


In [122]:
smd = md[md['id'].isin(sm)].copy()
smd.shape

(9099, 25)

#### 1. Recommender based on overview and tagline

In [123]:
smd['tagline'] = smd['tagline'].fillna('')
smd['overview'] = smd['overview'].fillna('')
smd['description'] = smd['tagline'] + smd['overview']
smd['description'] = smd['description'].fillna('')
# types = []
# for x in md['tagline']:
#    if type(x) not in types:
#     types.append(type(x))
# print(types)


In [124]:
smd['description']

Unnamed: 0,description
0,"Led by Woody, Andy's toys live happily in his ..."
1,Roll the dice and unleash the excitement!When ...
2,Still Yelling. Still Fighting. Still Ready for...
3,Friends are the people who let you be yourself...
4,Just When His World Is Back To Normal... He's ...
...,...
40224,A god incarnate. A city doomed.From the mind b...
40503,The band you know. The story you don't.The ban...
44821,PokÃ©mon: Spell of the UnknownWhen Molly Hale's...
44826,"All your favorite PokÃ©mon characters are back,..."


In [125]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

- TF-IDF normalizes vectors
- The linear kernel computes the dot product -> When vectors are normalized, the dot product approximates their cosine similarity
- Because of this, linear_kernel typically produces similar results to cosine_similarity, but it is more efficient for large matrices!


In [126]:
tfidf_matrix.shape

(9099, 269262)

In [127]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# result: NxN matrix, where (i,j) represents the similarity between i-th and j-th movie
cosine_sim

array([[1.        , 0.00680204, 0.        , ..., 0.        , 0.00344826,
        0.        ],
       [0.00680204, 1.        , 0.01537897, ..., 0.00356808, 0.00762316,
        0.        ],
       [0.        , 0.01537897, 1.        , ..., 0.        , 0.00288257,
        0.00473726],
       ...,
       [0.        , 0.00356808, 0.        , ..., 1.        , 0.07824314,
        0.        ],
       [0.00344826, 0.00762316, 0.00288257, ..., 0.07824314, 1.        ,
        0.        ],
       [0.        , 0.        , 0.00473726, ..., 0.        , 0.        ,
        1.        ]])

In [128]:
smd = smd.reset_index()

In [129]:
titles = smd['title']
indices = pd.Series(smd.index, titles)

- Now, for every movie, we want to list (recommend) 10 more movies that one may like

In [130]:
def recommendN(title, cosine_sim, titles, indices, N=10):
  index = indices[title]
  recommendations = sorted(enumerate(cosine_sim[index]), key=lambda x : x[1], reverse=True)[1:N+1] # 10 most similar
  rec_indices = [i[0] for i in recommendations]
  return titles.iloc[rec_indices]

In [131]:
recommendN('The Godfather', cosine_sim, titles, indices, 10)

Unnamed: 0,title
973,The Godfather: Part II
8387,The Family
3509,Made
4196,Johnny Dangerously
5667,Fury
29,Shanghai Triad
2412,American Movie
1582,The Godfather: Part III
2159,Summer of Sam
4221,8 Women


## Recommender based on genre, keywords, cast and crew

In [132]:
credits = pd.read_csv(os.path.join(movie_dir, 'credits.csv'))
keywords = pd.read_csv(os.path.join(movie_dir, 'keywords.csv'))

In [133]:
#keywords.head()
#credits.head()

In [134]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [135]:
md.shape

(45463, 25)

In [136]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [137]:
smd = md[md['id'].isin(sm)].copy()
smd.shape

(9219, 28)

1. CREW -> we will take into consideration only DIRECTOR
2. CAST -> we will take into consideration only TOP 3 actors that appear on the list

In [138]:
smd['crew'] = smd['crew'].apply(literal_eval)
smd['cast'] = smd['cast'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)

In [139]:
smd['cast_size'] = smd['cast'].apply(lambda x : len(x))
smd['crew_size'] = smd['crew'].apply(lambda x : len(x))

In [140]:
#smd['crew'][0]

In [141]:
# x - cast list
def get_director(x):
  for c in x:
    if c['job'] ==  'Director':
      return c['name']
  return np.nan
#get_director(smd['crew'][0])

In [142]:
smd['director'] = smd['crew'].apply(get_director)

In [143]:
smd['cast'] = smd['cast'].apply(lambda x : [i['name'] for i in x] if isinstance(x,list) else [])
smd['cast'] = smd['cast'].apply(lambda x : x[:3] if len(x) >=3 else x)

smd['keywords'] = smd['keywords'].apply(lambda x : [i['name'] for i in x] if isinstance(x,list) else [])

In [144]:
# stripping spaces so name and surname are "a whole"
smd['cast'] = smd['cast'].apply(lambda x : [str.lower(i.replace(" ","")) for i in x])

smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x,x])      # repeating director-s name 3 times to give it more "weight"

# smd['director'] = [str(item).lower().replace(" ", "") for item in smd['director']]
# smd['cast'] = [[str(item).lower().replace(" ", "") for item in sublist] for sublist in smd['cast']]

In [145]:
s = smd.apply(lambda x : pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [146]:
s = s.value_counts()
s[:5]

Unnamed: 0_level_0,count
keyword,Unnamed: 1_level_1
independent film,610
woman director,550
murder,399
duringcreditsstinger,327
based on novel,318


In [147]:
s = s[s>1]

In [148]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [149]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)

STEMMING -> the process of reducing inflected (or sometimes derived) words to their word stem

In [150]:
stemmer = SnowballStemmer('english')
#stemmer.stem('commentator')

In [151]:
smd['keywords'] = smd['keywords'].apply(lambda x : [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x : [str.lower(i.replace(" ", "")) for i in x])

In [152]:
smd['alldata'] = smd['director'] + smd['cast'] + smd['keywords'] + smd['genres']
smd['alldata'] = smd['alldata'].apply(lambda x : ' '.join(x))

- CountVectorizer does NOT normalize vectors!
- Therefore, cosine_similarity must be used for calculating word frequencies!


In [153]:
count = CountVectorizer(analyzer='word',min_df=0.0, stop_words='english')
count_matrix = count.fit_transform(smd['alldata'])

In [154]:
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
cosine_sim2.shape

(9219, 9219)

In [155]:
smd = smd.reset_index()
titles2 = smd['title']
indices2 = pd.Series(smd.index, index=smd['title'])

In [156]:
# we defined function before:
recommendN('The Dark Knight', cosine_sim2, titles2, indices2, N=10)

Unnamed: 0,title
7991,The Dark Knight Rises
6186,Batman Begins
6587,The Prestige
2077,Following
7608,Inception
4125,Insomnia
3373,Memento
8573,Interstellar
7619,Batman: Under the Red Hood
1122,Batman Returns


- Taking popularity into consideration as well:

In [157]:
def recommendN_improved(title, cosine_sim, titles, indices):
  idx = indices[title]
  similarities = list(enumerate(cosine_sim[idx]))
  similarities = sorted(similarities, key=lambda x : x[1], reverse=True)
  similarities = similarities[:50]

  movies_ids = [i[0] for i in similarities]
  movies = smd.iloc[movies_ids][['title', 'vote_count', 'vote_average', 'year']]

  vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
  vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
  C = vote_averages.mean()
  m = vote_counts.quantile(0.60)

  qualified = movies[(movies['vote_count'].notnull()) & (movies['vote_count'] >= m) & (movies['vote_average'].notnull())]
  qualified['vote_count'] = qualified['vote_count'].astype('int')
  qualified['vote_average'] = qualified['vote_average'].astype('int')
  qualified['imdb_rating'] = qualified.apply(imdb_rating, axis=1)

  print(movies[movies['title'] == 'Inception'])

  qualified = qualified.sort_values('imdb_rating', ascending=False)
  return qualified

In [158]:
recommendN_improved('The Dark Knight', cosine_sim2, titles2, indices2).head(10)

          title  vote_count  vote_average  year
7608  Inception     14075.0           8.1  2010


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['vote_count'] = qualified['vote_count'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['vote_average'] = qualified['vote_average'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['imdb_rating'] = qualified.apply(imdb_rating, axis=1)


Unnamed: 0,title,vote_count,vote_average,year,imdb_rating
7608,Inception,14075,8,2010,7.917588
6945,The Dark Knight,12269,8,2008,7.905871
8573,Interstellar,11187,8,2014,7.897107
6587,The Prestige,4510,8,2006,7.758148
3373,Memento,4168,8,2000,7.740175
7991,The Dark Knight Rises,9263,7,2012,6.921448
6186,Batman Begins,7511,7,2005,6.904127
524,Batman,2145,7,1989,6.704647
7395,Law Abiding Citizen,1522,7,2009,6.610575
8375,Man of Steel,6462,6,2013,5.952478


## Collaborative Filtering
- we want our recommender to recommend movies across genres (to get to know our taste)
- we want it to be PERSONAL
- we use collaborative filtering to make recommendations to SPECIFIC MOVIE WATCHERS
- idea: users similar to ME possibly watched movies that I may like (the movies I didn't watch)

In [167]:
ratings = pd.read_csv(os.path.join(movie_dir, 'ratings_small.csv'))

In [168]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [170]:
user_movie_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')

In [175]:
#user_movie_matrix

### Calculating similarties between users

In [184]:
user_similarity = cosine_similarity(user_movie_matrix.fillna(0))

user_similarity_df = pd.DataFrame(user_similarity, index=user_movie_matrix.index, columns=user_movie_matrix.index)
user_similarity_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.074482,0.016818,0.0,0.083884,0.0,0.012843,0.0,...,0.0,0.0,0.014474,0.043719,0.0,0.0,0.0,0.062917,0.0,0.017466
2,0.0,1.0,0.124295,0.118821,0.103646,0.0,0.212985,0.11319,0.113333,0.043213,...,0.477306,0.063202,0.077745,0.164162,0.466281,0.425462,0.084646,0.02414,0.170595,0.113175
3,0.0,0.124295,1.0,0.08164,0.151531,0.060691,0.154714,0.249781,0.134475,0.114672,...,0.161205,0.064198,0.176134,0.158357,0.177098,0.124562,0.124911,0.080984,0.136606,0.170193
4,0.074482,0.118821,0.08164,1.0,0.130649,0.079648,0.319745,0.191013,0.030417,0.137186,...,0.114319,0.047228,0.136579,0.25403,0.121905,0.088735,0.068483,0.104309,0.054512,0.211609
5,0.016818,0.103646,0.151531,0.130649,1.0,0.063796,0.095888,0.165712,0.086616,0.03237,...,0.191029,0.021142,0.146173,0.224245,0.139721,0.058252,0.042926,0.038358,0.062642,0.225086


In [225]:
def predict_rating(user_id, movie_id):
  movie_ratings = user_movie_matrix[movie_id]

  if not pd.isna(movie_ratings[user_id]):
    return movie_ratings[user_id]

  similarities = user_similarity_df[user_id]

  # we need users that rated that movie
  valid_users = movie_ratings.dropna().index

  similarities = similarities[valid_users]  # taking out only similarities with users that rated the movie
  ratings = movie_ratings[valid_users]

  # no one rated the movie (no valid users)
  if similarities.sum() == 0:
    return np.nan

  rating = np.dot(similarities, ratings) / similarities.sum()
  return rating

In [226]:
predict_rating(1,30)

np.float64(4.062867567652092)

In [287]:
def recommend(target_user_id, user_movie_matrix, user_similarity_df, N=5):
  similarities = user_similarity_df.loc[target_user_id]   # slicnosti naseg korisnika sa ostalima

  ratings = user_movie_matrix.copy()
  ratings = ratings.drop(index=target_user_id) # ocene svakog korisnika za svaki film BEZ naseg korisnika
  ratings = ratings.fillna(0)
  sim_scores = similarities.drop(index=target_user_id)

  weighted_ratings = np.dot(ratings.T, sim_scores) / sim_scores.sum()
  print(np.count_nonzero(~np.isnan(weighted_ratings)))

  usr_movies = user_movie_matrix.loc[target_user_id]
  unseen_movies = usr_movies[usr_movies.isna()]

  # preporuke vracaju: movie_id

  recommendations = [ (score, movie_id) for score, movie_id in zip(weighted_ratings, ratings.columns) if usr_movies[movie_id]]
  recommendations.sort(key = lambda x: x[0], reverse=True)

  return recommendations[:N]

In [289]:
recommend(640,user_movie_matrix, user_similarity_df, 10)

9066


[(np.float64(2.6943917368704136), 260),
 (np.float64(2.405026707019005), 608),
 (np.float64(2.232331266462549), 780),
 (np.float64(2.10837617544788), 1),
 (np.float64(2.0381055511386994), 296),
 (np.float64(1.9881283876534719), 356),
 (np.float64(1.9820092828798288), 318),
 (np.float64(1.8586541134653072), 32),
 (np.float64(1.8325189650896812), 593),
 (np.float64(1.7895451533497708), 648)]

In [290]:
# i hope it works :)

## Hybrid Recommender
- input: user id + movie title
