### Info
Implementacja na podstawie:
https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-1-knn-item-based-collaborative-filtering-637969614ea

### Importy

In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

### Wczytanie zbioru danych

In [2]:
movies = pd.read_csv("../data/movies_metadata.csv",usecols=['id','title'])

In [3]:
movies

Unnamed: 0,id,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II
...,...,...
45461,439050,Subdue
45462,111109,Century of Birthing
45463,67758,Betrayal
45464,227506,Satan Triumphant


In [4]:
movies = movies.drop([19730, 29503, 35587])
movies = movies.drop_duplicates(subset="title")
movies = movies.astype({'id': 'int32'})

In [5]:
links = pd.read_csv("../data/links.csv",usecols=['movieId','tmdbId'])

In [6]:
movies = pd.merge(movies, links, left_on='id', right_on='tmdbId')

In [7]:
movies

Unnamed: 0,id,title,movieId,tmdbId
0,862,Toy Story,1,862.0
1,8844,Jumanji,2,8844.0
2,15602,Grumpier Old Men,3,15602.0
3,31357,Waiting to Exhale,4,31357.0
4,11862,Father of the Bride Part II,5,11862.0
...,...,...,...,...
42301,222848,Caged Heat 3000,176263,222848.0
42302,439050,Subdue,176269,439050.0
42303,111109,Century of Birthing,176271,111109.0
42304,227506,Satan Triumphant,176275,227506.0


In [8]:
ratings = pd.read_csv("../data/ratings.csv", usecols=['userId','movieId','rating'])

In [9]:
ratings

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0
...,...,...,...
26024284,270896,58559,5.0
26024285,270896,60069,5.0
26024286,270896,63082,4.5
26024287,270896,64957,4.5


### Filtrowanie danych
Usunięcie nieaktywnych użytkowników i mało popularnych filmów

In [10]:
movies_cnt = pd.DataFrame(ratings.groupby('movieId').size(), columns=['count'])
users_cnt = pd.DataFrame(ratings.groupby('userId').size(), columns=['count'])

In [11]:
print(movies_cnt['count'].mean())
print(movies_cnt['count'].median())

576.8433780339134
8.0


In [12]:
print(users_cnt['count'].mean())
print(users_cnt['count'].median())

96.06745393065974
30.0


In [13]:
movies_in_metadata = list(movies['movieId'])
movies_ids = movies_cnt[movies_cnt['count']>100].reset_index()
movies_ids = list(movies_ids['movieId'])
users_ids = users_cnt[users_cnt['count']>60].reset_index()
users_ids = list(users_ids['userId'])

In [14]:
ratings_filtered = ratings[ratings['movieId'].isin(movies_in_metadata)]
ratings_filtered = ratings_filtered[ratings_filtered['movieId'].isin(movies_ids)]

In [15]:
ratings_filtered = ratings_filtered[ratings_filtered['userId'].isin(users_ids)]

In [16]:
ratings_filtered

Unnamed: 0,userId,movieId,rating
59,4,223,4.0
60,4,415,4.0
61,4,648,4.0
62,4,1097,5.0
63,4,1197,4.0
...,...,...,...
26024283,270896,56367,4.5
26024284,270896,58559,5.0
26024285,270896,60069,5.0
26024286,270896,63082,4.5


### Transfomacja danych na macierz movies x users

In [17]:
movie_features = ratings_filtered.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)

In [18]:
movie_features

userId,4,8,9,11,12,15,16,20,24,27,...,270871,270872,270877,270879,270885,270887,270892,270893,270894,270896
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,4.0,4.5,0.0,4.0,0.0,0.0,4.0,4.0,3.5,...,5.0,3.5,0.0,3.0,0.0,5.0,4.0,4.0,0.0,4.5
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,2.5,0.0,5.0,3.5,0.0,5.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0
170875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
171763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
171765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Rekomendacje KNN

In [19]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1)

In [20]:
model_knn2 = NearestNeighbors(metric='euclidean', algorithm='brute', n_neighbors=10, n_jobs=-1)

In [21]:
mapper = {}
for i, movie in enumerate(list(movies.set_index('movieId').loc[movie_features.index].title)):
    mapper[movie] = i 


In [22]:
movie_features_sparse = csr_matrix(movie_features.values)

In [23]:
def make_recommendations(fav_movie, n_recommendations, data, model, mapper):
    model.fit(data)
    idx = mapper[fav_movie]
    distances, indices = model.kneighbors(data[idx],n_neighbors=n_recommendations+1)
    raw_recommends = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),key=lambda x: x[1])[:0:-1]
    reverse_mapper = {v: k for k, v in mapper.items()}
    print('Recommendations for {}:'.format(fav_movie))
    for i, (idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance '
              'of {2}'.format(i+1, reverse_mapper[idx], dist))

In [24]:
make_recommendations(fav_movie = 'Toy Story',
                    n_recommendations = 10,
                    data = movie_features_sparse,
                    model = model_knn,
                    mapper = mapper)

Recommendations for Toy Story:
1: Raiders of the Lost Ark, with distance of 0.39478992865626994
2: The Empire Strikes Back, with distance of 0.3870329803899609
3: Aladdin, with distance of 0.38366165661276097
4: Return of the Jedi, with distance of 0.38135760775342153
5: The Lion King, with distance of 0.3808683437362044
6: Back to the Future, with distance of 0.3645097467208983
7: Toy Story 2, with distance of 0.3636113272124124
8: Jurassic Park, with distance of 0.36159997661854704
9: Star Wars, with distance of 0.35964254950651586
10: Forrest Gump, with distance of 0.35227740935897256


In [25]:
make_recommendations(fav_movie = 'Toy Story',
                    n_recommendations = 10,
                    data = movie_features_sparse,
                    model = model_knn2,
                    mapper = mapper)

Recommendations for Toy Story:
1: Beauty and the Beast, with distance of 723.8730897056472
2: Back to the Future, with distance of 722.9922198751519
3: Shrek, with distance of 720.6465846724037
4: Independence Day, with distance of 719.6255970989359
5: Men in Black, with distance of 715.8372720108949
6: Monsters, Inc., with distance of 715.5967789195254
7: A Bug's Life, with distance of 714.571375021418
8: The Lion King, with distance of 697.3632482429799
9: Aladdin, with distance of 690.6100563994127
10: Toy Story 2, with distance of 657.1571349380603
