# Laboratorium 1 - content-based recommender

## Przygotowanie

 * dataset i potrzebne biblioteki są dokładnie takie same jak na poprzednim laboratorium
 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas sklearn`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas

from sklearn.model_selection import train_test_split, KFold

In [2]:
# liczba parametrow opisujacych filmy i uzytkownikow zalezy tylko od nas
K = 20
#TUTAJ MODYFIKOWAĆ W CELY SPRAWDZANIA WYNIKÓW

In [3]:
# wczytujemy oceny uytkownikow i od razu dzielimy je na dwa zbiory - treningowy i testowy

all_ratings = pandas.read_csv('ml-latest-small/ratings.csv').drop(columns=['timestamp'])
train_ratings_set, test_ratings_set = train_test_split(all_ratings, test_size=0.05)
train_ratings_set

Unnamed: 0,userId,movieId,rating
75516,477,919,3.5
55738,368,2871,3.0
66339,427,5420,1.5
88777,573,2617,5.0
46511,305,103883,4.0
...,...,...,...
93270,599,1982,3.0
64057,414,5991,5.0
3610,21,116823,3.5
7440,51,317,3.5


In [4]:
# inicjalizujemy macierz preferencji uzytkownikow liczbami losowymi z przedzialu [0.0, 5.0]

def initialize_users(raw_ratings, k):
    users_no = raw_ratings['userId'].unique().size
    users = pandas.DataFrame(5.0 * np.random.uniform(size=(users_no, k)), index=raw_ratings['userId'].unique(), columns=['x%s' % i for i in range(k)])
    users.sort_index(inplace=True) 
    return users_no, users

users_no, users = initialize_users(train_ratings_set, K)
users

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19
1,1.110203,3.506261,3.570043,4.592650,0.795316,0.957287,3.193229,1.568782,2.352057,1.553285,3.841654,1.492015,2.046596,2.138152,4.946966,1.444489,4.549078,2.708099,1.145649,1.862785
2,4.941447,1.025039,2.105227,4.550648,3.474184,0.203615,3.119536,4.672571,1.242056,1.606760,1.766839,1.572183,3.095401,1.402827,3.982642,4.254800,3.191162,0.927655,0.464113,1.245644
3,2.066286,0.589144,3.725635,3.372975,0.457504,2.616824,2.736333,1.860834,0.026940,0.193504,4.833279,0.878239,3.203211,0.901519,4.200406,3.796019,0.276881,1.154998,4.872253,3.382613
4,1.004961,1.351600,4.144859,0.294723,0.594021,1.206772,0.705764,4.925939,0.833485,4.580382,3.114430,0.016390,4.816444,4.229722,4.095262,1.468925,4.647866,2.008118,2.169082,1.680015
5,2.843157,1.766735,1.039933,4.943494,4.337646,3.674298,4.654769,0.710974,1.016835,2.125444,4.499067,1.478637,2.777160,2.835105,2.402916,2.073992,1.573693,4.897942,3.592073,1.534807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,1.823779,0.290523,4.361327,0.962307,4.655948,3.143632,4.407533,2.901493,4.601859,4.859549,2.787841,3.207624,1.997344,1.237859,2.686602,1.584102,2.666881,4.987977,2.274059,3.450766
607,4.532763,1.662901,1.116353,0.022873,4.600083,2.826403,3.395346,2.247389,3.408102,4.803965,2.386625,0.623307,0.221206,3.228488,3.618873,4.200619,0.932811,1.683067,4.495032,0.638668
608,2.357573,0.455473,1.482388,4.071613,4.367014,3.717724,3.254841,4.036124,3.738765,1.489567,4.599740,2.461869,4.487241,4.013155,4.239414,3.260328,4.563572,1.409968,4.372387,0.008672
609,2.737693,3.601443,2.428878,0.468839,1.909563,4.020403,0.217905,1.640654,0.102074,3.799044,2.632416,1.836476,4.249331,3.321861,2.265967,4.624072,3.443814,2.442193,4.245260,4.544131


In [5]:
# inicjalizujemy macierz cech filmow liczbami losowymi z przedzialu [0.0, 1.0]

def initialize_movies(raw_ratings, k):
    movies_no = raw_ratings['movieId'].unique().size
    movies = pandas.DataFrame(np.random.uniform(size=(movies_no, k)), index=raw_ratings['movieId'].unique(), columns=['x%s' % i for i in range(k)])
    movies.sort_index(inplace=True) 
    return movies_no, movies

movies_no, movies = initialize_movies(train_ratings_set, K)
movies

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19
1,0.509111,0.168840,0.616883,0.629014,0.437851,0.116565,0.898884,0.434219,0.500878,0.523486,0.456090,0.678966,0.533647,0.707473,0.007022,0.167803,0.984018,0.107290,0.822900,0.365813
2,0.402838,0.952152,0.201864,0.840278,0.225320,0.625537,0.137447,0.075292,0.249091,0.955709,0.865323,0.519776,0.416911,0.236048,0.586038,0.583116,0.497088,0.612597,0.202338,0.970492
3,0.495615,0.475794,0.804292,0.360111,0.160842,0.755450,0.702569,0.693702,0.468821,0.421263,0.663497,0.006598,0.231140,0.148509,0.427071,0.619653,0.127780,0.693715,0.884941,0.890416
4,0.177794,0.162489,0.650387,0.938034,0.581193,0.773882,0.273836,0.195392,0.860543,0.371827,0.909851,0.062315,0.934245,0.285751,0.058696,0.321418,0.136055,0.076916,0.601196,0.007188
5,0.298134,0.002064,0.420755,0.097041,0.150868,0.242339,0.420634,0.080178,0.742265,0.675119,0.354033,0.929605,0.970861,0.165807,0.577783,0.433444,0.997010,0.940344,0.354269,0.762479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.500034,0.563232,0.201769,0.239441,0.252773,0.423429,0.119169,0.213732,0.569657,0.620292,0.380720,0.219737,0.374131,0.275169,0.668924,0.864361,0.847969,0.139629,0.824622,0.828228
193583,0.630747,0.551874,0.359024,0.309990,0.964696,0.902953,0.617598,0.056690,0.069684,0.493272,0.019822,0.754977,0.348875,0.753029,0.334665,0.930208,0.624456,0.948577,0.881623,0.324292
193585,0.875768,0.091784,0.890787,0.658923,0.070489,0.758358,0.474920,0.951499,0.493123,0.508530,0.044926,0.106112,0.401064,0.768965,0.702847,0.116385,0.258536,0.682288,0.076866,0.143128
193587,0.393563,0.395235,0.626296,0.340462,0.701536,0.706874,0.224995,0.827814,0.032588,0.964584,0.502820,0.393018,0.357513,0.830461,0.429142,0.436329,0.197179,0.100934,0.742497,0.919221


In [6]:
# za pomoca sprytnej sztuczki przeksztalcamy oceny z formatu dostarczonego przez MovieLens do uzytecznej macierzy
# zwroc uwage na to, ze czesci filmow i uzytkownikow moze brakowac po podziale datasetu na dwie czesci
#   - byc moze warto uzupelnic brakujace kolumny i wiersze

def get_ratings(raw_ratings, movies, nan=False):
    ratings = raw_ratings.pivot(*raw_ratings.columns)
    if not nan:
        ratings = ratings.fillna(0.0)
    # ...
    return ratings

ratings = get_ratings(train_ratings_set, movies)
ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Część 2. - trening modelu

In [17]:
# trenujemy model iteracyjnie, wykorzystujac gradient descent

alpha = 0.00003 # learning speed
delta = 500000 # minimal upgrade for each step
lambd = 0.01 # regularization weight

def calculate_user_preferences(users, movies, ratings, raw_ratings, users_no, movies_no, alpha, delta, lambd):
    total_error = 0.0
    users_model = users.copy()
    movies_model = movies.copy()
    
    while(True):
        previous_total_error = total_error

        predicted_ratings = np.dot(users_model, movies.T)# ...
        errors = np.where(ratings==0.0, pandas.DataFrame(np.zeros((users_no, movies_no))), predicted_ratings - ratings)
        users_gradient = np.dot(errors, movies)# ...
        movies_gradient = np.dot(errors.T, users)# ...
#         print(users_gradient.shape)
#         print(movies_gradient.shape)
        
        # zauwaz, ze nie uzywamy biasow i nie potrzebujemy dodatkowej macierzy do regularyzacji
        #  - wystarczy, ze uzyjemy odpowiednio macierzy users_model i movies_model
        
        # musimy zaktualizowac dwa modele
        
        users_model = users_model - alpha * (users_gradient + lambd * users_model)# ...
        movies_model = movies_model - alpha * (movies_model + lambd * movies_model)# ...

        total_error = np.sum(errors ** 2)
        print(total_error)
        progress = abs(previous_total_error - total_error)
        if progress < delta:
            break
            
    return users_model, movies_model

users_model, movies_model = calculate_user_preferences(users, movies, ratings, train_ratings_set, users_no, movies_no, alpha, delta, lambd)

47525643.041930884
39339970.346074216
33953589.07227361
30086944.97015732
27127243.00814314
24757892.07980353
22801129.2700446
21148673.383626867
19729683.921212487
18495071.595952433
17409289.572067183
16445730.50591377
15583969.018487269
14808010.952387568
14105127.422757868
13465048.420888186
12879388.41814124
12341227.643098246
11844801.103357602


## Część 3. - podobieństwo elementów

In [21]:
# przygotujmy funkcje obliczajaca odleglosc cosinusowa miedzy kazda para elementow (filmow lub uzytkownikow)
def my_norm(vectors):
    

def cosine_similarity(vectors):
    # przydadza nam sie dlugosci wektorow
    vectors = np.ma.array(vectors, mask=np.isnan(vectors)) #TO JEST CHYBA POTRZEBNE TYLKO W NASTĘPNYM KROKU
    lengths = np.linalg.norm(vectors, axis=1)
    # podobienstwo liczymy w dwoch krokach - najpierw liczymy iloczyn skalarny kazdej pary wektorow
    dot_products = vectors.dot(vectors.T)# ...
    # nastepnie dzielimy zarowno wiersze jak i kolumny przez dlugosci wektorow - przyda sie zmienna lengths oraz funkcja divide()
    similarity = dot_products.divide(lengths) # ...
    similarity = similarity.divide(lengths.T)
    return similarity

cosine_similarity(movies_model)

MemoryError: Unable to allocate array with shape (9544, 9544) and data type float64

In [20]:
# teraz mozemy znalexc k elementow najbardziej podobnych do danego

def k_most_similar(vectors, i, k):
    sim_matrix = cosine_similarity(vectors)
    ith = sim_matrix[i]
    # przyda sie funkcja np.argsort()
    return vectors.argsort(ith)[:k]# ...

k_most_similar(movies, 193587, 8)

MemoryError: Unable to allocate array with shape (9544, 9544) and data type float64

## Część 4. - Item2Item collaborative filtering

In [None]:
# sprobujmy innego podejscia - Item2Item CF przewiduje rating tylko na podstawie macierzy ratingow, bez koniecznosci trenowania
#   dodatkowych macierzy

# zauwaz, ze nie chcemy przeprowadzac obliczen tam, gdzie brakuje nam elementow
#   - oblicz macierz ratings z parametrem nan=True oraz wykorzystaj tzw. masked arrays: np.ma.array(x, mask=np.isnan(x))
#   w ten sposob unikniesz przeprowadzania niepotrzebnych obliczen

def item_to_item(ratings):
    similarity = cosine_similarity(ratings.T) # prawdopodobnie bedziesz musial zmodyfikowac te funkcje, by obslugiwala NaN
    sums = similarity.sum(axis=1)
    model = # srednia ocen wystawionych przez uzytkownika wazona podobienstwem elementow
    return model

item_to_item(ratings)

## Część 5. - porównanie algorytmów

In [None]:
# korzystając z funkcji z poprzedniego laboratorium, porownaj dwa zaimplementowane algorytmy Collaborative Filtering