# Laboratorium 2 - collaborative filtering

## Przygotowanie

 * dataset i potrzebne biblioteki są dokładnie takie same jak na poprzednim laboratorium
 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas sklearn`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas

from sklearn.model_selection import train_test_split, KFold

In [87]:
# liczba parametrow opisujacych filmy i uzytkownikow zalezy tylko od nas
K = 5

# 5, 10, 20 -> na oko podobne wyniki
# 50 -> szybko wszedl na inf

In [88]:
# wczytujemy oceny uytkownikow i od razu dzielimy je na dwa zbiory - treningowy i testowy

all_ratings = pandas.read_csv('ml-latest-small/ratings.csv').drop(columns=['timestamp'])
train_ratings_set, test_ratings_set = train_test_split(all_ratings, test_size=0.05)
train_ratings_set

Unnamed: 0,userId,movieId,rating
33111,226,543,4.0
23880,166,377,3.0
95566,600,3174,3.0
68960,448,2034,3.0
2889,19,3358,5.0
...,...,...,...
50711,328,1032,3.0
51720,334,3996,3.5
20950,139,6059,1.5
20276,133,296,3.0


In [89]:
# inicjalizujemy macierz preferencji uzytkownikow liczbami losowymi z przedzialu [0.0, 5.0]

def initialize_users(raw_ratings, k):
    users_no = raw_ratings['userId'].unique().size
    users = pandas.DataFrame(5.0 * np.random.uniform(size=(users_no, k)), index=raw_ratings['userId'].unique(), columns=['x%s' % i for i in range(k)])
    users.sort_index(inplace=True) 
    return users_no, users

users_no, users = initialize_users(train_ratings_set, K)
users

Unnamed: 0,x0,x1,x2,x3,x4
1,2.681136,2.828714,4.278706,3.623512,2.116507
2,2.622479,2.954114,3.473317,3.125606,1.202998
3,2.380491,2.480452,2.762428,4.323306,2.059382
4,0.937536,3.607910,1.245137,0.165647,1.579126
5,4.964519,1.374548,1.210816,0.499638,0.940124
...,...,...,...,...,...
606,3.339423,2.603908,1.589616,0.993871,1.669647
607,0.616166,1.520947,0.332262,0.602983,1.656039
608,1.874116,2.665859,2.178331,1.261884,0.689883
609,4.979316,3.853527,3.808487,0.302741,0.085191


In [90]:
# inicjalizujemy macierz cech filmow liczbami losowymi z przedzialu [0.0, 1.0]

def initialize_movies(raw_ratings, k):
    movies_no = raw_ratings['movieId'].unique().size
    movies = pandas.DataFrame((1/K)* np.random.uniform(size=(movies_no, k)), index=raw_ratings['movieId'].unique(), columns=['x%s' % i for i in range(k)])
    movies.sort_index(inplace=True) 
    
    # uzupelniamy brakujace filmy - dostaja 0.0 ze wszystkich cech
    missing_movies = set(all_ratings['movieId']).difference(set(movies.index))
    for movie in missing_movies:
        movies = movies.append(pandas.DataFrame([[0.0 for _ in range(k)]], index=[movie], columns=['x%s' % i for i in range(k)]))
    movies_no = all_ratings['movieId'].unique().size
    
    return movies_no, movies

movies_no, movies = initialize_movies(train_ratings_set, K)
movies

Unnamed: 0,x0,x1,x2,x3,x4
1,0.005303,0.085775,0.148917,0.097124,0.057620
2,0.073135,0.024360,0.088759,0.098710,0.146586
3,0.171752,0.129558,0.004229,0.087997,0.102831
4,0.036258,0.002292,0.084460,0.023645,0.084376
5,0.194076,0.097679,0.007205,0.083580,0.192471
...,...,...,...,...,...
6123,0.000000,0.000000,0.000000,0.000000,0.000000
26095,0.000000,0.000000,0.000000,0.000000,0.000000
68597,0.000000,0.000000,0.000000,0.000000,0.000000
92665,0.000000,0.000000,0.000000,0.000000,0.000000


In [91]:
# za pomoca sprytnej sztuczki przeksztalcamy oceny z formatu dostarczonego przez MovieLens do uzytecznej macierzy
# zwroc uwage na to, ze czesci filmow i uzytkownikow moze brakowac po podziale datasetu na dwie czesci
#   - byc moze warto uzupelnic brakujace kolumny i wiersze

def get_ratings(raw_ratings, movies, nan=False):
    ratings = raw_ratings.pivot(*raw_ratings.columns)
    
    # brakujace elementy -> nan
    missing_movies = set(all_ratings['movieId']).difference(set(raw_ratings['movieId']))
    for movie in missing_movies:
        ratings[movie] = np.nan
    ratings = ratings.reindex(sorted(ratings.columns), axis=1)
    
    if not nan:
        ratings = ratings.fillna(0.0)

    return ratings

ratings = get_ratings(train_ratings_set, movies)
ratings

  ratings[movie] = np.nan


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Część 2. - trening modelu

In [92]:
# trenujemy model iteracyjnie, wykorzystujac gradient descent

# alpha = 0.00003 # learning speed
# delta = 100 # minimal upgrade for each step
# lambd = 0.01 # regularization weight

alpha = 0.00005 # learning speed
delta = 25 # minimal upgrade for each step
lambd = 50 # regularization weight

def calculate_user_preferences(users, movies, ratings, raw_ratings, users_no, movies_no, alpha, delta, lambd):
    total_error = 0.0
    users_model = users.copy()
    movies_model = movies.copy()
    
    counter = 0
    
    while(True):
        previous_total_error = total_error

        predicted_ratings = users_model.dot(movies_model.transpose())
        errors = np.where(ratings==0.0, pandas.DataFrame(np.zeros((users_no, movies_no))), predicted_ratings - ratings)
        users_gradient = errors.dot(movies_model)
        movies_gradient = errors.transpose().dot(users_model)
        
        # zauwaz, ze nie uzywamy biasow i nie potrzebujemy dodatkowej macierzy do regularyzacji
        #  - wystarczy, ze uzyjemy odpowiednio macierzy users_model i movies_model
        
        # musimy zaktualizowac dwa modele
        
        users_model -= alpha * (users_gradient + lambd * users_model)
        movies_model -= alpha * (movies_gradient + lambd * movies_model)

        total_error = np.sum(errors ** 2)
        print(total_error)
        progress = abs(previous_total_error - total_error)
        if progress < delta or counter > 20:
            break
        counter += 1
            
    return users_model, movies_model

users_model, movies_model = calculate_user_preferences(users, movies, ratings, train_ratings_set, users_no, movies_no, alpha, delta, lambd)

609779.8753105275
582633.2425835753
561864.7781093994
546052.0337624672
534340.8958044775
526212.5352077287
521369.954931016
519681.14788217866
521150.0856545513
525902.2663646356
534178.7097259135
546335.6908914766
562849.220391514
584324.2568343269
611509.3366855521
645317.9599433654
686858.8358303838
737478.1068997064
798818.1023873502
872899.2700942768
962235.094712018
1069994.6826296304


## Część 3. - podobieństwo elementów

In [93]:
# przygotujmy funkcje obliczajaca odleglosc cosinusowa miedzy kazda para elementow (filmow lub uzytkownikow)

def cosine_similarity(vectors):
    # przydadza nam sie dlugosci wektorow
    # poniewaz w kolejnej czesci bedziemy korzystac z masked arrays, nie mozemy uzyc najprostszej metody
    # lengths = np.linalg.norm(vectors, axis=1)
    # musimy zaimplementowac to sami
    lengths = np.sqrt(np.sum(vectors ** 2, axis=1))
            
    # podobienstwo liczymy w dwoch krokach - najpierw liczymy iloczyn skalarny kazdej pary wektorow
    dot_products = vectors.dot(vectors.transpose())
    # nastepnie dzielimy zarowno wiersze jak i kolumny przez dlugosci wektorow - przyda sie zmienna lengths oraz funkcja divide()
    similarity = (dot_products / lengths).transpose() / lengths
    return similarity

cosine_similarity(movies_model)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,26593,72165,60389,132584,129514,6123,26095,68597,92665,27134
1,1.000000,0.983329,0.956388,0.928357,0.931160,0.975890,0.981417,0.917352,0.963749,0.986964,...,0.892645,0.892639,0.892655,0.892649,0.892636,0.892656,0.892651,0.892659,0.892572,0.907322
2,0.983329,1.000000,0.979561,0.948907,0.975217,0.970880,0.957894,0.877673,0.970474,0.972583,...,0.942317,0.942312,0.942324,0.942320,0.942310,0.942325,0.942320,0.942327,0.942261,0.901107
3,0.956388,0.979561,1.000000,0.876496,0.987869,0.987851,0.926017,0.828634,0.938352,0.932900,...,0.928875,0.928870,0.928882,0.928878,0.928868,0.928883,0.928878,0.928885,0.928819,0.877513
4,0.928357,0.948907,0.876496,1.000000,0.901969,0.864042,0.902141,0.856313,0.937988,0.953026,...,0.861486,0.861481,0.861495,0.861490,0.861479,0.861495,0.861491,0.861498,0.861426,0.811742
5,0.931160,0.975217,0.987869,0.901969,1.000000,0.958578,0.885023,0.777583,0.946054,0.925763,...,0.937635,0.937631,0.937641,0.937638,0.937629,0.937642,0.937638,0.937644,0.937588,0.825445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6123,0.892656,0.942325,0.928883,0.861495,0.937642,0.887653,0.852159,0.723694,0.924996,0.871856,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.856708
26095,0.892651,0.942320,0.928878,0.861491,0.937638,0.887647,0.852154,0.723687,0.924992,0.871851,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.856705
68597,0.892659,0.942327,0.928885,0.861498,0.937644,0.887656,0.852162,0.723698,0.924998,0.871860,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.856710
92665,0.892572,0.942261,0.928819,0.861426,0.937588,0.887569,0.852074,0.723592,0.924933,0.871767,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.856663


In [94]:
# teraz mozemy znalexc k elementow najbardziej podobnych do danego

def k_most_similar(vectors, i, k):
    sim_matrix = cosine_similarity(vectors)
    # przyda sie funkcja np.argsort()
    return sim_matrix[i].argsort()[:k] # ...

k_most_similar(movies, 193587, 8)

1    2895
2    7635
3     794
4    3395
5    7843
6    6347
7    5693
8     902
Name: 193587, dtype: int64

## Część 4. - Item2Item collaborative filtering

In [95]:
# sprobujmy innego podejscia - Item2Item CF przewiduje rating tylko na podstawie macierzy ratingow, bez koniecznosci trenowania
#   dodatkowych macierzy

# zauwaz, ze nie chcemy przeprowadzac obliczen tam, gdzie brakuje nam elementow
#   - oblicz macierz ratings z parametrem nan=True oraz wykorzystaj tzw. masked arrays: np.ma.array(x, mask=np.isnan(x))
#   w ten sposob unikniesz przeprowadzania niepotrzebnych obliczen

ratings = get_ratings(train_ratings_set, movies, nan=True)

def item_to_item(ratings):
    masked_ratings = np.ma.array(ratings, mask=np.isnan(ratings))
    similarity = cosine_similarity(masked_ratings.T) # prawdopodobnie bedziesz musial zmodyfikowac te funkcje, by obslugiwala NaN
    sums = similarity.sum(axis=1)
    model = masked_ratings.dot(similarity) / sums # srednia ocen wystawionych przez uzytkownika wazona podobienstwem elementow
    return model

i2i_model = item_to_item(ratings)

  ratings[movie] = np.nan


## Część 5. - porównanie algorytmów

In [96]:
# korzystając z funkcji z poprzedniego laboratorium, porownaj dwa zaimplementowane algorytmy Collaborative Filtering
positive_threshold = 4.0
negative_threshold = 2.0

def calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold):
    
    # obliczamy true_positives itp.
    # nastepnie wszystkie 
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    
    for row in test_ratings_set.iterrows():

        userId = row[1]["userId"]
        movieId = row[1]["movieId"]
        rating = row[1]["rating"]
        pred_rating = predicted_ratings[int(movieId)][int(userId)]
        
        if 2.0 < rating < 4.0:
            continue
        
        if pred_rating <= negative_threshold:
            if rating <= negative_threshold:
                tn += 1
            else:
                fn +=1
        elif pred_rating >= positive_threshold:
            if rating >= positive_threshold:
                tp += 1
            else:
                fp += 1

    accuracy = (tp + tn) / (tp + fp + tn + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    
    try:
        f1 = (2 * recall * precision) / (recall + precision)
    except ZeroDivisionError:
        f1 = 0
    return {
        'true_positives': tp,
        'true_negatives': tn,
        'false_positives': fp,
        'false_negatives': fn,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [98]:
# # regular CF
regular_ratings = users_model.dot(movies_model.T)
print("Regular CF stats")
print(calculate_stats(test_ratings_set, regular_ratings, positive_threshold, negative_threshold))

# # Item-To-Item CF
print("Item-To-Item CF stats")
item_to_item_ratings = pandas.DataFrame(i2i_model, columns=movies.index, index=users.index)
print(calculate_stats(test_ratings_set, item_to_item_ratings, positive_threshold, negative_threshold))


Regular CF stats
{'true_positives': 277, 'true_negatives': 462, 'false_positives': 52, 'false_negatives': 1576, 'accuracy': 0.3122095479509928, 'precision': 0.8419452887537994, 'recall': 0.149487317862925, 'f1': 0.2538955087076077}
Item-To-Item CF stats
{'true_positives': 1, 'true_negatives': 663, 'false_positives': 0, 'false_negatives': 2373, 'accuracy': 0.21863681264405663, 'precision': 1.0, 'recall': 0.00042122999157540015, 'f1': 0.0008421052631578948}


In [None]:
# Wyniki
# Regular lepsze niz I2I
#regular recall 0.14948731786292
#Item to item recall 0.00042122999157540015

# random z poprzednich labow na poziomie ok. 0.271122570408568
# content based na poziomie ok. 0.3183260610868703