# Laboratorium 1 - content-based recommender

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas sklearn`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas

from sklearn.model_selection import train_test_split, KFold

In [2]:
# tworzymy reprezentacje filmow jako wektorow cech - na podstawie gatunkow

genres = [
    '(no genres listed)', 
    'Action', 
    'Adventure', 
    'Animation', 
    'Children', 
    'Comedy', 
    'Crime', 
    'Documentary', 
    'Drama', 
    'Fantasy', 
    'Film-Noir', 
    'Horror', 
    'IMAX', 
    'Musical', 
    'Mystery', 
    'Romance', 
    'Sci-Fi', 
    'Thriller', 
    'War', 
    'Western'
]
genres_no = len(genres)

movies = pandas.read_csv('ml-latest-small/movies.csv')
movies_no = movies.shape[0]

movies['bias'] = 1.0
for genre in genres:
    movies[genre] = np.where(movies['genres'].str.contains(genre, regex=False), 1.0, 0.0)
    
movies = movies.drop(columns=['title', 'genres']).set_index('movieId')
movies

Unnamed: 0_level_0,bias,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# wczytujemy oceny uytkownikow i od razu dzielimy je na dwa zbiory - treningowy i testowy

all_ratings = pandas.read_csv('ml-latest-small/ratings.csv').drop(columns=['timestamp'])
train_ratings_set, test_ratings_set = train_test_split(all_ratings, test_size=0.05)
train_ratings_set

Unnamed: 0,userId,movieId,rating
78406,488,1223,4.5
32082,220,648,3.0
42878,288,4019,3.0
61713,408,160563,3.5
31601,219,913,4.5
...,...,...,...
64639,414,54256,1.5
15768,103,4881,5.0
93372,599,2334,2.5
29432,202,537,3.0


In [4]:
# inicjalizujemy macierz preferencji uzytkownikow liczbami losowymi z przedzialu [0.0, 5.0]

def initialize_users(raw_ratings):
    users_no = raw_ratings['userId'].unique().size
    users = pandas.DataFrame(5.0 * np.random.uniform(size=(users_no, genres_no+1)), index=raw_ratings['userId'].unique(), columns=['bias']+genres)
    return users_no, users

users_no, users = initialize_users(train_ratings_set)
users

Unnamed: 0,bias,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
427,1.889738,3.180737,2.599051,1.183608,1.140261,2.103706,2.890012,4.915758,3.186870,4.624745,...,1.760264,3.266167,3.216441,1.287855,3.947325,2.113970,1.435626,1.377056,3.935570,3.325519
357,1.489289,0.272802,3.408779,0.940798,1.999175,0.216589,3.972829,0.850834,2.764227,3.814690,...,4.524383,4.522930,4.378807,1.560737,4.654868,3.673750,4.088215,1.699955,0.125793,0.018443
177,1.560249,1.225888,0.436308,4.732954,0.476936,0.539614,0.618855,1.362431,0.648818,3.470221,...,3.665584,3.453128,3.778277,2.322913,0.914471,0.211495,1.190987,3.536493,4.428607,2.464198
282,0.917688,0.974414,4.189947,2.718232,4.880500,2.320363,2.304441,2.825639,0.449559,3.516685,...,3.649542,4.399842,2.069391,1.554769,4.985934,0.463658,3.813881,4.963195,1.615923,4.881246
414,3.072434,3.946296,1.958489,0.956891,2.001080,1.652357,0.015941,4.696303,2.575890,1.894580,...,1.382204,3.625066,4.847774,3.568418,1.599265,1.763447,0.881970,3.895336,1.338887,4.628302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,3.417806,0.505694,4.439568,1.447335,4.098696,4.245801,2.662133,0.491958,4.341223,1.325881,...,2.909160,1.910162,0.575130,4.009465,2.416158,3.929605,3.678713,3.110752,3.454008,2.746524
547,2.619386,1.492414,0.345166,1.059544,0.046263,2.194732,0.221412,4.438392,0.342579,0.196487,...,1.685270,0.599855,2.068183,0.241047,0.215065,2.761273,4.551661,3.613004,4.687985,0.530253
557,2.291026,2.999587,1.968188,0.910130,4.030467,2.823557,2.184802,1.776955,2.509543,3.148025,...,1.155275,1.637929,1.643671,2.651110,4.374394,2.210309,3.353277,4.089236,1.702532,3.405576
544,4.782991,1.985777,1.166136,2.437272,3.228261,0.604195,1.447623,1.755195,4.289584,2.989166,...,3.497404,3.335984,1.946167,4.873460,4.556294,0.358003,4.185561,4.658200,2.897081,1.099907


In [5]:
# za pomoca sprytnej sztuczki przeksztalcamy oceny z formatu dostarczonego przez MovieLens do uzytecznej macierzy
# zwroc uwage na to, ze czesci filmow moze brakowac po podziale datasetu na dwie czesci - musimy uzueplnic brakujace kolumny

def get_ratings(raw_ratings, movies):
    ratings = raw_ratings.pivot(*raw_ratings.columns).fillna(0.0)
    missing_movies = set(movies.index).difference(set(raw_ratings['movieId']))
    for movie in missing_movies:
        ratings[movie] = 0.0
    ratings = ratings.reindex(sorted(ratings.columns), axis=1)
    return ratings

ratings = get_ratings(train_ratings_set, movies)
ratings

  ratings[movie] = 0.0


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Część 2. - trening modelu

In [88]:
# trenujemy model iteracyjnie, wykorzystujac gradient descent

alpha = 0.0001 # learning speed
delta = 100 # minimal upgrade for each step
lambd = 0.01 # regularization weight

def calculate_user_preferences(users, movies, ratings, raw_ratings, users_no, movies_no, alpha, delta, lambd):
    total_error = 0.0
    model = users

    while(True):
        previous_total_error = total_error

        predicted_ratings = np.dot(model, movies.T)# mozemy to policzyc jako iloczyn skalarny preferencji uzytkownikow i cech filmow
        # tu stosujemy bardzo przydatna funkcje NumPy
        errors = np.where(ratings==0.0, pandas.DataFrame(np.zeros((users_no, movies_no))), predicted_ratings - ratings)
        gradient = np.dot(errors, movies)# znow iloczyn skalarny - tym razem bledow

        # tu stosujemy pewna sztuczke - rozbijamy sobie macierz z wyrazami regularyzujacymi na dwie
        # pierwsza to kolumna zlozona z zer
        regularization_k0 = pandas.DataFrame(np.zeros((users_no, 1)), index=raw_ratings['userId'].unique(), columns=['bias'])
        # druga to macierz preferencji uzytkownikow (czyli modelu) - bez pierwszej kolumny
        regularization_k = model.drop(model.columns[[0]], axis=1, inplace=False)# ...
        # teraz sklejamy obie macierze
        regularization = pandas.concat([regularization_k0, regularization_k], axis=1)

        # najwazniejszy krok - aktualizacja modelu, czyli wszystkich wag
        model = model - (alpha * (regularization + (lambd * gradient)))

        total_error = errors.sum()# suma wszystkich bledow
        print(total_error)
        progress = abs(previous_total_error - total_error)
        if progress < delta:
            break
            
    return model

# prediction_model = calculate_user_preferences(users, movies, ratings, train_ratings_set, users_no, movies_no, alpha, delta, lambd)

## Część 3. - ocena jakości algorytmu

In [21]:
# na podstawie zbioru testowego i wytrenowanego modelu obliczamy metryki opisujace jakosc modelu

positive_threshold = 4.0
negative_threshold = 2.0

def calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold):
    # obliczamy true_positives itp.
    # nastepnie wszystkie metryki
    
    true_positives = 0
    true_negatives = 0
    false_positives = 0
    false_negatives = 0
    
    for _, row in test_ratings_set.iterrows():
        
        my_prediction = predicted_ratings[row['movieId']][row['userId']]
        real_rating = row['rating']
 
        if real_rating >= positive_threshold and my_prediction >= positive_threshold:
            true_positives += 1
        elif real_rating >= positive_threshold and my_prediction <= positive_threshold:
            false_positives += 1
        elif real_rating <= negative_threshold and my_prediction <= negative_threshold:
            true_negatives += 1
        elif real_rating <= negative_threshold and my_prediction >= negative_threshold:
            false_negatives += 1
    
    accuracy = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1 = (2 * precision * recall) / (precision + recall)
        
    return {
        'true_positives': true_positives,
        'true_negatives': true_negatives,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

predicted_ratings = prediction_model.dot(movies.T)
calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold)

{'true_positives': 1972,
 'true_negatives': 21,
 'false_positives': 433,
 'false_negatives': 663,
 'accuracy': 0.6451926189705406,
 'precision': 0.81995841995842,
 'recall': 0.7483870967741936,
 'f1': 0.7825396825396826}

In [22]:
predicted_ratings = prediction_model.dot(movies.T)
calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold)

{'true_positives': 1972,
 'true_negatives': 21,
 'false_positives': 433,
 'false_negatives': 663,
 'accuracy': 0.6451926189705406,
 'precision': 0.81995841995842,
 'recall': 0.7483870967741936,
 'f1': 0.7825396825396826}

In [23]:
# dla porownania - obliczmy te same metryki dla modelu losowego
# zauwaz, w jaki sposob ponownie wykorzystujemy funkcje inicjalizujaca preferencje uzytkownikow

_, random_model = initialize_users(train_ratings_set)
random_prediction = random_model.dot(movies.T)
calculate_stats(test_ratings_set, random_prediction, positive_threshold, negative_threshold)

{'true_positives': 2190,
 'true_negatives': 13,
 'false_positives': 215,
 'false_negatives': 671,
 'accuracy': 0.7131757850437035,
 'precision': 0.9106029106029107,
 'recall': 0.7654666200629151,
 'f1': 0.8317508545385492}

## Część 4. - istotność statystyczna

In [89]:
# wielokrotnie uruchamiamy trening modelu
# za każdym razem dzielimy dataset na zbior treningowy i testowy w inny sposob - klasa KFold robi to za nas
# zwroc uwage na bardzo istotny szczegol - oba modele, wytrenowany i losowy, musza byc porownywane na tym samym zbiorze testowym

n_tests = 5
results = []
random_results = []
train_ratings_set, test_ratings_set = train_test_split(all_ratings, test_size=0.05)
parsed_ratings = get_ratings(train_ratings_set, movies)

for train, test in KFold(n_splits=n_tests, shuffle=True).split(parsed_ratings):
    # wygeneruj macierz użytkowników i ocen
#     print(train)
    
    train_data = parsed_ratings.iloc[train]
    test_data = all_ratings
    
    print(parsed_ratings.shape)
    print(train_data.shape)
    print(movies.shape)
    
#     print(train_data)
    
    _, random_model = initialize_users(all_ratings)
    random_prediction = random_model.dot(movies.T)
    
    # wytrenuj model
    _, users = initialize_users(all_ratings)
    prediction_model = calculate_user_preferences(users, movies, parsed_ratings, all_ratings, users_no, movies_no, alpha, delta, lambd)
    
    # oblicz metryki dla modelu losowego
    rand_recall = calculate_stats(test_data, random_prediction, positive_threshold, negative_threshold)['recall']
    
    # oblicz metryki dla wytrenowanego modelu
    predicted_ratings = prediction_model.dot(movies.T)
    trained_recall = calculate_stats(test_data, predicted_ratings, positive_threshold, negative_threshold)['recall']
    
    results.append(trained_recall)
    random_results.append(rand_recall)
    
    print(f"rand vs traind: {rand_recall} vs {trained_recall}")

  ratings[movie] = 0.0


(610, 9742)
(488, 9742)
(9742, 21)
540593.3067360298
539985.5906989279
539379.0813449131
538773.7752145631
538169.6688601751
537566.7588457281
536965.0417468345
536364.5141507023
535765.1726560909
535167.0138732663
534570.0344239656
533974.2309413478
533379.6000699549
532786.138465673
532193.8427956862
531602.7097384384
531012.7359835922
530423.9182319845
529836.2531955914
529249.7375974822
528664.3681717808
528080.1416636265
527497.0548291334
526915.1044353475
526334.2872602125
525754.6000925231
525176.0397318912
524598.602988704
524022.2866840839
523447.0876498516
522873.0027284838
522300.0287730798
521728.16264731553
521157.40122541174
520587.74139209185
520019.18004254525
519451.71408238565
518885.34042761923
518320.05600460136
517755.85775000346
517192.7426107708
516630.7075440871
516069.7495173403
515509.86550808005
514951.05250398535
514393.3075028222
513836.62751241546
513281.00955060206
512726.45064520463
512172.9478339872
511620.49816462386
511069.09869465855
510518.746491475

KeyboardInterrupt: 

In [None]:
for i in range(len(results)):
    rand_recall = random_results[i]
    trained_recall = results[i]
    print(f"rand vs traind: {rand_recall} vs {trained_recall}")

In [None]:
# obliczamy, w ilu probach wytrenowany model okazal sie lepszy od losowego
# przeprowadzamy test statystyczny - jak prawdopodobne jest to, by k pozytywnych prob bylo dzielem przypadku

def possibility_of_at_least_k_successes_in_n(k, n):
    p = 0.0
    # obliczamy kolejno prawdopodobienstwo k sukcesow, k+1 sukcesow, ...
    # przydadza Ci sie funkcje marh.comb() i math.pow()
    for i in range(k):
        
        math.comb(i, n) * math.pow(.5, n)
    
    return p

p = 0.05
metric = 'recall'

positive_tests_count = 0 # w ilu przypadkach okazalismy sie lepsi niz random?
for i in range(len(results)):
    if results[i] < random_results[i]:
        positive_tests_count += 1

if possibility_of_at_least_k_successes_in_n(positive_tests_count, n_tests) <= p:
    print('We are better than random!')
else:
    print('There is no evidence we are better')