# Laboratorium 1 - content-based recommender

## Przygotowanie

 * dataset i potrzebne biblioteki są dokładnie takie same jak na poprzednim laboratorium
 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas sklearn`

## Część 1. - przygotowanie danych

In [119]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas

from sklearn.model_selection import train_test_split, KFold

In [120]:
# liczba parametrow opisujacych filmy i uzytkownikow zalezy tylko od nas
K = 20

In [121]:
# wczytujemy oceny uytkownikow i od razu dzielimy je na dwa zbiory - treningowy i testowy

all_ratings = pandas.read_csv('ml-latest-small/ratings.csv').drop(columns=['timestamp'])
train_ratings_set, test_ratings_set = train_test_split(all_ratings, test_size=0.05)
train_ratings_set

Unnamed: 0,userId,movieId,rating
58530,381,56152,3.5
97726,606,2185,4.0
70707,452,1391,5.0
34032,232,3175,2.5
28806,199,3196,3.5
...,...,...,...
90378,587,2657,5.0
74828,474,7705,4.0
35476,239,4878,4.0
34065,232,3996,3.0


In [122]:
set(test_ratings_set['movieId'].unique()).difference(set(train_ratings_set['movieId'].unique())) & set([98799])

set()

In [123]:
# inicjalizujemy macierz preferencji uzytkownikow liczbami losowymi z przedzialu [0.0, 5.0]

def initialize_users(raw_ratings, k):
    users_no = raw_ratings['userId'].unique().size
    users = pandas.DataFrame(5.0 * np.random.uniform(size=(users_no, k)), index=raw_ratings['userId'].unique(), columns=['x%s' % i for i in range(k)])
    users.sort_index(inplace=True) 
    return users_no, users

users_no, users = initialize_users(train_ratings_set, K)
users

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19
1,3.290953,0.943332,3.146066,4.840727,1.365264,3.510808,1.692336,3.095215,3.688117,3.486697,1.085633,4.042594,2.218337,0.240993,0.328723,2.252313,3.797899,4.217882,1.620195,3.772189
2,2.577823,1.773388,1.445445,4.300941,3.903452,4.208660,1.715408,3.840838,3.081004,0.041352,2.262916,3.555003,2.555756,1.915999,1.492451,4.184040,2.473044,4.715696,3.107147,2.996515
3,1.603238,2.545613,3.474526,3.276454,2.909443,2.740062,3.650175,4.599466,1.620507,4.702483,0.592931,1.676097,3.735282,2.562684,3.853383,1.510975,0.409861,3.393826,4.596885,2.283227
4,0.450983,0.524034,0.640783,0.276169,1.664272,2.486729,3.563770,3.856075,1.904131,0.020136,4.094451,4.551833,0.455188,4.239314,0.876103,4.362429,2.939871,4.579321,3.636871,0.281918
5,0.180389,2.426640,2.945130,0.654298,4.528143,4.864087,4.084980,0.208703,4.655832,0.647520,4.451415,0.431482,2.256987,4.828672,0.911056,4.328123,4.749602,2.182679,3.626298,4.949345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.935934,4.553985,0.342604,3.266845,1.792063,1.731323,2.319953,2.092321,3.742470,3.227404,0.104035,3.683057,0.412661,1.370229,2.958453,4.955219,4.151665,1.105949,2.902393,0.421483
607,1.839309,2.283030,2.836800,3.858079,1.614724,2.280971,3.356532,2.138163,3.133442,4.458120,0.442425,1.521948,1.001306,4.921645,2.707754,2.683914,0.418018,0.248963,0.064070,1.944704
608,4.403741,4.947297,4.285582,3.254334,0.995466,0.606918,3.132096,0.091975,3.028414,2.235336,4.770630,0.849688,2.551856,0.951124,4.612867,1.858919,1.998412,0.559514,4.542094,3.430915
609,0.134515,0.930073,4.367993,2.739648,4.535291,2.517579,3.392114,1.479483,2.867329,1.678044,0.864835,0.408174,1.856508,3.220445,1.966261,3.672857,1.586816,3.071932,1.906420,4.823717


In [124]:
# inicjalizujemy macierz cech filmow liczbami losowymi z przedzialu [0.0, 1.0]

def initialize_movies(raw_ratings, k):
    mindex = raw_ratings['movieId'].unique()
#     mindex = all_ratings['movieId'].unique()
#     movies_no = raw_ratings['movieId'].unique().size
    movies_no = mindex.size
    # mozna normalizowac losujac do 1/k
    movies = pandas.DataFrame(np.random.uniform(size=(movies_no, k)), index=mindex, columns=['x%s' % i for i in range(k)])
    movies.sort_index(inplace=True) 
    return movies_no, movies

movies_no, movies = initialize_movies(train_ratings_set, K)
movies

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19
1,0.570957,0.180451,0.696759,0.220492,0.119589,0.138902,0.347900,0.384067,0.604889,0.777392,0.751456,0.720586,0.675588,0.249173,0.120434,0.869850,0.418164,0.880996,0.505842,0.729964
2,0.738837,0.997355,0.609830,0.762110,0.003320,0.940253,0.329013,0.674271,0.748464,0.386406,0.311553,0.899467,0.662808,0.132959,0.963907,0.230297,0.665053,0.314807,0.555973,0.919928
3,0.661827,0.063957,0.814931,0.884019,0.233205,0.330474,0.786504,0.484869,0.568068,0.753295,0.869107,0.449886,0.662844,0.818301,0.392019,0.719303,0.722708,0.454245,0.329415,0.789731
4,0.799299,0.479559,0.844165,0.283989,0.583537,0.982074,0.573188,0.900308,0.120905,0.015160,0.098789,0.384137,0.493389,0.918538,0.236078,0.182390,0.371034,0.037979,0.153142,0.547967
5,0.028196,0.597898,0.034411,0.767374,0.608288,0.532358,0.960356,0.569834,0.279866,0.025042,0.962460,0.728181,0.982693,0.395789,0.282982,0.985462,0.521795,0.934136,0.599734,0.515591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.788204,0.203704,0.105402,0.371549,0.130471,0.032736,0.756532,0.689792,0.510340,0.627449,0.663242,0.468096,0.445710,0.133844,0.940430,0.699009,0.452130,0.265140,0.954288,0.485178
193583,0.142252,0.797670,0.896286,0.860846,0.892723,0.322560,0.055135,0.305202,0.017612,0.453345,0.128716,0.087652,0.180691,0.359712,0.367848,0.576432,0.058577,0.809758,0.850645,0.275539
193585,0.607503,0.852520,0.426982,0.160238,0.625938,0.269299,0.913249,0.953164,0.806950,0.832420,0.827156,0.245468,0.807085,0.677337,0.014899,0.055695,0.262828,0.600293,0.779880,0.654407
193587,0.222721,0.684292,0.924480,0.729403,0.394554,0.736743,0.211665,0.721177,0.953302,0.249359,0.616108,0.290553,0.330947,0.920939,0.366585,0.620254,0.606278,0.777194,0.002660,0.397664


In [125]:
# za pomoca sprytnej sztuczki przeksztalcamy oceny z formatu dostarczonego przez MovieLens do uzytecznej macierzy
# zwroc uwage na to, ze czesci filmow i uzytkownikow moze brakowac po podziale datasetu na dwie czesci
#   - byc moze warto uzupelnic brakujace kolumny i wiersze

def get_ratings(raw_ratings, movies, nan=False):
    ratings = raw_ratings.pivot(*raw_ratings.columns)
    if not nan:
        ratings = ratings.fillna(0.0)
    
    all_movies = movies.index
#     all_movies = all_ratings['movieId'].unique()
    missing_movies = set(all_movies).difference(set(raw_ratings['movieId']))
    for movie in missing_movies:
        ratings[movie] = 0.0
        

    all_users = all_ratings['userId'].unique()
    missing_users = set(all_users).difference(set(raw_ratings['userId']))
    for user in missing_users:
        ratings.loc[user] = np.zeros(ratings.shape[1])
        
#     ratings = ratings.reindex(sorted(ratings.columns), axis=1)
    ratings = ratings.sort_index(axis=0)
    ratings = ratings.sort_index(axis=1)

    return ratings

ratings = get_ratings(train_ratings_set, movies)
ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Część 2. - trening modelu

In [126]:
# trenujemy model iteracyjnie, wykorzystujac gradient descent

# alpha = 0.00003 # learning speed
alpha = 0.0001
delta = 100 # minimal upgrade for each step
lambd = 0.01 # regularization weight

def calculate_user_preferences(users, movies, ratings, raw_ratings, users_no, movies_no, alpha, delta, lambd):
    total_error = 0.0
    users_model = users.copy()
    movies_model = movies.copy()
    
    while(True):
        previous_total_error = total_error

        predicted_ratings = users_model @ movies.T
        errors = np.where(ratings==0.0, pandas.DataFrame(np.zeros((users_no, movies_no))), predicted_ratings - ratings)
        users_gradient = np.dot(errors, movies)
        movies_gradient = np.dot(errors.T, users)
        
        # zauwaz, ze nie uzywamy biasow i nie potrzebujemy dodatkowej macierzy do regularyzacji
        #  - wystarczy, ze uzyjemy odpowiednio macierzy users_model i movies_model
        
        # musimy zaktualizowac dwa modele
        users_model = users_model - alpha * (users_gradient + lambd * users_model)  
        movies_model = movies_model - alpha * (movies_gradient + lambd * movies_model)

        total_error = np.sum(errors ** 2)
        progress = abs(previous_total_error - total_error)
        print(total_error, progress)
        if progress < delta:
            break
            
    return users_model, movies_model

users_model, movies_model = calculate_user_preferences(users, movies, ratings, train_ratings_set, users_no, movies_no, alpha, delta, lambd)

46691338.807534024 46691338.807534024
27867100.620618492 18824238.18691553
21012239.324261595 6854861.296356898
17038750.704271786 3973488.6199898086
14353404.20302935 2685346.5012424365
12390356.446793433 1963047.7562359162
10882761.871702226 1507594.5750912074
9684243.499502469 1198518.372199757
8706517.89009571 977725.6094067581
7892704.575640987 813813.3144547241
7204302.970533287 688401.6051076995
6614223.790987114 590079.1795461727
6102796.1707181 511427.6202690145
5655355.477674343 447440.69304375723
5260722.7551004905 394632.72257385217
4910211.727011692 350511.02808879875
4596960.459549613 313251.2674620785
4315469.557495506 281490.90205410775
4061275.587271433 254193.9702240727
3830715.3152043084 230560.2720671245
3620752.335958339 209962.97924596956
3428847.4495695787 191904.88638876006
3252860.2900638124 175987.15950576635
3090973.6585058663 161886.6315579461
2941634.606653844 149339.0518520223
2803508.053414928 138126.5532389162
2675439.8994218195 128068.15399310831
255642

204628.75574370477 489.0435412650113
204145.44653382737 483.3092098773923
203667.7580505371 477.68848329028697
203195.57945596115 472.17859457593295
202728.8026039234 466.77685203775764
202267.3219669473 461.4806369761063
201811.03456541066 456.2874015366251
201359.83989880708 451.1946666035801
200913.63987901877 446.20001978831715
200472.33876555617 441.30111346259946
200035.84310269213 436.4956628640357
199604.06165843518 431.78144425695064
199176.90536527595 427.15629315923434
198754.28726266243 422.6181026135164
198336.12244113814 418.16482152428944
197922.32798809686 413.79445304127876
197512.82293510096 409.50505299589713
197107.52820671865 405.29472838231595
196706.36657082412 401.1616358945321
196309.2625903238 397.10398050030926
195916.14257626137 393.12001406244235
195526.9345422538 389.2080340075772
195141.5681602287 385.36638202509494
194759.9747174098 381.5934428189066
194382.08707452178 377.88764288800303
194007.8396251744 374.247449347371
193637.16825638665 370.671368787

153142.60409712058 126.30242928344524
153016.7493767989 125.85472032168764
152891.33970676284 125.40967003605329
152766.37245417995 124.9672525828937
152641.84501168886 124.52744249108946
152517.75479703207 124.09021465678234
152394.09925269676 123.6555443353136
152270.8758455614 123.22340713537415
152148.08206654832 122.79377901306725
152025.71543028418 122.36663626413792
151903.77347476553 121.94195551864686
151782.25376103 121.51971373552806
151661.15387283446 121.09988819554565
151540.4714163387 120.6824564957642
151420.2040197943 120.26739654439734
151300.34933323954 119.85468655475415
151180.90502819963 119.44430503991316
151061.8687973922 119.03623080742545
150943.23835443874 118.63044295346481
150825.01143357932 118.22692085942253
150707.1857893949 117.8256441844278
150589.75919653208 117.4265928628156
150472.7294494341 117.02974709798582
150356.09436207675 116.63508735733922
150239.8517677083 116.24259436846478
150123.9995185936 115.85224911468686
150008.53548576275 115.464032

## Część 3. - podobieństwo elementów

In [127]:
# przygotujmy funkcje obliczajaca odleglosc cosinusowa miedzy kazda para elementow (filmow lub uzytkownikow)

def cosine_similarity(vectors):
    # przydadza nam sie dlugosci wektorow
    lengths = np.linalg.norm(vectors, axis=1)
    # podobienstwo liczymy w dwoch krokach - najpierw liczymy iloczyn skalarny kazdej pary wektorow
    dot_products = vectors @ vectors.T
    # nastepnie dzielimy zarowno wiersze jak i kolumny przez dlugosci wektorow - przyda sie zmienna lengths oraz funkcja divide()
    similarity = np.transpose(dot_products / lengths) / lengths
    return similarity

cosine_similarity(movies_model)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,1.000000,0.960058,0.964115,0.585272,0.963867,-0.338813,0.948075,-0.624839,0.911562,0.962186,...,-0.562059,-0.727940,-0.888138,-0.744150,-0.887001,-0.580090,-0.541047,-0.753801,-0.870257,-0.951104
2,0.960058,1.000000,0.991864,0.496619,0.984782,-0.326373,0.974190,-0.647126,0.892085,0.982024,...,-0.557559,-0.770315,-0.916649,-0.738001,-0.861897,-0.540675,-0.528388,-0.710258,-0.879272,-0.959452
3,0.964115,0.991864,1.000000,0.501903,0.979968,-0.307500,0.967603,-0.607075,0.864756,0.985482,...,-0.578516,-0.764610,-0.909807,-0.740626,-0.868306,-0.518320,-0.563397,-0.727367,-0.897135,-0.953848
4,0.585272,0.496619,0.501903,1.000000,0.494308,-0.016827,0.512056,-0.275331,0.569360,0.540447,...,-0.145505,-0.242437,-0.424417,-0.618788,-0.503275,-0.438777,-0.255725,-0.341248,-0.336305,-0.454862
5,0.963867,0.984782,0.979968,0.494308,1.000000,-0.422324,0.963692,-0.614019,0.916490,0.961853,...,-0.568366,-0.771540,-0.917168,-0.711696,-0.872789,-0.550691,-0.588983,-0.694713,-0.884905,-0.958631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,-0.580090,-0.540675,-0.518320,-0.438777,-0.550691,0.214912,-0.530885,0.314857,-0.528632,-0.558344,...,0.554173,0.410528,0.499893,0.489839,0.410689,1.000000,0.162900,0.555180,0.328060,0.482069
193583,-0.541047,-0.528388,-0.563397,-0.255725,-0.588983,0.421344,-0.580232,0.180933,-0.434972,-0.539383,...,0.155808,0.684913,0.538477,0.536988,0.615178,0.162900,1.000000,0.421467,0.594695,0.554895
193585,-0.753801,-0.710258,-0.727367,-0.341248,-0.694713,0.329361,-0.689944,0.372993,-0.596543,-0.736081,...,0.676288,0.454160,0.691411,0.612275,0.703194,0.555180,0.421467,1.000000,0.648256,0.675244
193587,-0.870257,-0.879272,-0.897135,-0.336305,-0.884905,0.280299,-0.847017,0.533595,-0.807746,-0.887852,...,0.526501,0.781690,0.885228,0.676503,0.820933,0.328060,0.594695,0.648256,1.000000,0.856214


In [128]:
# teraz mozemy znalexc k elementow najbardziej podobnych do danego

def k_most_similar(vectors, i, k):
    sim_matrix = cosine_similarity(vectors)
    # przyda sie funkcja np.argsort()
    return np.argsort(sim_matrix.loc[i])[:k].values

k_most_similar(movies, 193587, 8)

array([4974, 3801, 6537, 2328,  162, 4859, 1869, 5751])

## Część 4. - Item2Item collaborative filtering

In [129]:
# sprobujmy innego podejscia - Item2Item CF przewiduje rating tylko na podstawie macierzy ratingow, bez koniecznosci trenowania
#   dodatkowych macierzy

# zauwaz, ze nie chcemy przeprowadzac obliczen tam, gdzie brakuje nam elementow
#   - oblicz macierz ratings z parametrem nan=True oraz wykorzystaj tzw. masked arrays: np.ma.array(x, mask=np.isnan(x))
#   w ten sposob unikniesz przeprowadzania niepotrzebnych obliczen

def item_to_item(ratings):
    # prawdopodobnie bedziesz musial zmodyfikowac te funkcje, by obslugiwala NaN
    similarity = cosine_similarity(ratings.T) 
    sums = similarity.sum(axis=1)
    
     #ratings = np.ma.array(ratings, mask=np.isnan(ratings))
    # srednia ocen wystawionych przez uzytkownika wazona podobienstwem elementow
    
    model = ratings.dot(similarity).divide(sums)
    return model

item_to_item(ratings)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.296044,0.254208,0.319859,0.284519,0.238145,0.292425,0.219167,0.186022,0.255090,0.306938,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.159267
2,0.025831,0.023364,0.014208,0.007153,0.019676,0.022096,0.011324,0.014139,0.010523,0.022072,...,0.055730,0.055730,0.055730,0.055730,0.055730,0.055730,0.055730,0.055730,0.055730,0.274218
3,0.009984,0.009360,0.011675,0.008139,0.008489,0.011022,0.008017,0.006101,0.013082,0.011490,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.173532,0.133644,0.144554,0.251550,0.143166,0.161601,0.156536,0.104840,0.114791,0.157021,...,0.009030,0.009030,0.009030,0.009030,0.009030,0.009030,0.009030,0.009030,0.009030,0.183653
5,0.055987,0.051481,0.049677,0.164058,0.061964,0.049937,0.059746,0.043077,0.050864,0.061917,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.051792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.691576,0.562769,0.543472,0.796508,0.559745,0.602273,0.844671,0.413505,0.391974,0.598182,...,0.046070,0.046070,0.046070,0.046070,0.046070,0.046070,0.046070,0.046070,0.046070,0.958893
607,0.210275,0.177238,0.190465,0.244820,0.164589,0.193567,0.160739,0.130252,0.199921,0.217195,...,0.004067,0.004067,0.004067,0.004067,0.004067,0.004067,0.004067,0.004067,0.004067,0.091385
608,0.655602,0.610093,0.649831,0.554528,0.561392,0.631814,0.515927,0.487175,0.553167,0.757235,...,0.034838,0.034838,0.034838,0.034838,0.034838,0.034838,0.034838,0.034838,0.034838,1.430367
609,0.036421,0.031706,0.032398,0.066228,0.037085,0.031190,0.036241,0.025250,0.051979,0.047855,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.037692


## Część 5. - porównanie algorytmów

In [130]:
# korzystając z funkcji z poprzedniego laboratorium, porownaj dwa zaimplementowane algorytmy Collaborative Filtering
# na podstawie zbioru testowego i wytrenowanego modelu obliczamy metryki opisujace jakosc modelu

positive_threshold = 4.0
negative_threshold = 2.0

def calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold):
    # obliczamy true_positives itp.
    # nastepnie wszystkie metryki

    ratings = test_ratings_set.set_index(['userId', 'movieId'])
    predicted = predicted_ratings.stack().to_frame().loc[ratings.index.values].values.T[0]
    ratings = ratings.values.T[0]

    ratings_positive = np.where(ratings > positive_threshold,1,0)
    predicted_positive = np.where(predicted > positive_threshold,1,0)
    a = ratings_positive + predicted_positive
    true_positives = np.sum(a==2)
    
    ratings_negative = np.where(ratings < negative_threshold,1,0)
    predicted_negative = np.where(predicted < negative_threshold,1,0)
    a = ratings_negative + predicted_negative
    true_negatives = np.sum(a==2)
    
    ratings_positive = np.where(ratings > positive_threshold,-1,0)
    predicted_positive = np.where(predicted > positive_threshold,1,0)
    a = ratings_positive + predicted_positive
    false_positives = np.sum(a==1)
    
    ratings_negative = np.where(ratings < negative_threshold,-1,0)
    predicted_negative = np.where(predicted < negative_threshold,1,0)
    a = ratings_negative + predicted_negative
    false_negatives = np.sum(a==1)

    
    accuracy = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
    
    precision = true_positives / (true_positives + false_positives)
    
    recall = true_positives / (true_positives + false_negatives)
    
    f1 = 2 * recall * precision / (recall + precision)

    return {
        'true_positives': true_positives,
        'true_negatives': true_negatives,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [106]:
predicted_ratings =  users_model.dot(movies_model.T)
predicted_ratings.columns.name = 'movieId'

calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold)

{'true_positives': 315,
 'true_negatives': 236,
 'false_positives': 808,
 'false_negatives': 3154,
 'accuracy': 0.12209173498781299,
 'precision': 0.2804986642920748,
 'recall': 0.09080426635918132,
 'f1': 0.1371951219512195}

In [137]:
predicted_ratings =  users_model.dot(movies_model.T)
predicted_ratings.columns.name = 'movieId'

stats = calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold)

KeyError: '[(599, 1794), (525, 159849), (365, 113604), (103, 77427), (380, 179119), (474, 6434), (89, 37444), (610, 103219), (489, 86000), (387, 6748), (606, 5929), (610, 70208), (448, 37211), (89, 122092), (599, 5111), (474, 8809), (448, 4156), (509, 80834), (414, 173205), (380, 103233), (125, 84952), (474, 25999), (18, 73499), (318, 127172), (448, 103865), (224, 3496), (599, 6267), (448, 95508), (387, 8196), (456, 706), (594, 3353), (105, 147326), (380, 169982), (599, 34800), (19, 2614), (448, 100326), (414, 5328), (23, 6234), (50, 103171), (474, 6667), (160, 3357), (599, 123310), (232, 6186), (477, 78746), (414, 6884), (483, 27370), (599, 86864), (318, 3736), (563, 96975), (567, 104337), (479, 5448), (448, 71466), (184, 162968), (292, 80839), (599, 7125), (483, 31617), (474, 959), (599, 49110), (318, 26073), (599, 44243), (249, 99750), (6, 336), (105, 75341), (599, 77201), (509, 117133), (474, 6477), (603, 3459), (232, 33148), (10, 77841), (599, 158388), (571, 2614), (599, 8809), (448, 102165), (599, 5198), (474, 7049), (474, 35015), (246, 27320), (514, 8894), (90, 1144), (432, 68347), (89, 76143), (525, 134248), (21, 95738), (153, 101973), (606, 7292), (599, 4568), (68, 60363), (522, 27369), (217, 3132), (358, 85334), (448, 118354), (414, 7176), (182, 3185), (318, 184245), (318, 4825), (184, 148978), (388, 63239), (596, 132454), (448, 97870), (448, 107962), (382, 158254), (111, 134861), (448, 151745), (382, 72696), (123, 143458), (50, 63312), (89, 3315), (448, 93242), (140, 5699), (356, 58492), (571, 1980), (483, 100068), (89, 144222), (89, 133867), (474, 8481), (217, 3389), (517, 96471), (89, 116738), (105, 120625), (318, 90528), (603, 3185), (599, 73808), (474, 982), (567, 114396), (603, 1053), (474, 5799), (19, 1679), (140, 3185), (307, 8402), (606, 6237), (474, 3989), (309, 6706), (474, 5375), (387, 4064), (414, 6493), (606, 30890), (182, 5615), (90, 1137), (111, 170817), (474, 4204), (111, 121715), (610, 76091), (105, 116411), (275, 5745), (599, 113849), (599, 26630), (414, 4599), (599, 110541), (89, 105746), (606, 51314), (306, 175199), (599, 8934), (484, 71268), (504, 6927), (387, 26303), (462, 3989), (606, 26985), (448, 132333), (514, 135216), (474, 2930), (28, 6884), (95, 1659), (610, 98633), (253, 44633), (603, 2544), (351, 88179), (474, 7479), (318, 152065), (91, 4599), (408, 136540)] not in index'

In [107]:
predicted_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,-313.511061,-115.319332,-32.848233,-12.053528,-111.883762,33.266415,-74.939142,-4.259636,-28.447488,-69.588296,...,3.363065,3.234951,5.647180,4.824862,6.093885,3.835807,0.624300,6.175862,1.051344,6.333750
2,-270.106735,-95.954518,-28.557920,-9.354752,-91.606390,26.925996,-73.740994,-4.894291,-28.830322,-94.179157,...,1.861487,1.213136,4.583047,2.974135,4.944938,1.745103,3.207310,2.518175,3.659012,6.015885
3,-214.007113,-77.782535,-21.621666,-8.943841,-76.884966,14.548691,-65.736951,-4.857932,-16.575097,-77.440677,...,1.323061,2.030750,2.674160,2.658592,4.299636,2.008931,-0.552228,7.753978,6.187365,4.111890
4,-280.535204,-90.142149,-26.061150,-11.251198,-85.798878,27.142402,-63.293721,-3.937578,-21.772538,-62.047139,...,1.651559,3.912758,5.038369,4.712360,4.311154,4.139352,0.036901,4.026768,-0.593416,4.173800
5,-228.650245,-86.259351,-30.814354,-6.912637,-108.392641,32.054743,-67.018862,-3.146045,-23.288713,-59.957454,...,0.304288,0.696228,4.866391,4.348686,7.483264,2.241410,-0.505067,4.373325,1.884319,7.039534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-261.741786,-93.374843,-28.239839,-9.232232,-94.111693,25.100961,-66.312998,-4.288254,-22.408004,-70.062232,...,2.229303,2.113384,4.975079,3.621134,4.883956,3.021959,0.677785,4.831008,1.873497,4.992035
607,-290.517516,-99.088070,-30.957262,-10.693512,-90.290660,18.539355,-69.553051,-4.610165,-19.570216,-69.032581,...,2.985360,2.574719,5.646435,4.641759,5.023792,4.070556,-0.023496,5.001203,1.732582,5.596413
608,-225.012772,-80.249691,-22.609608,-7.832610,-78.324288,19.330494,-55.936835,-3.472270,-20.022568,-59.031422,...,2.082873,1.965988,4.122502,3.042572,4.193367,2.607652,0.803092,4.017692,1.782149,4.342630
609,-206.084566,-80.640907,-27.997077,-8.172692,-81.748955,15.350393,-64.730074,-3.253619,-23.541689,-83.384302,...,1.557046,1.871034,5.816949,2.611439,4.451742,2.613570,0.658507,0.223725,2.632748,3.518496


In [None]:
calculate_stats(test_ratings_set, item_to_item(ratings), positive_threshold, negative_threshold)

In [None]:
item_to_item(ratings)

#### Istotność statystyczna

In [None]:
# wielokrotnie uruchamiamy trening modelu
# za każdym razem dzielimy dataset na zbior treningowy i testowy w inny sposob - klasa KFold robi to za nas
# zwroc uwage na bardzo istotny szczegol - oba modele, wytrenowany i losowy, musza byc porownywane na tym samym zbiorze testowym

n_tests = 5
results = []
random_results = []
raw_ratings = all_ratings

for train, test in KFold(n_splits=n_tests, shuffle=True).split(raw_ratings):
    test_set = raw_ratings.iloc[test]
    train_set = raw_ratings.iloc[train]
    
    # wygeneruj macierz użytkowników i ocen
    _, users = initialize_users(raw_ratings)
    ratings = get_ratings(train_set, movies)
    
    # wytrenuj model
    model = calculate_user_preferences(users, movies, ratings, train_set, users_no, movies_no, alpha, delta, lambd)
    
    # oblicz metryki dla wytrenowanego modelu
    predicted_ratings = model.dot(movies.T)
    stats_model = calculate_stats(test_set, predicted_ratings, positive_threshold, negative_threshold)
    results.append(stats_model)
    
    # oblicz metryki dla modelu losowego
    _, random_model = initialize_users(train_set)
    random_prediction = random_model.dot(movies.T)
    stats_random = calculate_stats(test_set, random_prediction, positive_threshold, negative_threshold)
    random_results.append(stats_random)

In [None]:
# obliczamy, w ilu probach wytrenowany model okazal sie lepszy od losowego
# przeprowadzamy test statystyczny - jak prawdopodobne jest to, by k pozytywnych prob bylo dzielem przypadku

def possibility_of_at_least_k_successes_in_n(k, n):
    p = 0.0
    # obliczamy kolejno prawdopodobienstwo k sukcesow, k+1 sukcesow, ...
    # przydadza Ci sie funkcje marh.comb() i math.pow()
    for i in range(k, n+1):
        p += math.comb(n, k) * math.pow(1/2, n)
    return p

p = 0.05
metric = 'recall'

# w ilu przypadkach okazalismy sie lepsi niz random?
positive_tests_count = 0
for i in range(len(results)):
    if results[i][metric] > random_results[i][metric]:
        positive_tests_count += 1

if possibility_of_at_least_k_successes_in_n(positive_tests_count, n_tests) <= p:
    print('We are better than random!')
else:
    print('There is no evidence we are better')