# Laboratorium 1 - content-based recommender

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas sklearn`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas

from sklearn.model_selection import train_test_split, KFold

In [2]:
# tworzymy reprezentacje filmow jako wektorow cech - na podstawie gatunkow

genres = [
    '(no genres listed)', 
    'Action', 
    'Adventure', 
    'Animation', 
    'Children', 
    'Comedy', 
    'Crime', 
    'Documentary', 
    'Drama', 
    'Fantasy', 
    'Film-Noir', 
    'Horror', 
    'IMAX', 
    'Musical', 
    'Mystery', 
    'Romance', 
    'Sci-Fi', 
    'Thriller', 
    'War', 
    'Western'
]
genres_no = len(genres)

movies = pandas.read_csv('ml-latest-small/movies.csv')
movies_no = movies.shape[0]

movies['bias'] = 1.0
for genre in genres:
    movies[genre] = np.where(movies['genres'].str.contains(genre, regex=False), 1.0, 0.0)
    
movies = movies.drop(columns=['title', 'genres']).set_index('movieId')
movies

Unnamed: 0_level_0,bias,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# wczytujemy oceny uytkownikow i od razu dzielimy je na dwa zbiory - treningowy i testowy

all_ratings = pandas.read_csv('ml-latest-small/ratings.csv').drop(columns=['timestamp'])
train_ratings_set, test_ratings_set = train_test_split(all_ratings, test_size=0.05)
train_ratings_set

Unnamed: 0,userId,movieId,rating
50837,328,6333,0.5
31380,217,2779,3.0
47592,307,45447,2.0
90300,587,1073,4.0
31011,217,720,4.0
...,...,...,...
73380,474,1090,3.5
69750,448,55765,4.0
13153,84,349,4.0
55731,368,2808,2.0


In [4]:
# inicjalizujemy macierz preferencji uzytkownikow liczbami losowymi z przedzialu [0.0, 5.0]

def initialize_users(raw_ratings):
    users_no = raw_ratings['userId'].unique().size
    users = pandas.DataFrame(5.0 * np.random.uniform(size=(users_no, genres_no+1)), index=raw_ratings['userId'].unique(), columns=['bias']+genres)
    return users_no, users

users_no, users = initialize_users(train_ratings_set)
users

Unnamed: 0,bias,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
328,1.928594,3.715336,3.246022,1.373340,4.894362,4.688525,3.631357,3.063630,2.948849,2.213758,...,0.586317,0.118343,3.016288,3.539836,0.716375,3.082308,1.870993,4.546908,2.031250,4.474080
217,3.675874,2.629590,1.182415,0.263085,3.491432,1.910120,0.714800,1.907360,2.825630,2.721010,...,4.253374,0.110264,3.610657,2.144217,1.029751,1.483393,1.306323,2.343849,0.259179,1.940810
307,2.181335,0.754096,0.387734,4.865723,1.788651,3.600205,0.288105,0.015700,0.180534,4.620366,...,4.626784,4.704844,1.665424,2.019667,1.778946,3.509620,1.268751,0.756263,0.461027,1.203636
587,2.199712,0.810631,3.241832,4.205916,2.954878,4.933252,3.203908,1.287928,0.133542,4.487915,...,2.732024,0.779223,4.911185,3.627061,4.161955,0.877388,3.823120,2.698626,4.218126,2.175938
232,4.233571,3.259668,0.407624,4.772791,4.072376,3.150514,1.788540,0.902469,0.076524,0.500737,...,1.900583,0.059135,4.847387,4.128055,1.716229,1.224978,1.032791,0.060544,2.595523,3.038712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30,0.218258,1.615999,1.370720,0.011506,0.792652,3.704131,4.253806,0.974801,4.739947,0.679839,...,4.682728,4.451673,0.147901,4.804125,3.221174,2.586804,1.293425,0.344875,0.843394,4.542821
349,2.457987,0.560781,2.116403,4.253111,4.952141,1.663514,4.429848,2.078257,2.284921,3.694557,...,3.328661,2.482384,2.322140,0.320968,2.570446,4.138412,0.526160,2.699083,0.455521,3.069014
231,0.428071,2.186596,0.241234,2.004871,1.958925,3.944566,1.567846,3.618150,3.253661,2.535257,...,4.537302,2.931261,2.419759,0.437041,4.070421,2.951259,4.776319,2.723667,2.316952,0.279333
236,4.679369,3.556160,1.729350,2.336546,4.723732,0.082793,2.040232,3.133864,3.321700,1.050686,...,3.923766,2.734374,4.101570,1.261348,1.621314,4.533727,0.296998,0.285930,4.556304,2.417510


In [5]:
# za pomoca sprytnej sztuczki przeksztalcamy oceny z formatu dostarczonego przez MovieLens do uzytecznej macierzy
# zwroc uwage na to, ze czesci filmow moze brakowac po podziale datasetu na dwie czesci - musimy uzueplnic brakujace kolumny

def get_ratings(raw_ratings, movies):
    ratings = raw_ratings.pivot(*raw_ratings.columns).fillna(0.0)
    missing_movies = set(movies.index).difference(set(raw_ratings['movieId']))
    for movie in missing_movies:
        ratings[movie] = 0.0
    ratings = ratings.reindex(sorted(ratings.columns), axis=1)
    return ratings

ratings = get_ratings(train_ratings_set, movies)
ratings

  ratings[movie] = 0.0


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Część 2. - trening modelu

In [102]:
# trenujemy model iteracyjnie, wykorzystujac gradient descent

alpha = 0.0001 # learning speed
delta = 5000 # minimal upgrade for each step
lambd = 0.01 # regularization weight

def calculate_user_preferences(users, movies, ratings, raw_ratings, users_no, movies_no, alpha, delta, lambd):
    total_error = 0.0
    model = users
    counter = 0

    while(True):
        previous_total_error = total_error

        # mozemy to policzyc jako iloczyn skalarny preferencji uzytkownikow i cech filmow
        predicted_ratings = model.dot(movies.transpose())
#         predicted_ratings = movies.transpose() @ users 
        # tu stosujemy bardzo przydatna funkcje NumPy
        errors = np.where(ratings==0.0, pandas.DataFrame(np.zeros((users_no, movies_no))), predicted_ratings - ratings)
        # znow iloczyn skalarny - tym razem bledow
        gradient = errors.dot(movies)

        # tu stosujemy pewna sztuczke - rozbijamy sobie macierz z wyrazami regularyzujacymi na dwie
#         pierwsza to kolumna zlozona z zer
        regularization_k0 = pandas.DataFrame(np.zeros((users_no, 1)), index=raw_ratings['userId'].unique(), columns=['bias'])
        # druga to macierz preferencji uzytkownikow (czyli modelu) - bez pierwszej kolumny
        regularization_k = model.iloc[:,1:]
        
        # teraz sklejamy obie macierze
        regularization = pandas.concat([regularization_k0, regularization_k], axis=1)

#         # najwazniejszy krok - aktualizacja modelu, czyli wszystkich wag
        model = model - alpha * (gradient + lambd * regularization)

    # suma wszystkich bledow
    
        total_error = errors.sum() # suma elementow macierzy errors
        if math.isnan(total_error):
            break
        print("Total error = " + str(total_error))
        progress = abs(previous_total_error - total_error)
        print("Progress = " + str(progress))
        print("\n\n\n\n")
        counter += 1
        if progress < delta or counter > 100:
            break
            
    return model

# prediction_model = calculate_user_preferences(users, movies, ratings, train_ratings_set, users_no, movies_no, alpha, delta, lambd)

## Część 3. - ocena jakości algorytmu

In [None]:
# https://stackoverflow.com/questions/16729574/how-to-get-a-value-from-a-cell-of-a-dataframe

In [58]:
def prepare_for_confusion_matrix(prediction_model, movies, test_ratings_set):
    predicted_ratings = prediction_model.dot(movies.T)
    predicted_ratings = predicted_ratings.astype(int)
    predicted_ratings = predicted_ratings.sort_index(0)

    y_actual = []
    y_predicted = []

    for index, row in test_ratings_set.iterrows():
        y_actual.append(row['rating'])
        y_predicted.append(predicted_ratings.at[int(row['userId']),int(row['movieId'])])

    return y_actual, y_predicted


y_actual, y_predicted = prepare_for_confusion_matrix(prediction_model, movies, test_ratings_set)


  predicted_ratings = predicted_ratings.sort_index(0)


In [None]:
# https://www.codegrepper.com/code-examples/python/sklearn+knn+example+confusion+matrix

In [None]:
# https://stackoverflow.com/questions/31324218/scikit-learn-how-to-obtain-true-positive-true-negative-false-positive-and-fal

In [None]:
# https://stats.stackexchange.com/questions/51296/how-do-you-calculate-precision-and-recall-for-multiclass-classification-using-co

In [76]:
# na podstawie zbioru testowego i wytrenowanego modelu obliczamy metryki opisujace jakosc modelu
## sklearn.confusion_matrix()

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score

positive_threshold = 4.0
negative_threshold = 2.0

# def calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold):
def calculate_stats(y_actual, y_predicted, positive_threshold, negative_threshold):
    # obliczamy true_positives itp.
    # nastepnie wszystkie metryki
    
    y_actual_labels = []
    y_predicted_labels = []
    
    for i in range(0, len(y_actual)):
        if y_actual[i] > int(positive_threshold):
            label = 'Positive'
        elif y_actual[i] <= int(negative_threshold):
            label = 'Negative'
        else:
            label = 'None'
        y_actual_labels.append(label)
    
    for i in range(0, len(y_predicted)):
        if y_predicted[i] > positive_threshold:
            label = 'Positive'
        elif y_predicted[i] < negative_threshold:
            label = 'Negative'
        else:
            label = 'None'
        y_predicted_labels.append(label)
    
    conf_m = confusion_matrix(y_actual_labels, y_predicted_labels)
    
    false_positives = conf_m.sum(axis=0) - np.diag(conf_m)  
    false_negatives = conf_m.sum(axis=1) - np.diag(conf_m)
    true_positives = np.diag(conf_m)
    true_negatives = conf_m.sum() - (false_positives + false_negatives + true_positives)
    
    # fp, ... -> suma elementow w tablicy
    false_positives = false_positives.sum()
    false_negatives = false_negatives.sum()
    true_positives = true_positives.sum()
    true_negatives = true_negatives.sum()

#     print(precision_score(y_actual_labels, y_predicted_labels, average='micro')) # liczy recall, wynik taki sam jak ponizej

        
    recall = true_positives / (true_positives + false_negatives)
    precision = true_positives / (true_positives + false_positives)
    f1 =  2 * ((precision * recall) / (precision + recall))
    accuracy = (true_positives + true_negatives) / (true_positives + false_positives + false_negatives + true_negatives)

    return {
        'true_positives': true_positives,
        'true_negatives': true_negatives,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [94]:
predicted_ratings = prediction_model.dot(movies.T)
calculate_stats(y_actual, y_predicted, positive_threshold, negative_threshold)

{'true_positives': 855,
 'true_negatives': 5897,
 'false_positives': 4187,
 'false_negatives': 4187,
 'accuracy': 0.4463837101679228,
 'precision': 0.16957556525188416,
 'recall': 0.16957556525188416,
 'f1': 0.16957556525188416}

In [103]:
# dla porownania - obliczmy te same metryki dla modelu losowego
# zauwaz, w jaki sposob ponownie wykorzystujemy funkcje inicjalizujaca preferencje uzytkownikow

# tamten kod opakowac w funkcje zeby zroic tez dla random_prediction

_, random_model = initialize_users(train_ratings_set)
random_prediction = random_model.dot(movies.T)
print(random_prediction)
print("\n\n\n\n")
print(predicted_ratings)







# y_actual, y_predicted = prepare_for_confusion_matrix(prediction_model, movies, test_ratings_set)
y_actual_random, y_predicted_random = prepare_for_confusion_matrix(random_model, movies, train_ratings_set)
calculate_stats(y_actual_random, y_predicted_random, positive_threshold, negative_threshold)

movieId     1          2          3          4         5          6       \
328      14.838169  10.174971   7.763807   9.159185  5.441146   9.958419   
217      17.964539  10.001678   8.463022   9.920574  4.352771  10.455046   
307      18.907616  13.427281   8.319495  11.187184  6.610162  15.900987   
587      18.883713  12.055138   9.755775  11.543236  6.510910   9.121296   
232      16.042551   9.505536   8.333661   8.909144  5.074630   6.144064   
..             ...        ...        ...        ...       ...        ...   
30       12.064805   8.815115   4.790686   7.528430  3.979717   7.753324   
349       8.976193   6.921582   2.876998   3.524866  1.042077  10.266140   
231      12.372487   9.099396  11.058361  11.525213  6.212828  16.072330   
236      13.349884   6.821510   5.179362   7.939620  2.866006   5.738990   
441      19.275153  12.258522  10.366192  13.163845  5.827338  14.125980   

movieId     7          8         9          10      ...     193565     193567  \
328   

  predicted_ratings = predicted_ratings.sort_index(0)


{'true_positives': 26836,
 'true_negatives': 122630,
 'false_positives': 68958,
 'false_negatives': 68958,
 'accuracy': 0.5200952042925444,
 'precision': 0.2801428064388166,
 'recall': 0.2801428064388166,
 'f1': 0.2801428064388166}

## Część 4. - istotność statystyczna

In [104]:
# wielokrotnie uruchamiamy trening modelu
# za każdym razem dzielimy dataset na zbior treningowy i testowy w inny sposob - klasa KFold robi to za nas
# zwroc uwage na bardzo istotny szczegol - oba modele, wytrenowany i losowy, musza byc porownywane na tym samym zbiorze testowym

n_tests = 5
positive_tests_count = 0
results = []
random_results = []
alpha = 0.01 # learning speed
delta = 500 # minimal upgrade for each step
lambd = 0.1 # regularization weight

# for test, train in KFold(n_splits=n_tests, shuffle=True).split(raw_ratings):
for train, test in KFold(n_splits=n_tests, shuffle=True).split(train_ratings_set):
    # wygeneruj macierz użytkowników i ocen
    # wytrenuj model
    # oblicz metryki dla wytrenowanego modelu
    # oblicz metryki dla modelu losowego
    users_no, users = initialize_users(train_ratings_set)
    ratings = get_ratings(train_ratings_set, movies)
#     ratings = get_ratings(raw_ratings, movies)
    model = calculate_user_preferences(users, movies, ratings, train_ratings_set, users_no, movies_no, alpha, delta, lambd)
    y_actual, y_predicted = prepare_for_confusion_matrix(model, movies, test_ratings_set)
    y_actual_random, y_predicted_random = prepare_for_confusion_matrix(random_model, movies, test_ratings_set)
    stats = calculate_stats(y_actual, y_predicted, positive_threshold, negative_threshold)
    random_stats = calculate_stats(y_actual_random, y_predicted_random, positive_threshold, negative_threshold)
    if stats['recall'] > random_stats['recall']:
        positive_tests_count += 1
    print("Recall for trained: " + str(stats['recall']))
    print("Recall for random: " + str(random_stats['recall']))
    print("\n\n\n\n")
    

  ratings[movie] = 0.0


Total error = 546530.5290979043
Progress = 546530.5290979043





Total error = -983215.3944474477
Progress = 1529745.9235453522





Total error = 753829.6663366457
Progress = 1737045.0607840936





Total error = -3154258.7655962682
Progress = 3908088.431932914





Total error = -3300877.0593739566
Progress = 146618.2937776884





Total error = -20590007.96925323
Progress = 17289130.909879275





Total error = -89119779.71785538
Progress = 68529771.74860215





Total error = -272558316.5883845
Progress = 183438536.87052912





Total error = -635061904.3959665
Progress = 362503587.807582





Total error = -1184012247.5900643
Progress = 548950343.1940978





Total error = -1533345955.5741768
Progress = 349333707.9841125





Total error = -366412630.0167632
Progress = 1166933325.5574136





Total error = 3557519548.0759654
Progress = 3923932178.0927286





Total error = 802648493.1327364
Progress = 2754871054.9432287





Total error = -70572506478.75934
Progress = 71375154971

  predicted_ratings = predicted_ratings.sort_index(0)


Recall for trained: 0.1309004363347878
Recall for random: 0.2693375644585482







  ratings[movie] = 0.0


Total error = 576282.6343409684
Progress = 576282.6343409684





Total error = -970637.3967507072
Progress = 1546920.0310916756





Total error = 912992.3385412854
Progress = 1883629.7352919925





Total error = -1612607.2560669533
Progress = 2525599.594608239





Total error = 485364.051758232
Progress = 2097971.3078251854





Total error = -11114546.562267551
Progress = 11599910.614025783





Total error = -67725237.00824444
Progress = 56610690.44597689





Total error = -234515634.43425995
Progress = 166790397.4260155





Total error = -584871000.6385416
Progress = 350355366.2042816





Total error = -1164299024.4844053
Progress = 579428023.8458637





Total error = -1689411311.880567
Progress = 525112287.3961618





Total error = -876224885.6890051
Progress = 813186426.1915619





Total error = 3618922207.3387394
Progress = 4495147093.027744





Total error = 7740826093.199941
Progress = 4121903885.8612013





Total error = -32405175707.15364
Progress = 40146001800.35

  predicted_ratings = predicted_ratings.sort_index(0)


Recall for trained: 0.1309004363347878
Recall for random: 0.2693375644585482







  ratings[movie] = 0.0


Total error = 554910.4806467879
Progress = 554910.4806467879





Total error = -975061.8415789012
Progress = 1529972.322225689





Total error = 776555.644346195
Progress = 1751617.485925096





Total error = -2919275.533545131
Progress = 3695831.177891326





Total error = -5703581.641661253
Progress = 2784306.108116122





Total error = -27591075.45520363
Progress = 21887493.813542377





Total error = -100641657.44761114
Progress = 73050581.9924075





Total error = -289906589.2824541
Progress = 189264931.83484292





Total error = -677102783.4475431
Progress = 387196194.1650891





Total error = -1328565129.542501
Progress = 651462346.0949578





Total error = -1907845030.634406
Progress = 579279901.0919051





Total error = -809154590.7081164
Progress = 1098690439.9262896





Total error = 4561390985.663953
Progress = 5370545576.372069





Total error = 8362047292.157579
Progress = 3800656306.4936266





Total error = -39390095434.00416
Progress = 47752142726.161736


  predicted_ratings = predicted_ratings.sort_index(0)


Recall for trained: 0.1309004363347878
Recall for random: 0.2693375644585482







  ratings[movie] = 0.0


Total error = 576653.2563308293
Progress = 576653.2563308293





Total error = -952075.0542117514
Progress = 1528728.3105425807





Total error = 823525.9019069389
Progress = 1775600.9561186903





Total error = -2331695.0850836146
Progress = 3155220.9869905533





Total error = -4339.358459620069
Progress = 2327355.7266239943





Total error = -18760501.440512653
Progress = 18756162.08205303





Total error = -109841914.44737585
Progress = 91081413.00686319





Total error = -371592572.0576312
Progress = 261750657.61025536





Total error = -919382177.3342464
Progress = 547789605.2766151





Total error = -1812169818.8567638
Progress = 892787641.5225174





Total error = -2552606830.716941
Progress = 740437011.860177





Total error = -942442266.4729044
Progress = 1610164564.2440364





Total error = 7318558286.240735
Progress = 8261000552.713639





Total error = 19341797007.060104
Progress = 12023238720.81937





Total error = -11796952676.28189
Progress = 31138749683.

  predicted_ratings = predicted_ratings.sort_index(0)


Recall for trained: 0.1309004363347878
Recall for random: 0.2693375644585482







  ratings[movie] = 0.0


Total error = 559803.0817355348
Progress = 559803.0817355348





Total error = -996175.5819174543
Progress = 1555978.663652989





Total error = 738329.7974149247
Progress = 1734505.379332379





Total error = -2136062.854839912
Progress = 2874392.6522548366





Total error = -1896403.379623501
Progress = 239659.47521641105





Total error = -23760694.53558592
Progress = 21864291.15596242





Total error = -111736266.70683844
Progress = 87975572.17125252





Total error = -345217476.4383198
Progress = 233481209.73148137





Total error = -804827726.4193935
Progress = 459610249.98107374





Total error = -1517006118.286543
Progress = 712178391.8671494





Total error = -1975696160.8891215
Progress = 458690042.60257864





Total error = 232943393.13556415
Progress = 2208639554.024686





Total error = 11234564709.344452
Progress = 11001621316.208887





Total error = 36917980400.901474
Progress = 25683415691.557022





Total error = 59528040483.531746
Progress = 22610060082

  predicted_ratings = predicted_ratings.sort_index(0)


Recall for trained: 0.1309004363347878
Recall for random: 0.2693375644585482







In [108]:
# obliczamy, w ilu probach wytrenowany model okazal sie lepszy od losowego
# przeprowadzamy test statystyczny - jak prawdopodobne jest to, by k pozytywnych prob bylo dzielem przypadku

def possibility_of_at_least_k_successes_in_n(k, n):
    p = 0.0
    # obliczamy kolejno prawdopodobienstwo k sukcesow, k+1 sukcesow, ...
    # przydadza Ci sie funkcje marh.comb() i math.pow()
    for i in range(k, n):
        part = math.comb(n, k)
        part = part / (pow(2, n))
        p += part
    return p

p = 0.05
metric = 'recall'

print("Positive_test_counts = " + str(positive_tests_count))
print("Number of tests = " + str(n_tests))

# positive_tests_count =  # w ilu przypadkach okazalismy sie lepsi niz random?
if possibility_of_at_least_k_successes_in_n(positive_tests_count, n_tests) <= p:
    print('We are better than random!')
else:
    print('There is no evidence we are better')

Positive_test_counts = 0
Number of tests = 5
There is no evidence we are better
