# Laboratorium 1 - content-based recommender

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas sklearn`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas

from sklearn.model_selection import train_test_split, KFold

In [2]:
# tworzymy reprezentacje filmow jako wektorow cech - na podstawie gatunkow

genres = [
    '(no genres listed)', 
    'Action', 
    'Adventure', 
    'Animation', 
    'Children', 
    'Comedy', 
    'Crime', 
    'Documentary', 
    'Drama', 
    'Fantasy', 
    'Film-Noir', 
    'Horror', 
    'IMAX', 
    'Musical', 
    'Mystery', 
    'Romance', 
    'Sci-Fi', 
    'Thriller', 
    'War', 
    'Western'
]
genres_no = len(genres)

movies = pandas.read_csv('ml-latest-small/movies.csv')
movies_no = movies.shape[0]

movies['bias'] = 1.0
for genre in genres:
    movies[genre] = np.where(movies['genres'].str.contains(genre, regex=False), 1.0, 0.0)
    
movies = movies.drop(columns=['title', 'genres']).set_index('movieId')
movies

Unnamed: 0_level_0,bias,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# wczytujemy oceny uytkownikow i od razu dzielimy je na dwa zbiory - treningowy i testowy

all_ratings = pandas.read_csv('ml-latest-small/ratings.csv').drop(columns=['timestamp'])
train_ratings_set, test_ratings_set = train_test_split(all_ratings, test_size=0.05)
train_ratings_set

Unnamed: 0,userId,movieId,rating
25311,177,4558,2.0
37510,253,608,5.0
38350,263,2692,3.5
21632,140,8207,4.0
78417,488,1957,3.0
...,...,...,...
28125,195,1041,4.0
644,6,207,4.0
97218,605,2429,3.0
28907,199,33493,2.5


In [4]:
# inicjalizujemy macierz preferencji uzytkownikow liczbami losowymi z przedzialu [0.0, 5.0]

def initialize_users(raw_ratings):
    users_no = raw_ratings['userId'].unique().size
    users = pandas.DataFrame(5.0 * np.random.uniform(size=(users_no, genres_no+1)), index=raw_ratings['userId'].unique(), columns=['bias']+genres)
    return users_no, users

users_no, users = initialize_users(train_ratings_set)
users

Unnamed: 0,bias,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
177,4.577913,0.451528,4.124485,4.461209,3.731142,3.753771,3.037944,4.219438,1.242442,3.608145,...,3.522171,4.294590,4.258286,3.299619,3.492386,2.566396,4.887273,1.595912,1.433936,3.757851
253,0.559126,2.526491,1.061180,4.233278,2.162506,2.181506,0.964311,1.904505,4.968472,2.703621,...,1.704738,0.027936,4.411018,3.810054,4.633006,2.864421,1.731068,2.412037,1.936029,3.709343
263,0.230113,2.716836,4.201783,0.675183,0.308103,0.306549,1.050363,1.740744,4.830355,0.350090,...,1.673265,0.930150,3.891291,1.541752,2.926185,3.083835,1.651848,1.209608,2.332938,0.110056
140,3.949578,0.231691,2.560407,0.777420,0.001674,1.869944,0.112013,1.063078,0.169937,2.444375,...,3.065027,0.330087,2.827248,3.392413,2.346528,0.212297,1.578061,3.912742,1.707980,2.606599
488,4.418840,4.175265,2.448215,4.279393,3.646287,0.929640,0.469177,4.754789,4.417303,4.817939,...,3.849593,3.931870,3.393737,1.641328,2.260773,4.416493,3.591053,1.739870,0.913891,0.393428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,4.643632,3.246450,0.817604,2.011118,3.614066,0.817415,1.815022,4.081026,2.574541,1.518879,...,1.978426,2.800393,2.594865,0.086231,2.831381,4.068687,3.656793,3.921687,1.429071,0.352924
530,1.679817,1.821645,2.346289,3.896325,2.511168,3.068191,2.029100,3.642259,0.878152,3.441404,...,1.631603,0.111182,4.946363,2.328052,0.300873,4.847164,4.934465,4.141379,4.875016,2.283274
180,4.970417,4.508826,4.596885,2.494960,1.953050,2.849109,0.146955,2.708314,4.743059,3.662684,...,0.755557,1.036137,2.685239,4.974541,4.413887,4.290034,0.404742,0.936044,3.353996,0.974132
548,1.276024,4.896255,0.413948,2.241812,0.753389,0.436540,0.155670,0.597156,3.659607,3.717514,...,0.257677,2.660219,1.447587,2.086269,2.965739,4.081001,2.045192,3.051156,1.238128,4.111848


In [5]:
# za pomoca sprytnej sztuczki przeksztalcamy oceny z formatu dostarczonego przez MovieLens do uzytecznej macierzy
# zwroc uwage na to, ze czesci filmow moze brakowac po podziale datasetu na dwie czesci - musimy uzueplnic brakujace kolumny

def get_ratings(raw_ratings, movies):
    ratings = raw_ratings.pivot(*raw_ratings.columns).fillna(0.0)
    missing_movies = set(movies.index).difference(set(raw_ratings['movieId']))
    for movie in missing_movies:
        ratings[movie] = 0.0
    ratings = ratings.reindex(sorted(ratings.columns), axis=1)
    return ratings

ratings = get_ratings(train_ratings_set, movies)
ratings

  ratings[movie] = 0.0


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Część 2. - trening modelu

In [8]:
# trenujemy model iteracyjnie, wykorzystujac gradient descent

# alpha = 0.0001 # learning speed
# delta = 5000 # minimal upgrade for each step
# lambd = 0.01 # regularization weight

alpha = 0.00005 # learning speed
delta = 25 # minimal upgrade for each step
lambd = 50 # regularization weight

def calculate_user_preferences(users, movies, ratings, raw_ratings, users_no, movies_no, alpha, delta, lambd):
    total_error = 0.0
    model = users
    counter = 0

    while(True):
        previous_total_error = total_error

        # mozemy to policzyc jako iloczyn skalarny preferencji uzytkownikow i cech filmow
        predicted_ratings = model.dot(movies.transpose())
#         predicted_ratings = movies.transpose() @ users 
        # tu stosujemy bardzo przydatna funkcje NumPy
        errors = np.where(ratings==0.0, pandas.DataFrame(np.zeros((users_no, movies_no))), predicted_ratings - ratings)
        # znow iloczyn skalarny - tym razem bledow
        gradient = errors.dot(movies)

        # tu stosujemy pewna sztuczke - rozbijamy sobie macierz z wyrazami regularyzujacymi na dwie
#         pierwsza to kolumna zlozona z zer
        regularization_k0 = pandas.DataFrame(np.zeros((users_no, 1)), index=raw_ratings['userId'].unique(), columns=['bias'])
        # druga to macierz preferencji uzytkownikow (czyli modelu) - bez pierwszej kolumny
#         regularization_k = model.iloc[:,1:]
        regularization_k = model.drop("bias", axis=1)
        
        # teraz sklejamy obie macierze
        regularization = pandas.concat([regularization_k0, regularization_k], axis=1)

#         # najwazniejszy krok - aktualizacja modelu, czyli wszystkich wag
        model = model - alpha * (gradient + lambd * regularization)

    # suma wszystkich bledow
    
        total_error = errors.sum() # suma elementow macierzy errors
        if math.isnan(total_error):
            break
        print("Total error = " + str(total_error))
        progress = abs(previous_total_error - total_error)
        print("Progress = " + str(progress))
        print("\n\n")
        counter += 1
        if progress < delta or counter > 100:
            break
            
    return model

prediction_model = calculate_user_preferences(users, movies, ratings, train_ratings_set, users_no, movies_no, alpha, delta, lambd)

Total error = 543530.9274719004
Progress = 543530.9274719004



Total error = 534482.3873537029
Progress = 9048.540118197445



Total error = 525567.2194854293
Progress = 8915.167868273566



Total error = 516783.5729523725
Progress = 8783.64653305686



Total error = 508129.61952307436
Progress = 8653.953429298126



Total error = 499603.55336043535
Progress = 8526.066162639007



Total error = 491203.59073629323
Progress = 8399.962624142121



Total error = 482927.9697493896
Progress = 8275.62098690361



Total error = 474774.95004671114
Progress = 8153.019702678488



Total error = 466742.8125481409
Progress = 8032.137498570257



Total error = 458829.8591743821
Progress = 7912.953373758763



Total error = 451034.4125781137
Progress = 7795.44659626839



Total error = 443354.81587832957
Progress = 7679.596699784161



Total error = 435789.43239782105
Progress = 7565.383480508521



Total error = 428336.6454037593
Progress = 7452.786994061724



Total error = 420994.8578513496
Progr

## Część 3. - ocena jakości algorytmu

In [None]:
# https://stackoverflow.com/questions/16729574/how-to-get-a-value-from-a-cell-of-a-dataframe

In [9]:
def prepare_for_confusion_matrix(prediction_model, movies, test_ratings_set):
    predicted_ratings = prediction_model.dot(movies.T)
    predicted_ratings = predicted_ratings.astype(int)
    predicted_ratings = predicted_ratings.sort_index(0)

    y_actual = []
    y_predicted = []

    for index, row in test_ratings_set.iterrows():
        y_actual.append(row['rating'])
        y_predicted.append(predicted_ratings.at[int(row['userId']),int(row['movieId'])])

    return y_actual, y_predicted


y_actual, y_predicted = prepare_for_confusion_matrix(prediction_model, movies, test_ratings_set)


  predicted_ratings = predicted_ratings.sort_index(0)


In [None]:
# https://www.codegrepper.com/code-examples/python/sklearn+knn+example+confusion+matrix

In [None]:
# https://stackoverflow.com/questions/31324218/scikit-learn-how-to-obtain-true-positive-true-negative-false-positive-and-fal

In [None]:
# https://stats.stackexchange.com/questions/51296/how-do-you-calculate-precision-and-recall-for-multiclass-classification-using-co

In [10]:
# na podstawie zbioru testowego i wytrenowanego modelu obliczamy metryki opisujace jakosc modelu
## sklearn.confusion_matrix()

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score

positive_threshold = 4.0
negative_threshold = 2.0

# def calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold):
def calculate_stats(y_actual, y_predicted, positive_threshold, negative_threshold):
    # obliczamy true_positives itp.
    # nastepnie wszystkie metryki
    
    y_actual_labels = []
    y_predicted_labels = []
    
    for i in range(0, len(y_actual)):
        if y_actual[i] > int(positive_threshold):
            label = 'Positive'
        elif y_actual[i] <= int(negative_threshold):
            label = 'Negative'
        else:
            label = 'None'
        y_actual_labels.append(label)
    
    for i in range(0, len(y_predicted)):
        if y_predicted[i] > positive_threshold:
            label = 'Positive'
        elif y_predicted[i] < negative_threshold:
            label = 'Negative'
        else:
            label = 'None'
        y_predicted_labels.append(label)
    
    conf_m = confusion_matrix(y_actual_labels, y_predicted_labels)
    
    false_positives = conf_m.sum(axis=0) - np.diag(conf_m)  
    false_negatives = conf_m.sum(axis=1) - np.diag(conf_m)
    true_positives = np.diag(conf_m)
    true_negatives = conf_m.sum() - (false_positives + false_negatives + true_positives)
    
    # fp, ... -> suma elementow w tablicy
    false_positives = false_positives.sum()
    false_negatives = false_negatives.sum()
    true_positives = true_positives.sum()
    true_negatives = true_negatives.sum()

#     print(precision_score(y_actual_labels, y_predicted_labels, average='micro')) # liczy recall, wynik taki sam jak ponizej

        
    recall = true_positives / (true_positives + false_negatives)
    precision = true_positives / (true_positives + false_positives)
    f1 =  2 * ((precision * recall) / (precision + recall))
    accuracy = (true_positives + true_negatives) / (true_positives + false_positives + false_negatives + true_negatives)

    return {
        'true_positives': true_positives,
        'true_negatives': true_negatives,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [11]:
predicted_ratings = prediction_model.dot(movies.T)
calculate_stats(y_actual, y_predicted, positive_threshold, negative_threshold)

{'true_positives': 1406,
 'true_negatives': 6448,
 'false_positives': 3636,
 'false_negatives': 3636,
 'accuracy': 0.5192383974613248,
 'precision': 0.2788575961919873,
 'recall': 0.2788575961919873,
 'f1': 0.2788575961919873}

In [12]:
# dla porownania - obliczmy te same metryki dla modelu losowego
# zauwaz, w jaki sposob ponownie wykorzystujemy funkcje inicjalizujaca preferencje uzytkownikow

# tamten kod opakowac w funkcje zeby zroic tez dla random_prediction

_, random_model = initialize_users(train_ratings_set)
random_prediction = random_model.dot(movies.T)
print(random_prediction)
print("\n\n\n\n")
print(predicted_ratings)







# y_actual, y_predicted = prepare_for_confusion_matrix(prediction_model, movies, test_ratings_set)
y_actual_random, y_predicted_random = prepare_for_confusion_matrix(random_model, movies, train_ratings_set)
calculate_stats(y_actual_random, y_predicted_random, positive_threshold, negative_threshold)

movieId     1          2          3          4         5          6       \
177      16.402133   8.201356  11.026164  13.114792  7.470501  11.542072   
253      16.612852   8.523724   5.010770   8.415075  4.981220   8.902144   
263      17.563003  13.061690   8.267728   8.666755  6.457512   7.266473   
140       9.826916   8.415555   3.911847   8.083466  1.793618   7.611514   
488       7.752827   4.156142   5.983419   8.744422  4.193861   8.994611   
..             ...        ...        ...        ...       ...        ...   
299      13.168943   7.005631   8.725069   9.362520  4.703583  11.865735   
530      19.547426  10.927815   9.428384  10.087793  9.243465  10.277214   
180       8.492719   6.982303   3.884541   6.409547  1.279561  13.153331   
548      19.464449  15.375076   2.827088   7.135503  1.918883   9.577502   
145      18.844928  10.838260  12.724959  14.208757  8.447598  10.971558   

movieId     7          8         9          10      ...     193565     193567  \
177   

  predicted_ratings = predicted_ratings.sort_index(0)


{'true_positives': 25387,
 'true_negatives': 121181,
 'false_positives': 70407,
 'false_negatives': 70407,
 'accuracy': 0.5100110654111949,
 'precision': 0.2650165981167923,
 'recall': 0.2650165981167923,
 'f1': 0.2650165981167923}

## Część 4. - istotność statystyczna

In [16]:
# wielokrotnie uruchamiamy trening modelu
# za każdym razem dzielimy dataset na zbior treningowy i testowy w inny sposob - klasa KFold robi to za nas
# zwroc uwage na bardzo istotny szczegol - oba modele, wytrenowany i losowy, musza byc porownywane na tym samym zbiorze testowym

n_tests = 5
positive_tests_count = 0
results = []
random_results = []
# alpha = 0.01 # learning speed
# delta = 500 # minimal upgrade for each step
# lambd = 0.1 # regularization weight

alpha = 0.00005 # learning speed
delta = 25 # minimal upgrade for each step
lambd = 50 # regularization weight

# for test, train in KFold(n_splits=n_tests, shuffle=True).split(raw_ratings):
for train, test in KFold(n_splits=n_tests, shuffle=True).split(train_ratings_set):
    # wygeneruj macierz użytkowników i ocen
    # wytrenuj model
    # oblicz metryki dla wytrenowanego modelu
    # oblicz metryki dla modelu losowego
    _train_ratings_set, _test_ratings_set = train_test_split(all_ratings, test_size=0.05)
    users_no, users = initialize_users(_train_ratings_set)
    ratings = get_ratings(train_ratings_set, movies)
#     ratings = get_ratings(raw_ratings, movies)
    model = calculate_user_preferences(users, movies, ratings, _train_ratings_set, users_no, movies_no, alpha, delta, lambd)
    y_actual, y_predicted = prepare_for_confusion_matrix(model, movies, _test_ratings_set)
    y_actual_random, y_predicted_random = prepare_for_confusion_matrix(random_model, movies, _test_ratings_set)
    stats = calculate_stats(y_actual, y_predicted, positive_threshold, negative_threshold)
    random_stats = calculate_stats(y_actual_random, y_predicted_random, positive_threshold, negative_threshold)
    if stats['recall'] > random_stats['recall']:
        positive_tests_count += 1
    print("Recall for trained: " + str(stats['recall']))
    print("Recall for random: " + str(random_stats['recall']))
    print("\n\n\n\n")
    

  ratings[movie] = 0.0


Total error = 558214.3327019858
Progress = 558214.3327019858



Total error = 547516.9406151855
Progress = 10697.392086800304



Total error = 537000.2193661038
Progress = 10516.721249081776



Total error = 526659.6879785269
Progress = 10340.531387576833



Total error = 516491.03845709795
Progress = 10168.649521428975



Total error = 506490.1267514244
Progress = 10000.911705673556



Total error = 496652.96424806677
Progress = 9837.162503357627



Total error = 486975.7097583313
Progress = 9677.254489735467



Total error = 477454.6619717282
Progress = 9521.0477866031



Total error = 468086.2523467752
Progress = 9368.409624952998



Total error = 458867.03841254336
Progress = 9219.21393423184



Total error = 449793.69745590835
Progress = 9073.340956635016



Total error = 440863.0205710454
Progress = 8930.676884862944



Total error = 432071.9070490197
Progress = 8791.113522025698



Total error = 423417.3590867586
Progress = 8654.547962261131



Total error = 414896.47679583
Prog

  predicted_ratings = predicted_ratings.sort_index(0)


Recall for trained: 0.28639428798095995
Recall for random: 0.25763585878619594







  ratings[movie] = 0.0


Total error = 533361.3626108492
Progress = 533361.3626108492



Total error = 525110.8108660289
Progress = 8250.551744820317



Total error = 516976.82313988946
Progress = 8133.987726139429



Total error = 508957.6493154624
Progress = 8019.173824427067



Total error = 501051.56328077056
Progress = 7906.086034691834



Total error = 493256.862597861
Progress = 7794.700682909577



Total error = 485571.8681768548
Progress = 7684.99442100618



Total error = 477994.923954922
Progress = 7576.94422193279



Total error = 470524.39658013236
Progress = 7470.527374789643



Total error = 463158.6751001023
Progress = 7365.721480030043



Total error = 455896.170655386
Progress = 7262.504444716324



Total error = 448735.31617754756
Progress = 7160.8544778384385



Total error = 441674.56609183433
Progress = 7060.750085713225



Total error = 434712.3960244156
Progress = 6962.170067418716



Total error = 427847.30251410545
Progress = 6865.093510310166



Total error = 421077.80272852286
Progr

  predicted_ratings = predicted_ratings.sort_index(0)


Recall for trained: 0.3183260610868703
Recall for random: 0.271122570408568







  ratings[movie] = 0.0


Total error = 561600.3351243781
Progress = 561600.3351243781



Total error = 552647.1358658064
Progress = 8953.199258571723



Total error = 543827.2407298741
Progress = 8819.895135932253



Total error = 535138.6799778775
Progress = 8688.560751996585



Total error = 526579.5095843566
Progress = 8559.170393520966



Total error = 518147.8109162173
Progress = 8431.698668139288



Total error = 509841.6904159745
Progress = 8306.120500242803



Total error = 501659.27928906825
Progress = 8182.411126906227



Total error = 493598.73319520464
Progress = 8060.54609386361



Total error = 485658.23194365675
Progress = 7940.501251547888



Total error = 477835.9791924908
Progress = 7822.252751165943



Total error = 470130.2021516593
Progress = 7705.777040831512



Total error = 462539.1512899132
Progress = 7591.050861746073



Total error = 455061.1000454881
Progress = 7478.051244425122



Total error = 447694.3445405083
Progress = 7366.75550497981



Total error = 440437.2032990701
Progres

  predicted_ratings = predicted_ratings.sort_index(0)


Recall for trained: 0.2812376041253471
Recall for random: 0.2631892106307021







  ratings[movie] = 0.0


Total error = 578605.8145534822
Progress = 578605.8145534822



Total error = 569727.9218448219
Progress = 8877.892708660336



Total error = 560978.1567294872
Progress = 8749.765115334652



Total error = 552355.0090821182
Progress = 8623.147647369071



Total error = 543856.9850835276
Progress = 8498.023998590535



Total error = 535482.6070523461
Progress = 8374.37803118152



Total error = 527230.4132789375
Progress = 8252.193773408653



Total error = 519098.95786152076
Progress = 8131.455417416699



Total error = 511086.8105444918
Progress = 8012.147317028954



Total error = 503192.55655889644
Progress = 7894.2539855953655



Total error = 495414.79646504123
Progress = 7777.760093855206



Total error = 487752.1459971938
Progress = 7662.650467847416



Total error = 480203.2359103684
Progress = 7548.910086825432



Total error = 472766.7118291382
Progress = 7436.524081230164



Total error = 465441.2340984767
Progress = 7325.477730661514



Total error = 458225.4776365726
Progr

  predicted_ratings = predicted_ratings.sort_index(0)


Recall for trained: 0.29888932963109877
Recall for random: 0.2608092026973423







  ratings[movie] = 0.0


Total error = 539806.8649162396
Progress = 539806.8649162396



Total error = 531438.8274604848
Progress = 8368.03745575482



Total error = 523181.1472571484
Progress = 8257.680203336407



Total error = 515032.4439168495
Progress = 8148.703340298904



Total error = 506991.35629014706
Progress = 8041.087626702443



Total error = 499056.5422410707
Progress = 7934.814049076347



Total error = 491226.67842329387
Progress = 7829.863817776844



Total error = 483500.4600589209
Progress = 7726.218364372966



Total error = 475876.60071985074
Progress = 7623.859339070157



Total error = 468353.8321116926
Progress = 7522.768608158163



Total error = 460930.90386019665
Progress = 7422.928251495934



Total error = 453606.5833001653
Progress = 7324.320560031338



Total error = 446379.6552668279
Progress = 7226.9280333374045



Total error = 439248.92188963824
Progress = 7130.7333771896665



Total error = 432213.202388464
Progress = 7035.71950117423



Total error = 425271.33287214005
Pro

  predicted_ratings = predicted_ratings.sort_index(0)


Recall for trained: 0.2881792939309798
Recall for random: 0.2631892106307021







In [17]:
# obliczamy, w ilu probach wytrenowany model okazal sie lepszy od losowego
# przeprowadzamy test statystyczny - jak prawdopodobne jest to, by k pozytywnych prob bylo dzielem przypadku

def possibility_of_at_least_k_successes_in_n(k, n):
    p = 0.0
    # obliczamy kolejno prawdopodobienstwo k sukcesow, k+1 sukcesow, ...
    # przydadza Ci sie funkcje marh.comb() i math.pow()
    for i in range(k, n):
        part = math.comb(n, k)
        part = part / (pow(2, n))
        p += part
    return p

p = 0.05
metric = 'recall'

print("Positive_test_counts = " + str(positive_tests_count))
print("Number of tests = " + str(n_tests))

# positive_tests_count =  # w ilu przypadkach okazalismy sie lepsi niz random?
if possibility_of_at_least_k_successes_in_n(positive_tests_count, n_tests) <= p:
    print('We are better than random!')
else:
    print('There is no evidence we are better')

Positive_test_counts = 5
Number of tests = 5
We are better than random!
