# Laboratorium 1 - content-based recommender

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas sklearn`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas

from sklearn.model_selection import train_test_split, KFold

In [2]:
# tworzymy reprezentacje filmow jako wektorow cech - na podstawie gatunkow

genres = [
    '(no genres listed)', 
    'Action', 
    'Adventure', 
    'Animation', 
    'Children', 
    'Comedy', 
    'Crime', 
    'Documentary', 
    'Drama', 
    'Fantasy', 
    'Film-Noir', 
    'Horror', 
    'IMAX', 
    'Musical', 
    'Mystery', 
    'Romance', 
    'Sci-Fi', 
    'Thriller', 
    'War', 
    'Western'
]
genres_no = len(genres)

movies = pandas.read_csv('ml-latest-small/movies.csv')
movies_no = movies.shape[0]

movies['bias'] = 1.0
for genre in genres:
    movies[genre] = np.where(movies['genres'].str.contains(genre, regex=False), 1.0, 0.0)
    
movies = movies.drop(columns=['title', 'genres']).set_index('movieId')
movies

Unnamed: 0_level_0,bias,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# wczytujemy oceny uytkownikow i od razu dzielimy je na dwa zbiory - treningowy i testowy

all_ratings = pandas.read_csv('ml-latest-small/ratings.csv').drop(columns=['timestamp'])
train_ratings_set, test_ratings_set = train_test_split(all_ratings, test_size=0.05)
train_ratings_set

Unnamed: 0,userId,movieId,rating
78406,488,1223,4.5
32082,220,648,3.0
42878,288,4019,3.0
61713,408,160563,3.5
31601,219,913,4.5
...,...,...,...
64639,414,54256,1.5
15768,103,4881,5.0
93372,599,2334,2.5
29432,202,537,3.0


In [4]:
# inicjalizujemy macierz preferencji uzytkownikow liczbami losowymi z przedzialu [0.0, 5.0]

def initialize_users(raw_ratings):
    users_no = raw_ratings['userId'].unique().size
    users = pandas.DataFrame(5.0 * np.random.uniform(size=(users_no, genres_no+1)), index=raw_ratings['userId'].unique(), columns=['bias']+genres)
    return users_no, users

users_no, users = initialize_users(train_ratings_set)
users

Unnamed: 0,bias,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
427,1.889738,3.180737,2.599051,1.183608,1.140261,2.103706,2.890012,4.915758,3.186870,4.624745,...,1.760264,3.266167,3.216441,1.287855,3.947325,2.113970,1.435626,1.377056,3.935570,3.325519
357,1.489289,0.272802,3.408779,0.940798,1.999175,0.216589,3.972829,0.850834,2.764227,3.814690,...,4.524383,4.522930,4.378807,1.560737,4.654868,3.673750,4.088215,1.699955,0.125793,0.018443
177,1.560249,1.225888,0.436308,4.732954,0.476936,0.539614,0.618855,1.362431,0.648818,3.470221,...,3.665584,3.453128,3.778277,2.322913,0.914471,0.211495,1.190987,3.536493,4.428607,2.464198
282,0.917688,0.974414,4.189947,2.718232,4.880500,2.320363,2.304441,2.825639,0.449559,3.516685,...,3.649542,4.399842,2.069391,1.554769,4.985934,0.463658,3.813881,4.963195,1.615923,4.881246
414,3.072434,3.946296,1.958489,0.956891,2.001080,1.652357,0.015941,4.696303,2.575890,1.894580,...,1.382204,3.625066,4.847774,3.568418,1.599265,1.763447,0.881970,3.895336,1.338887,4.628302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,3.417806,0.505694,4.439568,1.447335,4.098696,4.245801,2.662133,0.491958,4.341223,1.325881,...,2.909160,1.910162,0.575130,4.009465,2.416158,3.929605,3.678713,3.110752,3.454008,2.746524
547,2.619386,1.492414,0.345166,1.059544,0.046263,2.194732,0.221412,4.438392,0.342579,0.196487,...,1.685270,0.599855,2.068183,0.241047,0.215065,2.761273,4.551661,3.613004,4.687985,0.530253
557,2.291026,2.999587,1.968188,0.910130,4.030467,2.823557,2.184802,1.776955,2.509543,3.148025,...,1.155275,1.637929,1.643671,2.651110,4.374394,2.210309,3.353277,4.089236,1.702532,3.405576
544,4.782991,1.985777,1.166136,2.437272,3.228261,0.604195,1.447623,1.755195,4.289584,2.989166,...,3.497404,3.335984,1.946167,4.873460,4.556294,0.358003,4.185561,4.658200,2.897081,1.099907


In [5]:
# za pomoca sprytnej sztuczki przeksztalcamy oceny z formatu dostarczonego przez MovieLens do uzytecznej macierzy
# zwroc uwage na to, ze czesci filmow moze brakowac po podziale datasetu na dwie czesci - musimy uzueplnic brakujace kolumny

def get_ratings(raw_ratings, movies):
    ratings = raw_ratings.pivot(*raw_ratings.columns).fillna(0.0)
    missing_movies = set(movies.index).difference(set(raw_ratings['movieId']))
    for movie in missing_movies:
        ratings[movie] = 0.0
    ratings = ratings.reindex(sorted(ratings.columns), axis=1)
    return ratings

ratings = get_ratings(train_ratings_set, movies)
ratings

  ratings[movie] = 0.0


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Część 2. - trening modelu

In [88]:
# trenujemy model iteracyjnie, wykorzystujac gradient descent

alpha = 0.0001 # learning speed
delta = 100 # minimal upgrade for each step
lambd = 0.01 # regularization weight

def calculate_user_preferences(users, movies, ratings, raw_ratings, users_no, movies_no, alpha, delta, lambd):
    total_error = 0.0
    model = users

    while(True):
        previous_total_error = total_error

        predicted_ratings = np.dot(model, movies.T)# mozemy to policzyc jako iloczyn skalarny preferencji uzytkownikow i cech filmow
        # tu stosujemy bardzo przydatna funkcje NumPy
        errors = np.where(ratings==0.0, pandas.DataFrame(np.zeros((users_no, movies_no))), predicted_ratings - ratings)
        gradient = np.dot(errors, movies)# znow iloczyn skalarny - tym razem bledow

        # tu stosujemy pewna sztuczke - rozbijamy sobie macierz z wyrazami regularyzujacymi na dwie
        # pierwsza to kolumna zlozona z zer
        regularization_k0 = pandas.DataFrame(np.zeros((users_no, 1)), index=raw_ratings['userId'].unique(), columns=['bias'])
        # druga to macierz preferencji uzytkownikow (czyli modelu) - bez pierwszej kolumny
        regularization_k = model.drop(model.columns[[0]], axis=1, inplace=False)# ...
        # teraz sklejamy obie macierze
        regularization = pandas.concat([regularization_k0, regularization_k], axis=1)

        # najwazniejszy krok - aktualizacja modelu, czyli wszystkich wag
        model = model - (alpha * (regularization + (lambd * gradient)))

        total_error = errors.sum()# suma wszystkich bledow
        print(total_error)
        progress = abs(previous_total_error - total_error)
        if progress < delta:
            break
            
    return model

# prediction_model = calculate_user_preferences(users, movies, ratings, train_ratings_set, users_no, movies_no, alpha, delta, lambd)

## Część 3. - ocena jakości algorytmu

In [21]:
# na podstawie zbioru testowego i wytrenowanego modelu obliczamy metryki opisujace jakosc modelu

positive_threshold = 4.0
negative_threshold = 2.0

def calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold):
    # obliczamy true_positives itp.
    # nastepnie wszystkie metryki
    
    true_positives = 0
    true_negatives = 0
    false_positives = 0
    false_negatives = 0
    
    for _, row in test_ratings_set.iterrows():
        
        my_prediction = predicted_ratings[row['movieId']][row['userId']]
        real_rating = row['rating']
 
        if real_rating >= positive_threshold and my_prediction >= positive_threshold:
            true_positives += 1
        elif real_rating >= positive_threshold and my_prediction <= positive_threshold:
            false_positives += 1
        elif real_rating <= negative_threshold and my_prediction <= negative_threshold:
            true_negatives += 1
        elif real_rating <= negative_threshold and my_prediction >= negative_threshold:
            false_negatives += 1
    
    accuracy = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1 = (2 * precision * recall) / (precision + recall)
        
    return {
        'true_positives': true_positives,
        'true_negatives': true_negatives,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

predicted_ratings = prediction_model.dot(movies.T)
calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold)

{'true_positives': 1972,
 'true_negatives': 21,
 'false_positives': 433,
 'false_negatives': 663,
 'accuracy': 0.6451926189705406,
 'precision': 0.81995841995842,
 'recall': 0.7483870967741936,
 'f1': 0.7825396825396826}

In [22]:
predicted_ratings = prediction_model.dot(movies.T)
calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold)

{'true_positives': 1972,
 'true_negatives': 21,
 'false_positives': 433,
 'false_negatives': 663,
 'accuracy': 0.6451926189705406,
 'precision': 0.81995841995842,
 'recall': 0.7483870967741936,
 'f1': 0.7825396825396826}

In [23]:
# dla porownania - obliczmy te same metryki dla modelu losowego
# zauwaz, w jaki sposob ponownie wykorzystujemy funkcje inicjalizujaca preferencje uzytkownikow

_, random_model = initialize_users(train_ratings_set)
random_prediction = random_model.dot(movies.T)
calculate_stats(test_ratings_set, random_prediction, positive_threshold, negative_threshold)

{'true_positives': 2190,
 'true_negatives': 13,
 'false_positives': 215,
 'false_negatives': 671,
 'accuracy': 0.7131757850437035,
 'precision': 0.9106029106029107,
 'recall': 0.7654666200629151,
 'f1': 0.8317508545385492}

## Część 4. - istotność statystyczna

In [90]:
# wielokrotnie uruchamiamy trening modelu
# za każdym razem dzielimy dataset na zbior treningowy i testowy w inny sposob - klasa KFold robi to za nas
# zwroc uwage na bardzo istotny szczegol - oba modele, wytrenowany i losowy, musza byc porownywane na tym samym zbiorze testowym

n_tests = 5
results = []
random_results = []
train_ratings_set, test_ratings_set = train_test_split(all_ratings, test_size=0.05)
parsed_ratings = get_ratings(train_ratings_set, movies)

for train, test in KFold(n_splits=n_tests, shuffle=True).split(parsed_ratings):
    # wygeneruj macierz użytkowników i ocen
#     print(train)
    
    train_data = parsed_ratings.iloc[train]
    test_data = all_ratings
    
    print(parsed_ratings.shape)
    print(train_data.shape)
    print(movies.shape)
    
#     print(train_data)
    
    _, random_model = initialize_users(all_ratings)
    random_prediction = random_model.dot(movies.T)
    
    # wytrenuj model
    _, users = initialize_users(all_ratings)
    prediction_model = calculate_user_preferences(users, movies, parsed_ratings, all_ratings, users_no, movies_no, alpha, delta, lambd)
    
    # oblicz metryki dla modelu losowego
    rand_recall = calculate_stats(test_data, random_prediction, positive_threshold, negative_threshold)['recall']
    
    # oblicz metryki dla wytrenowanego modelu
    predicted_ratings = prediction_model.dot(movies.T)
    trained_recall = calculate_stats(test_data, predicted_ratings, positive_threshold, negative_threshold)['recall']
    
    results.append(trained_recall)
    random_results.append(rand_recall)
    
    print(f"rand vs traind: {rand_recall} vs {trained_recall}")

  ratings[movie] = 0.0


(610, 9742)
(488, 9742)
(9742, 21)
544274.2081603738
543665.7417246558
543058.4570637876
542452.3508424017
541847.4197364257
541243.6604330386
540641.0696306337
540039.6440387728
539439.3803781448
538840.2753805277
538242.325788744
537645.5283566231
537049.8798489573
536455.3770414636
535862.0167207438
535269.7956842412
534678.7107402016
534088.7587076391
533499.9364162838
532912.2407065567
532325.6684295173
531740.2164468352
531155.8816307443
530572.6608640021
529990.5510398591
529409.5490620129
528829.6518445715
528250.8563120156
527673.1593991589
527096.5580511142
526521.0492232493
525946.6298811528
525373.2970005975
524801.047567498
524229.8785778799
523659.7870378361
523090.76996349567
522522.82438098197
521955.94732637727
521390.13584568986
520825.3869948095
520261.6978394806
519699.0654552589
519137.4869274771
518576.9593512113
518017.47983124136
517459.04548201937
516901.6534276306
516345.30080175935
515789.9847476547
515235.7024180944
514682.4509753472
514130.2275911449
513579

354610.8337921549
354314.6077461513
354018.774467031
353723.3331273996
353428.2829021914
353133.622968658
352839.3525063674
352545.4706971891
352251.976725295
351958.8697771417
351666.14904147177
351373.81370930135
351081.86297391413
350790.29603085195
350499.1120779119
350208.31031513354
349917.8899447944
349627.8501714018
349338.1902016859
349048.9092445928
348760.0065112757
348471.4812150883
348183.3325715796
347895.55979848246
347608.1621157096
347321.138745345
347034.48891163897
346748.2118409981
346462.30676197825
346176.7729052804
345891.60950374196
345606.81579232676
345322.39100812544
345038.33439033985
344754.6451802817
344471.3226213635
344188.3659590939
343905.774441067
343623.54731695936
343341.6838385195
343060.183259565
342779.0448359728
342498.26782567386
342217.85148864443
341937.7950869039
341658.09788450116
341378.75914751575
341099.77814404573
340821.15414420137
340542.8864201026
340264.9742458664
339987.41689760674
339710.21365342423
339433.36379339936
339156.86659

252054.6403351791
251876.41829247188
251698.3710996441
251520.49848603515
251342.8001815748
251165.27591678567
250987.9254227785
250810.74843124964
250633.7446744824
250456.91388534382
250280.25579728314
250103.77014433054
249927.45666109465
249751.31508276195
249575.3451450942
249399.54658442893
249223.9191376747
249048.4625423124
248873.17653639117
248698.06085852964
248523.11524791125
248348.3394442858
248173.73318796486
247999.2962198225
247825.0282812937
247650.9291143702
247476.99846160205
247303.23606609507
247129.64167150858
246956.21502205468
246782.95586249532
246609.8639381439
246436.93899486063
246264.18077905182
246091.58903766974
245919.16351820875
245746.9039687069
245574.8101377413
245402.8817744291
245231.11862842398
245059.5204499166
244888.0869896324
244716.81799882816
244545.71322929405
244374.77243335065
244203.9953638456
244033.38177415618
243862.93141818326
243692.64405035446
243522.5194256182
243352.55729944594
243182.75742782967
243013.11956727834
242843.643474

187440.92974325974
187319.88206825333
187198.93035651755
187078.07449238116
186957.31436037007
186836.64984520603
186716.08083180682
186595.60720528598
186475.22885095203
186354.94565430787
186234.75750105068
186114.66427707145
185994.665868454
185874.76216147604
185754.95304260636
185635.23839850744
185515.61811603192
185396.09208222403
185276.66018431893
185157.3223097431
185038.07834610998
184918.92818122587
184799.87170308386
184680.90879986619
184562.03935994473
184443.26327187725
184324.58042440985
184205.9907064753
184087.49400719357
183969.09021586968
183850.77922199576
183732.56091524794
183614.43518548907
183496.40192276362
183378.4610173037
183260.6123595222
183142.85584001718
183025.1913495689
182907.6187791396
182790.13801987516
182672.74896310124
182555.45150032657
182438.2455232399
182321.1309237108
182204.1075937879
182087.1754257012
181970.33431185942
181853.58414485015
181736.92481743885
181620.3562225708
181503.878253367
181387.49080312825
181271.19376532946
181154.9

451462.3863066996
450999.865856015
450538.15916677617
450077.26410748984
449617.1785535022
449157.9003869815
448699.427496881
448241.7577789255
447784.8891355834
447328.81947604223
446873.5467161876
446419.068778573
445965.3835924051
445512.48909351265
445060.38322432566
444609.0639338517
444158.5291776563
443708.77691783087
443259.8051229779
442811.6117681841
442364.1948349987
441917.55231140845
441471.68219181686
441026.5824770214
440582.25117418775
440138.68629683374
439695.8858647989
439253.8479042273
438812.570447545
438372.0515334334
437932.28920681466
437493.28151882166
437055.02652678115
436617.5222941895
436180.7668906923
435744.7583920616
435309.4948801741
434874.974442991
434441.1951745354
434008.15517486946
433575.85255007783
433144.2854122384
432713.4518794101
432283.35007560614
431853.978130773
431425.3341807718
430997.4163673586
430570.2228381585
430143.7517466488
429718.00125213695
429292.96951974253
428868.6547203731
428445.0550307058
428022.16863316705
427599.99371591

302547.73758491554
302308.26446600445
302069.08424859756
301830.19636111456
301591.6002334813
301353.2952971307
301115.28098499216
300877.5567314906
300640.121972541
300402.9761455425
300166.11868937395
299929.5490443923
299693.2666524227
299457.27095675794
299221.56140215346
298986.1374348202
298750.99850242335
298516.14405407436
298281.57354032935
298047.28641318355
297813.2821260664
297579.56013383705
297346.1198927799
297112.96086060203
296880.0824964247
296647.4842607829
296415.1656156199
296183.12602427986
295951.36495150824
295719.8818634444
295488.6762276178
295257.7475129428
295027.0951897167
294796.71872961416
294566.6176056815
294336.79129233386
294107.2392653516
293877.9610018765
293648.9559804017
293420.22368077695
293191.76358419674
292963.5751731992
292735.65793166205
292508.01134479797
292280.63489915
292053.5280825874
291826.6903843035
291600.1212948078
291373.8203059261
291147.7869107945
290922.0206038525
290696.52088084526
290471.28723881324
290246.3191760927
290021.

218924.51013834245
218774.68398139026
218624.99662693744
218475.44787261778
218326.03751648212
218176.76535699892
218027.6311930513
217878.63482393752
217729.77604937062
217581.054669474
217432.470484783
217284.02329624447
217135.71290521315
216987.5391134523
216839.5017231329
216691.6005368304
216543.83535752632
216396.2059886063
216248.71223385827
216101.35389747145
215954.13078403653
215807.0426985436
215660.08944638073
215513.27083333544
215366.586665589
215220.03674972078
215073.62089270278
214927.33890190124
214781.19058507492
214635.17575037337
214489.2942063373
214343.54576189572
214197.9302263669
214052.447409456
213907.09712125396
213761.87917223893
213616.7933732706
213471.83953559404
213327.01747083603
213182.32699100458
213037.76790848747
212893.34003605339
212749.04318684785
212604.87717439412
212460.84181259206
212316.93691571744
212173.16229841954
212029.51777572173
211886.00316301952
211742.61827608026
211599.36293104128
211456.23694441142
211313.24013306582
211170.372

164076.16898079586
163972.4325685722
163868.7747627012
163765.19547284252
163661.69460880157
163558.27208052936
163454.92779812418
163351.66167182894
163248.47361203283
163145.3635292692
163042.33133421652
162939.37693769817
162836.50025068034
162733.70118427507
162630.9796497361
162528.33555846137
162425.76882199157
162323.27935200935
162220.86706034097
162118.53185895324
162016.27365995603
161914.09237560007
161811.98791827634
161709.96020051854
161608.00913499857
161506.13463453052
161404.3366120676
161302.61498070258
161200.96965366803
161099.40054433473
160997.90756621276
160896.49063295088
160795.14965833607
160693.8845562922
160592.69524088173
160491.58162630425
160390.54362689608
160289.5811571301
160188.6941316162
160087.88246510067
159987.1460724644
159886.48486872498
159785.8987690347
159685.3876886815
159584.9515430875
159484.59024780858
159384.30371853762
159284.09187109742
159183.9546214477
159083.8918856796
158983.9035800184
rand vs traind: 0.7714663143989432 vs 0.749782

378090.6023346331
377765.1427003546
377440.13105784624
377115.5664397724
376791.4478815868
376467.77442152274
376144.5451005805
375821.7589625227
375499.4150538623
375177.5124238517
374856.0501244779
374535.02721044916
374214.4427391876
373894.2957708187
373574.5853681654
373255.3105967337
372936.47052471
372618.06422294554
372300.09076495096
371982.5492268885
371665.43868755846
371348.7582283967
371032.5069334573
370716.68388941314
370401.28818553966
370086.31891371
369771.77516838507
369457.65604660404
369143.96064797684
368830.68807467545
368517.83743142395
368205.4078254914
367893.39836668223
367581.80816733005
367270.6363422845
366959.88200890797
366649.5442870628
366339.62229910516
366030.1151698778
365721.0220266992
365412.3419993537
365104.07422009145
364796.2178236082
364488.77194704855
364181.73572998814
363875.1083144332
363568.88884480594
363263.0764679429
362957.6703330817
362652.6695918543
362348.07339827996
362043.8809087575
361740.09128205676
361436.7036793067
361133.71

266655.2677705538
266463.0641610244
266271.05675197917
266079.2452333777
265887.6292958736
265696.2086308044
265504.98293019243
265313.9518867441
265123.11519384605
264932.47254556254
264742.0236366376
264551.7681624879
264361.705819204
264171.83630354836
263982.15931295283
263792.6745455154
263603.3817000013
263414.2804758381
263225.3705731176
263036.65169258823
262848.1235356591
262659.7858043946
262471.6382015136
262283.6804303885
262095.91219504087
261908.33320014252
261720.94315101113
261533.7417536114
261346.72871454986
261159.90374107545
260973.2665410778
260786.8168230828
260600.55429625415
260414.4786703892
260228.58965591848
260042.8869639032
259857.37030603475
259672.03939463
259486.89394263361
259301.93366361284
259117.1582717575
258932.56748187827
258748.16100940423
258563.9385703816
258379.89988147127
258196.04465994914
258012.3726237016
257828.88349122583
257645.5769816278
257462.4528146191
257279.51071051773
257096.75039024532
256914.17157532307
256731.77398787515
25654

197255.9685236958
197127.7483594574
196999.63419740344
196871.62590670516
196743.72335675659
196615.9264171778
196488.2349578098
196360.64884871696
196233.16796018524
196105.7921627224
195978.52132705686
195851.35532413734
195724.2940251335
195597.3373014335
195470.4850246448
195343.73706659325
195217.09329932398
195090.55359509744
194964.11782639293
194837.78586590596
194711.55758654684
194585.43286144378
194459.41156393845
194333.49356758714
194207.67874616123
194081.9669736456
193956.35812423727
193830.85207234768
193705.4486925989
193580.1478598261
193454.9494490753
193329.85333560308
193204.8593948775
193079.96750257476
192955.17753458174
192830.4893669947
192705.9028761173
192581.41793846185
192457.03443074887
192332.75222990438
192208.5712130634
192084.4912575653
191960.51224095587
191836.63404098636
191712.85653561354
191589.17960299685
191465.6031215015
191342.1269696955
191218.75102634978
191095.47517043853
190972.2992811375
190849.22323782442
190726.2469200785
190603.3702076

493761.0311858685
493246.72934706096
492733.32932865986
492220.8287709218
491709.22532177175
491198.5166367786
490688.7003791286
490179.7742195931
489671.7358365067
489164.5829157344
488658.3131506487
488152.9242421
487648.4138983871
487144.77983523585
486642.0197757682
486140.13145047653
485639.11259719526
485138.9609610766
484639.6742945626
484141.2503573593
483643.6869164112
483146.98174587206
482651.13262708165
482156.13734853815
481661.9937058747
481168.6995018279
480676.25254621997
480184.6506559271
479693.8916548553
479203.97337391716
478714.89365100226
478226.65033095627
477739.2412655534
477252.6643134737
476766.9173402728
476281.9982183643
475797.9048269875
475314.63505219034
474832.1867867991
474350.55793039483
473869.74638929206
473389.7500765127
472910.56691175816
472432.1948213913
471954.6317384108
471477.87560242356
471001.9243596253
470526.77596277447
470052.42837116827
469578.8795506219
469106.1274734419
468634.1701184048
468163.0054707309
467692.6315220667
467223.0462

328054.44110164786
327788.0019271816
327521.8902683844
327256.10549532284
326990.64697971416
326725.5140949189
326460.70621593745
326196.2227194048
325932.062983585
325668.2263883653
325404.7123152522
325141.5201473665
324878.64926943625
324616.09906779253
324353.86893036583
324091.95824668056
323830.36640784703
323569.09280656086
323308.13683709444
323047.4978952944
322787.17537857377
322527.1686859111
322267.47721784224
322008.1003764558
321749.0375653904
321490.2881898262
321231.8516564837
320973.727373616
320715.9147510079
320458.41319996514
320201.2221333146
319944.3409653973
319687.7691120643
319431.5059906727
319175.5510200787
318919.9036206349
318664.5632141841
318409.5292240565
318154.80107506335
317900.3781934932
317646.2600071063
317392.44594513124
317138.93543825933
316885.72791864275
316632.8228198834
316380.2195770358
316127.9176265986
315875.9164065107
315624.2153561476
315372.8139163141
315121.71152924595
314870.9076385964
314620.4016894408
314370.19312826585
314120.281

234890.81966785603
234725.41673539064
234560.1700405182
234395.07935745904
234230.14446088695
234065.3651259322
233900.74112817703
233736.27224365718
233571.95824885825
233407.7989207162
233243.79403661503
233079.94337438722
232916.24671231065
232752.7038291092
232589.31450394975
232426.07851644306
232262.99564664176
232100.0656750369
231937.2883825619
231774.66355058653
231612.19096091914
231449.87039580307
231287.70163791595
231125.68447037126
230963.81867671415
230802.104040921
230640.5403473984
230479.12738098358
230317.86492694134
230156.75277096292
229995.79069916694
229834.97849809678
229674.3159547183
229513.80285642177
229353.43899101863
229193.22414674071
229033.1581122393
228873.2406765851
228713.471629265
228553.85076018312
228394.3778596584
228235.05271842467
228075.87512762743
227916.8448788265
227757.96176399116
227599.22557550087
227440.63610614403
227282.1931491179
227123.89649802627
226965.7459468782
226807.74129008714
226649.88232247243
226492.16883925372
226334.6006

174591.5880507311
174478.11787471836
174364.7362216004
174251.4429888207
174138.23807398518
174025.12137486352
173912.09278938797
173799.1522156539
173686.29955191916
173573.53469660218
173460.85754828426
173348.26800570753
173235.7659677747
173123.35133354986
173011.0240022563
172898.78387327815
172786.63084615872
172674.56482060006
172562.5856964645
172450.6933737713
172338.88775269964
172227.1687335854
172115.53621692304
172003.9901033633
171892.5302937144
171781.15668894205
171669.86919016662
171558.66769866605
171447.55211587247
171336.5223433755
171225.5782829175
171114.719836397
171003.94690586638
170893.25939353253
170782.6572017563
170672.14023305115
170561.70839008436
170451.36157567578
170341.0996927985
170230.92264457728
170120.83033428818
170010.8226653601
169900.89954137226
169791.06086605586
169681.30654329175
169571.63647711248
169462.05057169896
169352.5487313831
169243.13086064585
169133.79686411793
169024.54664657803
168915.38011295378
168806.29716832147
168697.29771

418557.73748128855
418166.1544808453
417775.19341373385
417384.8527468979
416995.1309520819
416606.0265058097
416217.5378893714
415829.66358880326
415442.4020948741
415055.75190306647
414669.71151355916
414284.2794312143
413899.45416555664
413515.2342307603
413131.6181456309
412748.6044335875
412366.19162265037
411984.3782454236
411603.1628390735
411222.54394532304
410842.52011042414
410463.08988515113
410084.251824779
409706.00448907184
409328.3464422627
408951.2762530393
408574.7924945331
408198.8937442949
407823.5785842873
407448.84560086456
407074.69338475924
406701.12053106574
406328.1256392238
405955.70731300855
405583.864160508
405212.5947941135
404841.8978305012
404471.7718906206
404102.2155996746
403733.2275871102
403364.8064865982
402996.95093602256
402629.65957746346
402262.93105718447
401896.76402561547
401531.15713733877
401166.10905107565
400801.6184296741
400437.6839400854
400074.30425336154
399711.4780446327
399349.20399309625
398987.4807820023
398626.30709863745
398265

289019.6675201806
288803.31034572865
288587.1948385174
288371.3205645588
288155.6870909536
287940.29398588603
287725.14081862307
287510.2271595079
287295.55257995805
287081.11665246414
286866.918950583
286652.9590489364
286439.23652320716
286225.7509501364
286012.5019075212
285799.4889742086
285586.7117300945
285374.1697561208
285161.86263426987
284949.7899475641
284737.9512800602
284526.3462168483
284314.97434404853
284103.8352488043
283892.92851928505
283682.2537446782
283471.8105151883
283261.5984220345
283051.61705744406
282841.86601465504
282632.34488790645
282423.05327244103
282213.99076449854
282005.15696131374
281796.55146111554
281588.1738631193
281380.02376752894
281172.1007755289
280964.40448928607
280756.93451194343
280549.690447619
280342.67190140096
280135.87847934617
279929.3097884775
279722.9654367788
279516.8450331947
279310.94818762643
279105.2745109285
278899.8236149058
278694.5951123122
278489.5886168453
278284.8037431467
278080.2401067955
277875.8973243088
277671.7

212079.60204387133
211939.30809245168
211799.13574218313
211659.08482941435
211519.1551908085
211379.34666334203
211239.6590843066
211100.0922913036
210960.64612224686
210821.3204153623
210682.11500918472
210543.02974255703
210404.06445463301
210265.21898487152
210126.49317304007
209987.88685921257
209849.399883766
209711.0320873847
209572.78331105446
209434.65339606575
209296.64218401077
209158.74951678334
209020.97523657733
208883.31918588848
208745.78120750992
208608.3611445347
208471.0588403523
208333.87413865083
208196.80688341334
208059.85691892006
207923.0240897443
207786.30824075436
207649.70921711286
207513.226864273
207376.86102798156
207240.61155427634
207104.47828948553
206968.46108022606
206832.5597734055
206696.77421621935
206561.10425615046
206425.54974096792
206290.1105187288
206154.78643777358
206019.5773467296
205884.4830945071
205749.5035303003
205614.6385035848
205479.8878641203
205345.25146194614
205210.72914738383
205076.32077103294
204942.0261837735
204807.845236

In [91]:
for i in range(len(results)):
    rand_recall = random_results[i]
    trained_recall = results[i]
    print(f"rand vs traind: {rand_recall} vs {trained_recall}")

rand vs traind: 0.7694607157050783 vs 0.744087996985114
rand vs traind: 0.7714663143989432 vs 0.7497821222470852
rand vs traind: 0.773713298791019 vs 0.7441340261434862
rand vs traind: 0.7704357437695376 vs 0.7496018067276834
rand vs traind: 0.770646482451138 vs 0.7492011645246042


In [92]:
# obliczamy, w ilu probach wytrenowany model okazal sie lepszy od losowego
# przeprowadzamy test statystyczny - jak prawdopodobne jest to, by k pozytywnych prob bylo dzielem przypadku

def possibility_of_at_least_k_successes_in_n(k, n):
    p = 0.0
    # obliczamy kolejno prawdopodobienstwo k sukcesow, k+1 sukcesow, ...
    # przydadza Ci sie funkcje marh.comb() i math.pow()
    for i in range(k):
        
        math.comb(i, n) * math.pow(.5, n)
    
    return p

p = 0.05
metric = 'recall'

positive_tests_count = 0 # w ilu przypadkach okazalismy sie lepsi niz random?
for i in range(len(results)):
    if results[i] > random_results[i]:
        positive_tests_count += 1

if possibility_of_at_least_k_successes_in_n(positive_tests_count, n_tests) <= p:
    print('We are better than random!')
else:
    print('There is no evidence we are better')

We are better than random!
