In [3]:



from tqdm import tqdm
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [4]:
RATINGS_DATA_FILE = 'ratings.csv'
MOVIES_DATA_FILE = 'movies.csv'

ratings_data = pd.read_csv(RATINGS_DATA_FILE)
movies_data = pd.read_csv(MOVIES_DATA_FILE)

print("Movies Data:")
print(movies_data.head())

print("\nRatings Data:")
print(ratings_data.head())


Movies Data:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings Data:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [5]:
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy<2.0"])

# 1 Pripravi Surprise dataset za filme
from surprise import Dataset
from surprise import Reader
 
# Get minimum and maximum rating from the dataset
min_rating = ratings_data.rating.min()
max_rating = ratings_data.rating.max()
 
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(ratings_data[['userId', 'movieId', 'rating']], reader)


In [6]:
from surprise import KNNBasic

# Retrieve the trainset.
trainset = data.build_full_trainset()

# Build an algorithm, and train it.
algo = KNNBasic()
algo.fit(trainset)


Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x2509b7200b0>

In [12]:
uid = 610  # raw user id (as in the ratings file). 
iid = 168252  # raw item id (as in the ratings file). 

# 
pred = algo.predict(uid, iid, r_ui=4, verbose=True)
# Test predictions for 5 different users
user_ids = [1, 2, 3, 4, 5]
item_id = 168252  # same item for simplicity

for user_id in user_ids:
    print("ijdnidjn")
    pred = algo.predict(user_id, item_id, verbose=True)


user: 610        item: 168252     r_ui = 4.00   est = 4.31   {'actual_k': 16, 'was_impossible': False}
ijdnidjn
user: 1          item: 168252     r_ui = None   est = 4.33   {'actual_k': 16, 'was_impossible': False}
ijdnidjn
user: 2          item: 168252     r_ui = None   est = 4.28   {'actual_k': 15, 'was_impossible': False}
ijdnidjn
user: 3          item: 168252     r_ui = None   est = 4.42   {'actual_k': 8, 'was_impossible': False}
ijdnidjn
user: 4          item: 168252     r_ui = None   est = 4.31   {'actual_k': 14, 'was_impossible': False}
ijdnidjn
user: 5          item: 168252     r_ui = None   est = 4.11   {'actual_k': 16, 'was_impossible': False}


In [13]:
from surprise import KNNBasic, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

# Load the data
ratings_data = pd.read_csv('ratings.csv')
reader = Reader(rating_scale=(ratings_data.rating.min(), ratings_data.rating.max()))
data = Dataset.load_from_df(ratings_data[['userId', 'movieId', 'rating']], reader)

# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Test different values of k
for k in [5, 10, 20, 40, 80]:
    algo = KNNBasic(k=k)
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=True)
    print(f'k={k}, RMSE={rmse}')

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9554
k=5, RMSE=0.9553642194106577
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9395
k=10, RMSE=0.939491120156738
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9394
k=20, RMSE=0.9394275793949977
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9464
k=40, RMSE=0.946364824513565
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9526
k=80, RMSE=0.9526189458351387


In [11]:
from surprise.model_selection import KFold
from surprise import accuracy, Dataset, SVD

kf = KFold(n_splits=3)

algo = KNNBasic()

for trainset, testset in kf.split(data):

    # učenje
    algo.fit(trainset)
    # napoved
    predictions = algo.test(testset)

    # ocena natančnosti
    accuracy.rmse(predictions, verbose=True)


Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9620
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9544
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9568


In [14]:
from surprise import KNNBasic, Dataset, Reader, accuracy
from surprise.model_selection import KFold
import pandas as pd

# Load the data
ratings_data = pd.read_csv('ratings.csv')
reader = Reader(rating_scale=(ratings_data.rating.min(), ratings_data.rating.max()))
data = Dataset.load_from_df(ratings_data[['userId', 'movieId', 'rating']], reader)

kf = KFold(n_splits=3)

# Test different values of k
for k in [5, 10, 20, 40, 80]:
    algo = KNNBasic(k=k)
    
    rmses = []
    all_predictions = []
    
    for trainset, testset in kf.split(data):
        # Train the algorithm
        algo.fit(trainset)
        # Make predictions
        predictions = algo.test(testset)
        all_predictions.extend(predictions)
        # Evaluate accuracy
        rmse = accuracy.rmse(predictions, verbose=True)
        rmses.append(rmse)
    
    avg_rmse = sum(rmses) / len(rmses)
    print(f'k={k}, Average RMSE={avg_rmse}')
    
    # Print predictions object details
    print(f'Number of predictions: {len(all_predictions)}')
    print(f'First prediction: {all_predictions[0]}')

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9685
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9696
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9743
k=5, Average RMSE=0.9708044069856964
Number of predictions: 100836
First prediction: user: 387        item: 1285       r_ui = 3.00   est = 3.27   {'actual_k': 5, 'was_impossible': False}
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9514
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9489
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9592
k=10, Average RMSE=0.953175719421179
Number of predictions: 100836
First prediction: user: 192        item: 225        r_ui = 4.00   est = 3.73   {'actual_k': 10, 'was_impossible': False}
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9578
Computing the msd simil

KeyboardInterrupt: 

In [9]:
# Testiraj natancnost algoritma s krizno validacijo

from surprise import SVD, KNNBasic, accuracy
from surprise.model_selection import cross_validate
 
# Izberi algoritem in parametre
algoritem = SVD(n_epochs=10)
# Izvedi križno validacijo (učenje in testiranje)
results = cross_validate(algoritem, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)



Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8773  0.8614  0.8840  0.8753  0.8690  0.8852  0.8887  0.8740  0.8839  0.8833  0.8782  0.0080  
MAE (testset)     0.6723  0.6676  0.6813  0.6754  0.6695  0.6816  0.6829  0.6731  0.6800  0.6797  0.6763  0.0052  
Fit time          0.59    0.67    0.57    0.60    0.61    0.60    0.66    0.60    0.64    0.63    0.62    0.03    
Test time         0.17    0.03    0.05    0.09    0.05    0.05    0.05    0.12    0.05    0.03    0.07    0.04    


In [16]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate
import pandas as pd
import numpy as np

# Naloži podatke
ratings_data = pd.read_csv('ratings.csv')
reader = Reader(rating_scale=(ratings_data.rating.min(), ratings_data.rating.max()))
data = Dataset.load_from_df(ratings_data[['userId', 'movieId', 'rating']], reader)

# Definiraj SVD algoritma z določenimi parametri
algo = SVD(n_factors=50, n_epochs=20, random_state=42)

# Izvedi križno validacijo
results = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, return_train_measures=True)

# Convert lists to numpy arrays
results['fit_time'] = np.array(results['fit_time'])
results['test_time'] = np.array(results['test_time'])

# Izračunaj povprečne vrednosti
avg_rmse = results['test_rmse'].mean()
std_rmse = results['test_rmse'].std()
avg_mae = results['test_mae'].mean()
std_mae = results['test_mae'].std()
avg_fit_time = results['fit_time'].mean()
std_fit_time = results['fit_time'].std()
avg_test_time = results['test_time'].mean()
std_test_time = results['test_time'].std()

# Prikaz rezultatov
print(f"Evaluating RMSE, MAE of algorithm SVD on 10 split(s).\n")
print(f"{'':25}Fold 1    Fold 2    Fold 3    Fold 4    Fold 5    Fold 6    Fold 7    Fold 8    Fold 9    Fold 10   Mean     Std")
print(f"RMSE (testset)    {results['test_rmse'].round(4).tolist()}  {avg_rmse:.4f}  {std_rmse:.4f}")
print(f"MAE (testset)     {results['test_mae'].round(4).tolist()}  {avg_mae:.4f}  {std_mae:.4f}")
print(f"Fit time          {results['fit_time'].round(2).tolist()}   {avg_fit_time:.2f}    {std_fit_time:.2f}")
print(f"Test time         {results['test_time'].round(2).tolist()}    {avg_test_time:.2f}    {std_test_time:.2f}")

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                         Fold 1    Fold 2    Fold 3    Fold 4    Fold 5    Fold 6    Fold 7    Fold 8    Fold 9    Fold 10   Mean     Std
RMSE (testset)    [0.8634, 0.8774, 0.8616, 0.8612, 0.8645, 0.8676, 0.8649, 0.8603, 0.8737, 0.8576]  0.8652  0.0058
MAE (testset)     [0.6626, 0.6729, 0.664, 0.6616, 0.6693, 0.6684, 0.6628, 0.6588, 0.6701, 0.6588]  0.6649  0.0047
Fit time          [0.98, 1.03, 0.9, 0.97, 0.75, 0.77, 0.75, 0.77, 0.77, 0.8]   0.85    0.10
Test time         [0.05, 0.06, 0.06, 0.05, 0.06, 0.03, 0.03, 0.03, 0.05, 0.08]    0.05    0.01
