Wczytanie danych

In [None]:
import pandas as pd

columns = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=columns)

print(ratings.head())


ratings = ratings.drop('timestamp', axis=1)

num_users = ratings['user_id'].nunique()
num_items = ratings['item_id'].nunique()
print(f"Liczba użytkowników: {num_users}, Liczba filmów: {num_items}")


   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596
Liczba użytkowników: 943, Liczba filmów: 1682


Tworzenie macierzy użytkownik-film:

In [3]:
user_item_matrix = ratings.pivot(index='user_id', columns='item_id', values='rating')

# Podgląd macierzy
print(user_item_matrix.head())

# Zapisanie macierzy do pliku CSV
user_item_matrix.to_csv('user_item_matrix.csv')


item_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                              ...   
1         5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2         4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   2.0  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         4.0   3.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

item_id  1673  1674  1675  1676  1677  1678  1679  1680  1681  1682  
user_id                                                              
1         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
5         NaN   NaN   NaN   NaN   NaN   NaN   N

Sprawdzenie danych:

In [4]:
print(user_item_matrix.isna().sum().sum())


1486126


Opcjonalna normlaizacja ocen

In [5]:
user_item_matrix_normalized = user_item_matrix.subtract(user_item_matrix.mean(axis=1), axis=0)


Przykład faktoryzacji macierzy

In [None]:
import numpy as np

# Ustawienia
num_users, num_items = user_item_matrix.shape
num_features = 10  # Liczba ukrytych wymiarów
learning_rate = 0.01
reg_param = 0.1  # Parametr regularyzacji
epochs = 100

# Inicjalizacja macierzy P i Q
P = np.random.normal(scale=1./num_features, size=(num_users, num_features))
Q = np.random.normal(scale=1./num_features, size=(num_items, num_features))

# Konwersja macierzy na NumPy
ratings = user_item_matrix.to_numpy()

# SGD
for epoch in range(epochs):
    for i in range(num_users):
        for j in range(num_items):
            if not np.isnan(ratings[i, j]):
                error = ratings[i, j] - np.dot(P[i, :], Q[j, :].T)
                for k in range(num_features):
                    P[i, k] += learning_rate * (error * Q[j, k] - reg_param * P[i, k])
                    Q[j, k] += learning_rate * (error * P[i, k] - reg_param * Q[j, k])

    # Obliczanie straty 
    loss = 0
    for i in range(num_users):
        for j in range(num_items):
            if not np.isnan(ratings[i, j]):
                loss += (ratings[i, j] - np.dot(P[i, :], Q[j, :].T))**2
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}")

# Wynikowa macierz
predicted_ratings = np.dot(P, Q.T)


Epoch 1/100, Loss: 495926.8170
Epoch 2/100, Loss: 121979.9677
Epoch 3/100, Loss: 99580.0068
Epoch 4/100, Loss: 94574.3085
Epoch 5/100, Loss: 92290.0073
Epoch 6/100, Loss: 90913.9078
Epoch 7/100, Loss: 89882.2622
Epoch 8/100, Loss: 88973.3422
Epoch 9/100, Loss: 88087.9097
Epoch 10/100, Loss: 87187.5830
Epoch 11/100, Loss: 86270.6823
Epoch 12/100, Loss: 85354.0916
Epoch 13/100, Loss: 84457.0800
Epoch 14/100, Loss: 83591.8038
Epoch 15/100, Loss: 82761.8642
Epoch 16/100, Loss: 81965.2196
Epoch 17/100, Loss: 81197.4979
Epoch 18/100, Loss: 80454.1103
Epoch 19/100, Loss: 79731.2259
Epoch 20/100, Loss: 79026.0898
Epoch 21/100, Loss: 78337.0452
Epoch 22/100, Loss: 77663.4341
Epoch 23/100, Loss: 77005.4398
Epoch 24/100, Loss: 76363.8906
Epoch 25/100, Loss: 75740.0408
Epoch 26/100, Loss: 75135.3458
Epoch 27/100, Loss: 74551.2557
Epoch 28/100, Loss: 73989.0445
Epoch 29/100, Loss: 73449.6917
Epoch 30/100, Loss: 72933.8173
Epoch 31/100, Loss: 72441.6688
Epoch 32/100, Loss: 71973.1464


Dodano kod testujący

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# Wczytanie danych
ratings_train = pd.read_csv('ml-100k/u1.base', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
ratings_test = pd.read_csv('ml-100k/u1.test', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

# Parametry macierzy
num_users = ratings_train['user_id'].nunique()
num_items = ratings_train['item_id'].nunique()
num_features = 10  # Liczba ukrytych wymiarów
learning_rate = 0.01
reg_param = 0.1  # Parametr regularyzacji
epochs = 100

# Tworzenie macierzy użytkownik-film
train_matrix = ratings_train.pivot(index='user_id', columns='item_id', values='rating')
train_matrix = train_matrix.to_numpy()

# Inicjalizacja macierzy P i Q
P = np.random.normal(scale=1./num_features, size=(num_users, num_features))
Q = np.random.normal(scale=1./num_features, size=(num_items, num_features))

# Faktoryzacja macierzy (SGD)
for epoch in range(epochs):
    for i in range(num_users):
        for j in range(num_items):
            if not np.isnan(train_matrix[i, j]):  # Pomijamy brakujące oceny
                error = train_matrix[i, j] - np.dot(P[i, :], Q[j, :].T)
                for k in range(num_features):
                    P[i, k] += learning_rate * (error * Q[j, k] - reg_param * P[i, k])
                    Q[j, k] += learning_rate * (error * P[i, k] - reg_param * Q[j, k])

# Wynikowa macierz
predicted_ratings = np.dot(P, Q.T)

# Sprawdzanie jakości na zbiorze testowym
test_ratings = []
predicted_test_ratings = []

for _, row in ratings_test.iterrows():
    user = row['user_id'] - 1  # Indeksy w zbiorze zaczynają się od 1
    item = row['item_id'] - 1
    true_rating = row['rating']
    predicted_rating = predicted_ratings[user, item]
    
    test_ratings.append(true_rating)
    predicted_test_ratings.append(predicted_rating)

# Obliczanie błędu MSE
mse = mean_squared_error(test_ratings, predicted_test_ratings)
print(f"Mean Squared Error (MSE) na zbiorze testowym: {mse:.4f}")
