In [1]:
import pandas as pd
import numpy as np
from typing import Union

In [4]:
# Toy interaction log: one (user_id, item_id, rating, timestamp) tuple per
# rating event, listed in the order the rows appear in the frame.
records = [
    (0, 2994, 8, "2000-11-30 23:49:23"),
    (0, 3002, 2, "2000-11-30 23:52:33"),
    (1, 2994, 7, "2000-11-30 23:57:00"),
    (2, 1488, 3, "2000-12-01 10:15:45"),
    (3, 3002, 2, "2000-12-02 14:22:10"),
    (4, 3002, 5, "2000-12-03 18:30:55"),
    (0, 1488, 3, "2000-12-04 18:30:55"),
]

# Transpose the row-wise records into one list per column.
user_col, item_col, rating_col, stamp_col = map(list, zip(*records))

data = {
    "user_id": user_col,
    "item_id": item_col,
    "rating": rating_col,
    # Timestamps are parsed from strings into pandas datetimes.
    "datetime": pd.to_datetime(stamp_col),
}

df = pd.DataFrame(data)
df.head(10)

Unnamed: 0,user_id,item_id,rating,datetime
0,0,2994,8,2000-11-30 23:49:23
1,0,3002,2,2000-11-30 23:52:33
2,1,2994,7,2000-11-30 23:57:00
3,2,1488,3,2000-12-01 10:15:45
4,3,3002,2,2000-12-02 14:22:10
5,4,3002,5,2000-12-03 18:30:55
6,0,1488,3,2000-12-04 18:30:55


In [5]:
# Re-encode item ids as dense 0-based codes (numbered in order of first
# appearance). We do this because the size of a sparse coo_array is determined
# by the range of indices that occur in its coordinates (user_id and item_id).
#
# The unique labels are kept in `item_labels` rather than discarded into `_`:
#   * item_labels[code] maps an encoded id back to the original item id;
#   * binding `_` in a notebook shadows IPython's "last output" variable.
#
# NOTE: df["item_id"] is overwritten in place, so re-running this cell without
# first re-running the cell that builds `df` factorizes the already-encoded
# codes and `item_labels` then no longer holds the original ids.
item_codes, item_labels = pd.factorize(df["item_id"])
df["item_id"] = item_codes

# Work on a copy so later steps do not touch `df`.
df_train = df.copy()
df_train.head(6)

Unnamed: 0,user_id,item_id,rating,datetime
0,0,0,8,2000-11-30 23:49:23
1,0,1,2,2000-11-30 23:52:33
2,1,0,7,2000-11-30 23:57:00
3,2,2,3,2000-12-01 10:15:45
4,3,1,2,2000-12-02 14:22:10
5,4,1,5,2000-12-03 18:30:55


In [6]:
# Reshape the long-format ratings into a user x item table; user/item pairs
# that have no rating become 0.
df_matrix = (
    df_train
    .pivot(index='user_id', columns='item_id', values='rating')
    .fillna(0)
)
df_matrix

item_id,0,1,2
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8.0,2.0,3.0
1,7.0,0.0,0.0
2,0.0,0.0,3.0
3,0.0,2.0,0.0
4,0.0,5.0,0.0


In [7]:
# Dense rating matrix together with the row/column labels that index it.
np_matrix = df_matrix.to_numpy()
user_ids, item_ids = (axis.to_numpy() for axis in (df_matrix.index, df_matrix.columns))

# `interactions` is another name for the same array object, not a copy.
interactions = np_matrix

print(f"{np_matrix}")
print(f"\n{user_ids}")
print(f"{item_ids}")

interactions

[[8. 2. 3.]
 [7. 0. 0.]
 [0. 0. 3.]
 [0. 2. 0.]
 [0. 5. 0.]]

[0 1 2 3 4]
[0 1 2]


array([[8., 2., 3.],
       [7., 0., 0.],
       [0., 0., 3.],
       [0., 2., 0.],
       [0., 5., 0.]])

In [8]:
from scipy.sparse import coo_array

# Build the user x item matrix in sparse COO form from (values, (rows, cols)).
# No explicit shape is passed, so it is derived from the coordinates.
ratings = df_train["rating"]
row_idx = df_train["user_id"]
col_idx = df_train["item_id"]
coo_interactions = coo_array((ratings, (row_idx, col_idx)))

print(coo_interactions)
coo_interactions.toarray()

<COOrdinate sparse array of dtype 'int64'
	with 7 stored elements and shape (5, 3)>
  Coords	Values
  (0, 0)	8
  (0, 1)	2
  (1, 0)	7
  (2, 2)	3
  (3, 1)	2
  (4, 1)	5
  (0, 2)	3


array([[8, 2, 3],
       [7, 0, 0],
       [0, 0, 3],
       [0, 2, 0],
       [0, 5, 0]])

In [9]:
# Smaller dimension of the dense and of the sparse matrix, shown side by side
# (presumably to choose a valid rank k for the decompositions below -- confirm).
tuple(min(matrix.shape) for matrix in (interactions, coo_interactions))

(3, 3)

In [10]:
# Every row of df_train must show up as exactly one non-zero cell of the
# dense matrix.
n_nonzero = np.count_nonzero(interactions)
assert n_nonzero == len(df_train)

# Spot checks of individual cells, left disabled: their indices lie outside
# the small matrix built in this notebook.
#assert interactions[0, 2994] == 8
#assert interactions[2369, 1203] == 5
#assert interactions[1557, 459] == 3

# The sparse and the dense construction must describe the same matrix.
sparse_as_dense = coo_interactions.toarray()
assert np.allclose(sparse_as_dense, interactions)

In [11]:
from scipy.sparse.linalg import svds

# Rank-2 truncated SVD of the dense interaction matrix.
# The third factor is used as the right-hand factor of U @ diag(S) @ V,
# i.e. it is already in "transposed" (k x n_items) layout.
U, S, V = svds(interactions, k=2)

# Same headers and order as before: U, then S, then V.
for header, factor in (("U:\n", U), ("\nS:\n", S), ("\nV:\n", V)):
    print(header, factor)

U:
 [[-0.0903193  -0.78834916]
 [ 0.26640794 -0.60515765]
 [-0.03868151 -0.06281547]
 [-0.35610401 -0.03392494]
 [-0.89026002 -0.08481236]]

S:
 [ 5.47855156 11.04319482]

V:
 [[ 0.20850424 -0.97546709 -0.07063955]
 [-0.95469626 -0.18731989 -0.23122782]]


In [12]:
# Rebuild the rank-2 approximation of the rating matrix from its factors.
S_diag = np.diag(S)  # singular values placed on the diagonal of a square matrix

# Same left-to-right product as U @ S_diag @ V.
rec = U.dot(S_diag).dot(V)
rec

array([[ 8.20831199,  2.11346656,  2.04799854],
       [ 6.6844328 , -0.17188796,  1.44216584],
       [ 0.61807107,  0.33666039,  0.17536876],
       [-0.04911085,  1.97324955,  0.22444026],
       [-0.12277712,  4.93312389,  0.56110065]])

In [None]:
from scipy.sparse.linalg import svds

# Rank-2 truncated SVD of the sparse interaction matrix.
#
# svds only accepts floating-point (or complex) input and raises ValueError
# otherwise. coo_interactions is built from the integer rating column (its
# repr reports dtype 'int64'), so it is cast to float here. astype returns a
# new array; coo_interactions itself keeps its integer dtype.
#
# NOTE: this rebinds U, S and V, replacing the factors previously computed
# from the dense matrix.
U, S, V = svds(coo_interactions.astype(float), k=2)

print("U:\n", U)
print("\nS:\n", S)
print("\nV:\n", V)