# 協調フィルタリング

## アイテムベース

In [1]:
import numpy as np

In [2]:
# Sizes used by the notebook: `user` and `item` give the row and column
# counts of the rating matrix A. `rating` is not referenced by any visible
# cell (presumably the total number of ratings in ml-100k -- confirm).
user, item, rating = 943, 1682, 100000

In [3]:
# Dense (user x item) rating matrix; cells that never receive a rating
# stay at 0.
A = np.zeros((user, item))
with open('ml-100k/u1.base') as f:
    for record in f:
        # Each line must hold exactly four whitespace-separated integers;
        # the fourth one is parsed but not used.
        uid, iid, rat, _ = (int(token) for token in record.split())
        # Ids in the file are 1-based, matrix indices are 0-based.
        A[uid - 1, iid - 1] = rat

In [17]:
def cos_sim_matrix(matrix):
    """Return the cosine similarity between every pair of columns.

    Parameters
    ----------
    matrix : 2-D array
        Each column is treated as one vector.

    Returns
    -------
    2-D array of shape (n_columns, n_columns)
        Entry (i, j) is the cosine of the angle between column i and
        column j. All-zero columns are divided by 1 instead of 0, so
        their row and column in the result are 0.
    """
    col_lengths = np.linalg.norm(matrix, axis=0)
    # Swap zero lengths for 1 so the division below never divides by zero.
    safe_lengths = np.where(col_lengths == 0, 1., col_lengths)
    unit_cols = matrix / safe_lengths[np.newaxis, :]
    return unit_cols.T @ unit_cols

In [36]:
# Item-item similarities computed from every row of A except the first,
# so the target user's own ratings do not influence them.
sim = cos_sim_matrix(A[1:])
target = A[0]
# Numerator: similarity-weighted sum of the target user's ratings.
weighted_sum = sim @ target
# Denominator: sum of the similarities to the items the target user rated
# (clip turns every non-zero rating into 1).
weight_total = sim @ target.clip(0, 1)
# Predicted score per item; stays 0 where the denominator is 0.
rec = np.zeros_like(weighted_sum)
np.divide(weighted_sum, weight_total, out=rec, where=weight_total != 0)
# Print rank, 1-based item id and score for the 100 best-scored items.
ranking = np.argsort(rec)[::-1]
for rank, idx in enumerate(ranking[:100], start=1):
    print(rank, idx + 1, rec[idx])

1 1342 5.0
2 1414 4.7375698473
3 1259 4.50482970357
4 1347 4.40762231326
5 1654 4.35524352723
6 1618 4.33608325801
7 1354 4.33489193554
8 1500 4.33387778256
9 1332 4.30638160291
10 1323 4.27486096095
11 1674 4.27450574479
12 1461 4.26506102176
13 1460 4.26506102176
14 1447 4.26506102176
15 1453 4.26506102176
16 1450 4.26506102176
17 1452 4.26506102176
18 868 4.23698780003
19 1298 4.23020209238
20 1656 4.20580323717
21 1234 4.19659344325
22 1369 4.1962851263
23 1360 4.19530057309
24 1593 4.19530057309
25 1431 4.18190901886
26 1512 4.16341474966
27 1398 4.15389742181
28 1485 4.15238582414
29 1317 4.14439418258
30 863 4.14191809025
31 1064 4.13461393482
32 119 4.12759189112
33 1448 4.12741749771
34 1080 4.12588161744
35 1333 4.1211778757
36 1678 4.12057032724
37 1679 4.12057032724
38 1680 4.12057032724
39 1596 4.12012525304
40 1595 4.12012525304
41 1604 4.11890383504
42 1068 4.11805961132
43 1377 4.11568688705
44 1251 4.11426729365
45 1171 4.11237396777
46 1639 4.11157238322
47 19 4.11048

In [37]:
# For every test-split rating given by user 1, print the item id, the
# actual rating and the predicted score stored in `rec`.
with open('ml-100k/u1.test') as f:
    for line in f:
        uid, iid, rat, _ = (int(token) for token in line.split())
        if uid != 1:
            continue
        print(iid, rat, rec[iid - 1])

6 5 4.04806108467
10 3 3.97869006185
12 5 3.89051270562
14 5 4.0269755961
17 3 3.78029738214
20 4 4.02790210664
23 4 3.92660845381
24 3 3.84025363171
27 2 3.77854135478
31 3 3.82722948678
33 4 3.7579220913
36 2 3.65985729249
39 4 3.82572884806
44 5 3.80681917392
47 4 3.86954050125
49 3 3.71678951399
51 4 3.68894370594
53 3 3.80597935803
54 3 3.72993782233
56 4 3.87324438493
60 5 4.04338785378
61 4 3.98579314889
62 3 3.69525616857
64 5 3.87876153793
65 4 3.84806623481
67 3 3.65051705491
69 3 3.82737908327
70 3 3.87706590151
72 4 3.71289593465
73 3 3.70469600334
74 1 3.7535440679
76 4 3.80205826271
78 1 3.36128332579
80 4 3.64966836179
81 5 3.87584427512
82 5 3.75904472109
84 4 3.509974806
85 3 3.71906427918
86 5 3.92172739393
90 4 3.70457584224
91 5 3.76645420962
92 3 3.85594352525
96 5 3.83294946303
97 3 3.85607272763
98 4 3.88501459463
100 5 3.92793112193
102 2 3.59432807454
103 1 3.74833516507
104 1 3.7178591339
107 4 3.75854370469
108 5 3.85535619902
112 1 3.34378722071
113 5 4.0519

In [5]:
def cos_sim_matrix2(matrix):
    """Return column-wise cosine similarities using only co-rated rows.

    For each pair of columns (i, j), only the rows in which BOTH columns
    are non-zero are used, and the cosine similarity is computed on those
    sub-vectors. A zero entry is therefore treated as "no value" rather
    than as the number 0.

    Parameters
    ----------
    matrix : 2-D numpy array
        Each column is one item vector.

    Returns
    -------
    2-D array of shape (n_columns, n_columns)
        Symmetric similarity matrix. Pairs with no common non-zero row
        (including the diagonal entry of an all-zero column) are 0.
    """
    n_items = matrix.shape[1]
    sim = np.zeros((n_items, n_items))
    # Boolean mask of non-zero entries, computed once instead of inside the
    # double loop. This replaces `.astype(np.bool)`: the `np.bool` alias
    # was removed from NumPy, so that call raises AttributeError on
    # current versions.
    rated = matrix != 0
    for i in range(n_items):
        # The matrix is symmetric, so only the upper triangle is computed.
        for j in range(i, n_items):
            # Similarity between item i and item j, restricted to the rows
            # where both have a non-zero entry.
            common = rated[:, i] & rated[:, j]
            if common.any():
                x1 = matrix[common, i]
                x2 = matrix[common, j]
                # Every selected entry is non-zero, so neither norm is 0.
                sim[i, j] = x1 @ x2 / np.linalg.norm(x1) / np.linalg.norm(x2)
                sim[j, i] = sim[i, j]
    return sim

In [38]:
# Same prediction as before, but with similarities that only use co-rated
# rows; the first row of A (the target user) is left out of the
# similarity computation.
sim = cos_sim_matrix2(A[1:])
own_ratings = A[0]
# 1 where the target user rated the item, 0 elsewhere.
has_rating = own_ratings.clip(0, 1)
numerator = sim @ own_ratings
denominator = sim @ has_rating
# Similarity-weighted mean rating per item; stays 0 where the denominator is 0.
rec = np.divide(
    numerator,
    denominator,
    out=np.zeros_like(numerator),
    where=denominator != 0,
)
# Print rank, 1-based item id and score for the 100 best-scored items.
best_first = np.argsort(rec)[::-1][:100]
for position, item_idx in enumerate(best_first):
    print(position + 1, item_idx + 1, rec[item_idx])

1 1342 5.0
2 1414 4.75
3 1259 4.53324167711
4 1354 4.5
5 1654 4.4375
6 1500 4.4
7 1618 4.4
8 1332 4.35294117647
9 1234 4.34809488936
10 1347 4.32757481903
11 868 4.25594098644
12 1680 4.25
13 1679 4.25
14 1678 4.25
15 1674 4.25
16 1450 4.23529411765
17 1447 4.23529411765
18 1452 4.23529411765
19 1460 4.23529411765
20 1461 4.23529411765
21 1453 4.23529411765
22 1593 4.20689655172
23 1360 4.20689655172
24 1463 4.17785805027
25 1323 4.17129535344
26 1398 4.17073170732
27 1064 4.15583248693
28 1656 4.15568387751
29 1431 4.1541680767
30 1374 4.15
31 1604 4.14634146341
32 1601 4.14285714286
33 1677 4.14285714286
34 1333 4.12273499004
35 1367 4.12262349755
36 1596 4.12121212121
37 1595 4.12121212121
38 119 4.11094025778
39 1295 4.1099402387
40 1316 4.09702485018
41 1513 4.09341004681
42 1212 4.0928463552
43 1294 4.09240778128
44 1673 4.08695652174
45 314 4.07869168991
46 1318 4.07692307692
47 1485 4.07273827384
48 1369 4.07243027551
49 1597 4.07238393353
50 1377 4.0671130087
51 1362 4.0625
52

In [39]:
# Print item id, actual rating and predicted score (from `rec`) for every
# test-split rating that belongs to user 1.
with open('ml-100k/u1.test') as f:
    # Lazily parse each line into four integers; the fourth is unused.
    parsed_rows = (tuple(map(int, line.split())) for line in f)
    for uid, iid, rat, _ in parsed_rows:
        if uid == 1:
            print(iid, rat, rec[iid - 1])

6 5 3.79992043832
10 3 3.75073172707
12 5 3.68581050353
14 5 3.74114126
17 3 3.6672182086
20 4 3.76296718728
23 4 3.70399174088
24 3 3.68097524888
27 2 3.67926680511
31 3 3.6790283447
33 4 3.69709236717
36 2 3.73697818812
39 4 3.68210574625
44 5 3.72555011111
47 4 3.75056206367
49 3 3.70470721249
51 4 3.68589146224
53 3 3.69435029204
54 3 3.65662219366
56 4 3.70159431657
60 5 3.71949443312
61 4 3.74013991361
62 3 3.70355380962
64 5 3.69078318576
65 4 3.71532199023
67 3 3.66842203311
69 3 3.6911646995
70 3 3.70440801241
72 4 3.68208338668
73 3 3.68585899408
74 1 3.81324786741
76 4 3.72725538873
78 1 3.70627902867
80 4 3.70220115705
81 5 3.72587910305
82 5 3.68839747284
84 4 3.67881875786
85 3 3.6723078058
86 5 3.72352688307
90 4 3.70613513268
91 5 3.6929548751
92 3 3.69888827562
96 5 3.68216315112
97 3 3.69291624772
98 4 3.69478572271
100 5 3.70677045569
102 2 3.7535797184
103 1 3.74031405222
104 1 3.60816927814
107 4 3.75226390559
108 5 3.72084225212
112 1 3.6758053557
113 5 3.92419843