In [1]:
import numpy as np
import pandas as pd
from math import sqrt

In [2]:
df = pd.DataFrame(
    {'a':[4,0,2],
    'b':[5,3,0],
    'c':[0,4,1],
    'd':[5,3,3],
    'e':[1,1,0],
    'f':[0,2,4],
    'g':[3,1,5],
    'h':[2,0,3]},
  columns =['a','b','c','d','e','f','g','h'])
df

Unnamed: 0,a,b,c,d,e,f,g,h
0,4,5,0,5,1,0,3,2
1,0,3,4,3,1,2,1,0
2,2,0,1,3,0,4,5,3


In [3]:
# 将上述矩阵看成布尔矩阵，计算每对用户之间的Jaccard距离
data1 = df.copy()
data1[ data1 > 0 ] = 1
data1

Unnamed: 0,a,b,c,d,e,f,g,h
0,1,1,0,1,1,0,1,1
1,0,1,1,1,1,1,1,0
2,1,0,1,1,0,1,1,1


In [4]:
def calc_jaccard(a_index, b_index,data):
    return sum(data.loc[a_index]&data.loc[b_index])/sum(data.loc[a_index]|data.loc[b_index])
jac1 = calc_jaccard(0,1,data1)
print("Jaccard distance A - B: ",jac1)
jac2 = calc_jaccard(2,1,data1)
print("Jaccard distance B - C: ",jac2)
jac3 = calc_jaccard(0,2,data1)
print("Jaccard distance A - C: ",jac3)


Jaccard distance A - B:  0.5
Jaccard distance B - C:  0.5
Jaccard distance A - C:  0.5


In [5]:
# 将上述矩阵看成布尔矩阵，计算每对用户之间的余弦距离
data2 = df.copy()
data2[ data2 > 0 ] = 1
data2

Unnamed: 0,a,b,c,d,e,f,g,h
0,1,1,0,1,1,0,1,1
1,0,1,1,1,1,1,1,0
2,1,0,1,1,0,1,1,1


In [6]:
def calc_cos(a_index, b_index,data):
    return sum(data.loc[a_index]*data.loc[b_index])/(sqrt(sum(data.loc[a_index]))*sqrt(sum(data.loc[b_index])))
cos1 = calc_cos(0,1,data2)
print("Cosine distance A - B: ",cos1)
cos2 = calc_cos(2,1,data2)
print("Cosine distance B - C: ",cos2)
cos3 = calc_cos(0,2,data2)
print("Cosine distance A - C: ",cos3)

Cosine distance A - B:  0.6666666666666667
Cosine distance B - C:  0.6666666666666667
Cosine distance A - C:  0.6666666666666667


In [7]:
# 将评分3-5看成1，1-2和空白看成0。计算每对用户之间的Jaccard距离
data3 = df.copy()
data3[ data3 <= 2 ] = 0
data3[ data3 >= 3 ] = 1
data3

Unnamed: 0,a,b,c,d,e,f,g,h
0,1,1,0,1,0,0,1,0
1,0,1,1,1,0,0,0,0
2,0,0,0,1,0,1,1,1


In [8]:
jac1 = calc_jaccard(0,1,data3)
print("Jaccard distance A - B: ",jac1)
jac2 = calc_jaccard(2,1,data3)
print("Jaccard distance B - C: ",jac2)
jac3 = calc_jaccard(0,2,data3)
print("Jaccard distance A - C: ",jac3)

Jaccard distance A - B:  0.4
Jaccard distance B - C:  0.16666666666666666
Jaccard distance A - C:  0.3333333333333333


In [9]:
#一样对矩阵进行处理，计算每对用户之间的余弦距离
data4 = df.copy()
data4[ data4 <= 2 ] = 0
data4[ data4 >= 3 ] = 1
data4

Unnamed: 0,a,b,c,d,e,f,g,h
0,1,1,0,1,0,0,1,0
1,0,1,1,1,0,0,0,0
2,0,0,0,1,0,1,1,1


In [10]:
cos1 = calc_cos(0,1,data4)
print("Cosine distance A - B: ",cos1)
cos2 = calc_cos(2,1,data4)
print("Cosine distance B - C: ",cos2)
cos3 = calc_cos(0,2,data4)
print("Cosine distance A - C: ",cos3)

Cosine distance A - B:  0.5773502691896258
Cosine distance B - C:  0.2886751345948129
Cosine distance A - C:  0.5


In [11]:
# 通过减去用户非空评分的平均值对矩阵进行归一化
data5 = df.copy()
data5[ data5 == 0 ] = np.nan
df_norm = (data5 - data5.min()) / (data5.max() - data5.min())
df_norm=df_norm.fillna(0)
df_norm

Unnamed: 0,a,b,c,d,e,f,g,h
0,1.0,1.0,0.0,1.0,0.0,0.0,0.5,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


In [12]:
cos1 = calc_cos(0,1,df_norm)
print("Cosine distance A - B: ",cos1)
cos2 = calc_cos(2,1,df_norm)
print("Cosine distance B - C: ",cos2)
cos3 = calc_cos(0,2,df_norm)
print("Cosine distance A - C: ",cos3)

Cosine distance A - B:  0.0
Cosine distance B - C:  0.0
Cosine distance A - C:  0.15430334996209194


In [13]:
# 对上述矩阵的项进行聚类处理
# 将8个项进行层次聚类，得到4个簇。聚类时采用下列做法。将评分3、4、5换成1，评分1、2、空白换成0。
# 使用Jaccard距离来计算上述处理结果列向量之间的距离。
# 对于不止一个元素的簇，簇之间的距离定义为两个簇中的元素之间的最短距离。
df2 = df.copy()
df2 [ df2 <= 2 ] = 0
df2 [ df2 >= 3 ] = 1
df2

Unnamed: 0,a,b,c,d,e,f,g,h
0,1,1,0,1,0,0,1,0
1,0,1,1,1,0,0,0,0
2,0,0,0,1,0,1,1,1


In [14]:
def calc_jaccard_t(a, b):
    up = (a[0]&b[0])+(a[1]&b[1])+(a[2]&b[2])
    down = (a[0]|b[0])+(a[1]|b[1])+(a[2]|b[2])
    if down==0:
        return np.nan
    return up/down

x = []
for i in df2:
    x.append(tuple(df2[i]))
    
y=np.zeros((len(x),len(x)))
for i in range(1,len(x)):
    for j in range(i+1,len(x)):
        tmp = calc_jaccard_t(x[i],x[j])
        y[i][j]=tmp
        y[j][i]=tmp
y

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.5       , 0.66666667, 0.        ,
        0.        , 0.33333333, 0.        ],
       [0.        , 0.5       , 0.        , 0.33333333, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.66666667, 0.33333333, 0.        , 0.        ,
        0.33333333, 0.66666667, 0.33333333],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.33333333, 0.        ,
        0.        , 0.5       , 1.        ],
       [0.        , 0.33333333, 0.        , 0.66666667, 0.        ,
        0.5       , 0.        , 0.5       ],
       [0.        , 0.        , 0.        , 0.33333333, 0.        ,
        1.        , 0.5       , 0.        ]])

In [15]:
from sklearn.cluster import AgglomerativeClustering

In [16]:
clf = AgglomerativeClustering(n_clusters=4, affinity='precomputed', linkage='average')
labels = clf.fit_predict(y)
labels

array([0, 0, 1, 3, 0, 1, 2, 0])

In [17]:
stat = {}
for i in range(len(labels)):
#     print(i,labels[i],list(x[i]))
#     print(stat)
    if labels[i] not in stat:
        stat[labels[i]]={"sum":list(x[i]),"ct":1}
    else:
        tmp = list(x[i])
        c=[]
        for p,q in zip(stat[labels[i]]["sum"],tmp):  
            summ=p+q  
            c.append(summ)  
        stat[labels[i]]["sum"]=c
        stat[labels[i]]["ct"]+=1
stat

{0: {'ct': 4, 'sum': [2, 1, 1]},
 1: {'ct': 2, 'sum': [0, 1, 1]},
 2: {'ct': 1, 'sum': [1, 0, 1]},
 3: {'ct': 1, 'sum': [1, 1, 1]}}

In [18]:
# 基于原始矩阵构造一个新的矩阵，其中的行像以往一样对应用户，而列则对应簇。
# 计算用户和项簇对应的元素，即用户对簇中所有项非空元素评分的平均值。
df3 = pd.DataFrame(
    {'A':np.array(stat[labels[0]]["sum"]) /stat[labels[0]]["ct"],
    'B':np.array(stat[labels[1]]["sum"])/stat[labels[1]]["ct"],
    'C':np.array(stat[labels[2]]["sum"])/stat[labels[2]]["ct"],
    'D':np.array(stat[labels[3]]["sum"])/stat[labels[3]]["ct"]},
  columns =['A','B','C','D'])
df3

Unnamed: 0,A,B,C,D
0,0.5,0.5,0.0,1.0
1,0.25,0.25,0.5,1.0
2,0.25,0.25,0.5,1.0


In [19]:
# 得到的矩阵计算每对用户之间的余弦距离。
cos1 = calc_cos(0,1,df3)
print("Cosine distance A - B: ",cos1)
cos2 = calc_cos(2,1,df3)
print("Cosine distance B - C: ",cos2)
cos3 = calc_cos(0,2,df3)
print("Cosine distance A - C: ",cos3)

Cosine distance A - B:  0.6249999999999999
Cosine distance B - C:  0.6874999999999999
Cosine distance A - C:  0.6249999999999999
