In [1]:
from scipy.sparse import csr
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm_notebook as tqdm

  from ._conv import register_converters as _register_converters


### 将列表转为稀疏矩阵
Here we created a utility function to create a sparse matrix (that is needed by factorization machines) from a list of user/item ids.

Check this gist for more details about this utitly function.

In [5]:
###########################################################################
# 数据转换
def vectorize_dic(dic, ix=None, p=None, n=0, g=0):
    '''
    对列表中对每一列（每个内部列表是一组与特征对应的值）创建一个csr矩阵。

    parameters:
    -----------
    dic -- 特征列表的字典, 即字典的key为特征名
    ix -- 生成器索引(default None)
    p -- 特征空间的维度(稀疏空间的特征数目) (default None)
    '''
    if ix == None:
        ix = dict()

    # 矩阵大小
    nz = n * g

    # 列索引
    col_ix = np.empty(nz, dtype=int)

    i = 0
    for k, lis in dic.items():
        for j in range(len(lis)):
            # 将索引el附加到k以防止将具有相同ID的不同列映射到相同索引
            ix[str(lis[j]) + str(k)] = ix.get(str(lis[j]) + str(k), 0) + 1
            col_ix[i+j*g] = ix[str(lis[j]) + str(k)]
        i += 1


    # 行索引, shape=(n*g, ), 比如n=7, g=3, 则将[0~7]*3
    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)
    # 特征维数为None时
    if p == None:
        p = len(ix)
    # 选择
    ixx = np.where(col_ix < p)
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n,p)), ix

### 载入数据
数据集为MovieLens100k Dataset，将其转为稀疏矩阵。

In [6]:
cols = ['user', 'item', 'rating', 'timestamp']
train = pd.read_csv('data/ua.base', delimiter='\t', names=cols)
test = pd.read_csv('data/ua.test', delimiter='\t', names=cols)

# 矢量化数据，并转为csr矩阵
X_train, ix = vectorize_dic({'users': train['user'].values,
                             'items': train['item'].values}, n=len(train.index), g=2)
X_test, ix = vectorize_dic({'users': test['user'].values,
                            'items': test['item'].values}, ix, X_train.shape[1], n=len(test.index), g=2)
y_train = train.rating.values
y_test = test.rating.values

### 定义输入矩阵
将X_train和X_test转为稀疏矩阵，用于tf模型的训练。对于大的数据集，这种方法不推荐，tf.SparseTensor可用于大的稀疏数据集。

In [7]:
# 转为稀疏矩阵
X_train = X_train.todense()
X_test = X_test.todense()
print(X_train.shape)
print(X_test.shape)

(90570, 2623)
(9430, 2623)


### 使用tensorflow定义FM模型
首先初始化模型参数

In [8]:
n, p = X_train.shape
k = 10
# 此函数可以理解为形参，用于定义过程，在执行的时候再赋具体的值
# 确定数据的大小, 维度多少
X = tf.placeholder('float', [None, p])
y = tf.placeholder('float', [None, 1])

# 偏差
w0 = tf.Variable(tf.zeros([1]))
# 权重, 每个变量的权重参数
w = tf.Variable(tf.zeros([p]))
# 两两变量组合的权重参数
v = tf.Variable(tf.random_normal([k, p], mean=0, stddev=0.01))

### 定义输出y值如何计算
给定特征向量x，根据下式计算y值的输出，具体推导可看Factorization Machines Note.ipynb：

$$\hat{y}(\mathbf{x}) = w_0 + \sum_{j=1}^{p}w_jx_j + \frac{1}{2} \sum_{f=1}^{k} ((\sum_{j=1}^{p}v_{j,f}x_j)^2-\sum_{j=1}^{p}v_{j,f}^2 x_j^2)'$$
下面对cell是对上式的实现：

In [11]:
# w和x相乘, 然后使用reduce_sum, 沿着某个维度求和
# 如a = [[1, 1, 1], [1, 1, 1]],
# tf.reduce_sum(a, 1, keep_dims=True) = [[3], [3]], shape=(2, 1)
# tf.reduce_sum(a, 0, keep_dims=True) = [2, 2, 2], shape=(1, 3)
# 这里大小为(n, 1), 其中n为样本数(行数)
linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(w, X), 1, keepdims=True))
'''
x = [[1, 2], 
     [1, 2]]
y = [[0, 1], 
     [0, 1]]
z1 = tf.multiply(x,y)
z2 = tf.matmul(x, y)
with tf.Session() as sess:
    print(sess.run(z1))
    print(sess.run(z2))

[[0 2]
 [0 2]]

[[0 3]
 [0 3]]
看出multiply是各元素相乘, 而matmul是矩阵相乘, 即点积法
'''

pair_interactions = 0.5 * tf.reduce_sum(
    # (xv)^2 - (x^2 \codt v^2)
    tf.subtract(
        tf.pow(
            # 矩阵相乘, 点积法, 在平方, (xv)^2
            tf.matmul(X, tf.transpose(v)), 2),
        tf.matmul(tf.pow(X, 2), tf.transpose(tf.pow(v, 2)))
    ), axis=1, keepdims=True)

# 等于linear_term + pair_interactions
y_hat = tf.add(linear_terms, pair_interactions)

### 损失函数
使用tensorflow实现FM的损失函数，定义为：

$$L = \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 + \lambda_w ||W||^2 + \lambda_v ||V||^2$$
其中 $\lambda_w$ 和 $\lambda_v$ 是一次项和二次项的正则系数

In [12]:
# 正则项系数
lambda_w = tf.constant(0.001, name='lambda_w')
lambda_v = tf.constant(0.001, name='lambda_v')
# l2正则项, lambda_w * w^2 + lambda_v * v^2
l2_norm = tf.reduce_sum(
    tf.add(
        tf.multiply(lambda_w, tf.pow(w, 2)),
        tf.multiply(lambda_v, tf.pow(v, 2))
    )
)


# 平均误差
error = tf.reduce_mean(tf.square(y - y_hat))
# 带有正则项的损失函数, (y - y_hat)^2 + lambda_w * w^2 + lambda_v * v^2
loss = tf.add(error, l2_norm)

### 操作
给定损失函数，使用梯度下降更新参数：
$$\Theta_{i+1} = \Theta_{i} - \eta \frac{\delta L}{\delta \Theta}$$

In [13]:
train_op = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)

### 使用mini-batch训练

In [14]:
###########################################################################
# 批量梯度下降
def batcher(X, y=None, batch_size=-1):
    n_samples = X.shape[0]

    if batch_size == -1:
        batch_size = n_samples

    # 必须大于0
    if batch_size < 1:
        raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))

    for i in range(0, n_samples, batch_size):
        upper_bound = min(i + batch_size, n_samples)
        ret_x = X[i:upper_bound]
        ret_y = None
        if y is not None:
            ret_y = y[i:i + batch_size]
            yield (ret_x, ret_y)

### 评估模型
在测试集上评估训练模型的效果，使用RMSE来衡量预测的误差。

In [None]:
epochs = 10
batch_size = 1000

# 载入图模型
# 初始化全局变量
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)

    for epoch in tqdm(range(epochs), unit='epoch'):
        perm = np.random.permutation(X_train.shape[0])
        # 批量梯度下降
        for bX, bY in batcher(X_train[perm], y_train[perm], batch_size):
            _, t = sess.run([train_op, loss], feed_dict={X: bX.reshape(-1, p), y: bY.reshape(-1, 1)})
            print(t)

    # 计算测试集的误差
    errors = []
    for bX, bY in batcher(X_test, y_test):
        error.append(sess.run(error, feed_dict={X: bX.reshape(-1, p), y: bY.reshape(-1, 1)}))
        print(errors)

    # 均方根误差
    RMSE = np.sqrt(np.array(errors).mean())
    print(RMSE)

13.819626
13.011467
12.761883
12.2216015
11.69805
11.463026
10.84833
11.002898
10.3567915
9.513984
9.658576
9.07826
8.807209
8.562166
8.309016
8.090875
7.5858817
7.4349427
7.108375
7.1056848
6.827699
6.4942474
6.256871
6.052091
5.84168
5.836885
5.3031797
5.378466
5.123978
5.0017066
4.821315
4.630307
4.4924755
4.4469576
4.34924
3.9804752
4.027244
3.986374
3.915722
3.7145824
3.6908128
3.4634523
3.5396767
3.5113034
3.341489
3.1985743
3.037804
2.947446
2.9173622
2.8578804
2.8114247
2.7320924
2.8315914
2.6472843
2.6621807
2.6291716
2.5302184
2.4309812
2.3053846
2.3376706
2.3411977
2.2827067
2.1689844
2.2699885
2.124325
2.110487
2.0663626
2.1768625
2.0495427
1.9921896
1.8808839
1.8600875
1.9584569
1.8720678
1.8639582
1.8836176
1.7474736
1.7285707
1.7198129
1.6765407
1.7514193
1.6685832
1.7711302
1.6782892
1.5630844
1.7193867
1.6817102
1.6433915
1.5960597
1.5973159
1.5576217
