# 0.检查本地准备的文件👀

- 👉这里是[Amazon数据集](http://jmcauley.ucsd.edu/data/amazon/)和数据文件格式使用说明<br>
- notice🚩：找到了让内存吃紧的元凶：因为本文中采用了Leave one out交叉验证，所以造成了训练成本巨大。





In [0]:
!ls

adc.json  movies_visual_features  reviews_desc.csv  sample_data


# 1.加载数据，构造矩阵📕
从本地加载review+nvf数据，并构造基本的矩阵

In [0]:
from collections import defaultdict
import os 
import numpy as np #for scientific computing
import pandas as pd #for operating data
import random

# Read the reviews + nvf (non-visual-features) table from a raw Amazon reviews CSV file.
def load_nvf(data_path):
  """Load the CSV at `data_path` with pandas and return the resulting DataFrame.

  A confirmation line is printed once the file has been read.
  """
  frame = pd.read_csv(data_path)
  print("reviews with non_visual_features loaded..")
  return frame

# Turn the raw review table into integer-indexed user/item interaction sets.
# Not all rows are used: rows are randomly sub-sampled at rate `sample_size`.
def load_data_hybrid(review_nvf, min_items=1, min_users=1,\
                     sampling=True, sample_size = 0.5, data_size = 500000):
  """Build user/item rating sets and per-item attribute maps from the reviews.

  Args:
    review_nvf: table whose `.values` yields one row per review. Columns 0-4
      are read as (user id, item id, <ignored>, brand, price); any further
      columns are not read.
    min_items: a user is kept only if they rated strictly MORE than this
      many items.
    min_users: an item is kept only if strictly MORE than this many users
      rated it.
    sampling: when true, each row is kept with probability `sample_size`.
    sample_size: keep-probability used when `sampling` is true.
    data_size: maximum number of (sampled) reviews to process.

  Returns:
    (max_u_id, max_i_id, users, items, user_ratings_filtered,
     item_ratings_filtered, brands, prices, prod_desc) where `users`/`items`
    map raw ids to 1-based integer indices, the two `*_filtered` mappings are
    index -> set of indices, and `brands`/`prices`/`prod_desc` are keyed by
    item index. `max_u_id`/`max_i_id` are -1 when no review was processed.

  Side effects:
    Re-seeds the global `random` generator with 0, so repeated calls with the
    same arguments pick the same rows and assign the same indices.
  """
  user_ratings = defaultdict(set) # sets: a repeated (user, item) pair counts once
  item_ratings = defaultdict(set)
  max_u_id = -1
  max_i_id = -1
  review_count = 0
  users = {} # raw user id -> user index
  items = {} # raw item id -> item index
  brands = {}
  prices = {}
  prod_desc = {} # product description vector per item index

  random.seed(0)

  for value in review_nvf.values:

    # sample the data
    if sampling and random.random() > sample_size:
      continue

    # stop BEFORE counting, so review_count is the number of processed reviews
    if review_count >= data_size:
      break
    review_count += 1

    auid, asid, _, brand = value[0:4]
    price = value[4]
    # a missing price arrives from pandas as NaN, which is truthy; map it to 0 too
    if not price or pd.isnull(price):
      price = 0

    u = users.get(auid)
    if u is None:
      u = len(users) + 1
      users[auid] = u

    i = items.get(asid)
    if i is None:
      i = len(items) + 1
      items[asid] = i
      brands[i] = brand
      prices[i] = price
      # flag.. descriptions are not parsed yet: every item gets a zero vector
      prod_desc[i] = list(np.zeros(1024))

    user_ratings[u].add(i)
    item_ratings[i].add(u)
    max_u_id = max(u, max_u_id)
    max_i_id = max(i, max_i_id)

    # progress output on the 1st, 5001st, 10001st, ... processed review
    if (review_count - 1) % 5000 == 0:
      print("uid: ",max_u_id,'iid', max_i_id )

  print("max_u_id: ", max_u_id)
  print("max_i_id: ", max_i_id)
  print("reviews: ", review_count)

  # drop users / items that have too few interactions
  num_u_id = 0
  num_i_id = 0
  num_reviews = 0
  user_ratings_filtered = defaultdict(set)
  item_ratings_filtered = defaultdict(set)

  for u, rated_items in user_ratings.items():
    if len(rated_items) > min_items:
      user_ratings_filtered[u] = rated_items
      num_u_id += 1
      num_reviews += len(rated_items)

  for i, raters in item_ratings.items():
    if len(raters) > min_users:
      item_ratings_filtered[i] = raters
      num_i_id += 1

  print('u_id:', num_u_id)
  print('i_id:', num_i_id)
  print('reviews: ', num_reviews)

  return max_u_id, max_i_id, users, items, user_ratings_filtered, item_ratings_filtered, brands, prices, prod_desc
   

# 2.加载对应图片特征数据📸
根据步骤1中采样的数据矩阵，从本地加载item对应的图片特征数据vf

In [0]:
# Image features are stored in a binary format: for every product, 10 characters
# (the product ID) followed by 4096 float32 values.

def load_image_features(path, items):
    """Read the visual feature vectors of the products listed in `items`.

    Args:
        path: path of the binary feature file (10-byte product id followed by
            4096 float32 values, repeated per product).
        items: mapping from product id to item index. Ids are looked up both
            as the raw bytes read from the file and as decoded text, so the
            lookup works whether the keys are byte strings or text strings.

    Returns:
        dict mapping item index -> numpy array with the feature values
        divided by 44.4722. A record whose feature payload is empty gets a
        zero vector of length 4096.
    """
    asin_len = 10
    feat_dim = 4096
    feat_bytes = 4 * feat_dim  # 4 bytes per float32 -> 16 KB per product
    # flag.. the origin of this scaling constant is not documented in this file
    scale = 44.4722

    image_features = {}
    # `with` guarantees the file is closed even if parsing raises
    with open(path, 'rb') as f:
        while True:
            asin = f.read(asin_len)
            # empty read == end of file (works for both bytes and str)
            if not asin:
                break
            # always consume the payload so the stream stays aligned
            features_bytes = f.read(feat_bytes)

            if asin in items:
                iid = items[asin]
            else:
                key = asin.decode('ascii', 'replace')
                if key not in items:
                    continue  # product not needed: skip without unpacking
                iid = items[key]

            # only unpack the floats for products we need -- big speed up
            if not features_bytes:
                image_features[iid] = np.zeros(feat_dim)
            else:
                image_features[iid] = np.frombuffer(features_bytes, dtype=np.float32) / scale
    print("load vf finished..")

    return image_features

# 3.构造训练集💦
从步骤1和步骤2生成的数据矩阵中，均匀采样(u,i,j)三元组，构造训练集

In [0]:
import tensorflow as tf
import os
#import cPickle as pickle
import numpy
import random
import matplotlib.pyplot as plt
import sys

def uniform_sample_batch(train_ratings, test_ratings, item_count, advanced_features):
  """Yield one training batch of (u, i, j) triples per user.

  For every item i a user has rated, `neg_items - 1` (= 5) items j are drawn
  uniformly from 1..item_count, re-drawing until j is not among the user's
  rated items. If the user has a held-out test item, that item is never used
  as the positive i. Triples for which i or j has no entry in
  `advanced_features` are dropped.

  Yields:
    (triples, i_features, j_features): an array of [u, i, j] rows and the
    vertically stacked feature rows of the positive and negative items.
    Users that end up with fewer than two triples produce no batch.
  """
  neg_items = 6
  for u in train_ratings.keys():
    triples, pos_feats, neg_feats = [], [], []
    for i in train_ratings[u]:
      # the held-out item of a test user must not appear as a training positive
      if u in test_ratings and i == test_ratings[u]:
        continue

      for _ in range(1, neg_items):
        j = random.randint(1, item_count)
        while j in train_ratings[u]:
          j = random.randint(1, item_count)

        # sometimes an item has no corresponding attribute data (features)
        try:
          feat_i = advanced_features[i]
          feat_j = advanced_features[j]
        except KeyError:
          continue

        pos_feats.append(feat_i)
        neg_feats.append(feat_j)
        triples.append([u, i, j])

    # vstack turns the per-triple feature vectors into one matrix per batch
    if len(pos_feats) > 1:
      yield numpy.asarray(triples), numpy.vstack(tuple(pos_feats)), numpy.vstack(tuple(neg_feats))

# 4.构造测试集
挑选生成用来测试的(u,i,j)数据组

In [0]:
# For each user, randomly hold out one rated item as that user's test item.
# The held-out item is excluded from the training positives later on, in
# uniform_sample_batch().

# `user_ratings` is the per-user item-set mapping produced by load_data_hybrid().
def generate_test(user_ratings):
  """Pick one random rated item per user.

  Args:
    user_ratings: mapping user index -> collection of rated item indices.

  Returns:
    dict mapping user index -> the single held-out item index. Users with an
    empty collection are left out instead of raising.
  """
  user_test = dict()
  for u, i_list in user_ratings.items():
    if not i_list:
      continue
    # random.sample() rejects sets on Python 3.11+, so sample from a list copy
    user_test[u] = random.sample(list(i_list), 1)[0]
  return user_test

# Build evaluation batches: one batch of (u, i, j) triples per sampled test user.
def test_batch_generator_by_user(train_ratings, test_ratings, item_ratings, 
                                 item_count, advanced_features, 
                                 cold_start = False, 
                                 cold_start_thresh = 5,
                                 num_users = 400,
                                 num_candidates = 100):
  """Yield evaluation batches pairing each user's test item with other items.

  Args:
    train_ratings: mapping user index -> set of rated item indices.
    test_ratings: mapping user index -> held-out test item index.
    item_ratings: mapping item index -> set of users that rated it.
    item_count: largest item index; candidates are drawn from 1..item_count,
      the same id range uniform_sample_batch() draws from.
    advanced_features: mapping item index -> feature vector.
    cold_start: when true, only users whose test item has fewer than
      `cold_start_thresh` raters are evaluated.
    cold_start_thresh: rater-count threshold used when `cold_start` is true.
    num_users: number of test users sampled per call (capped at the number
      available). 400 was chosen empirically.
    num_candidates: number of candidate items j sampled per user (capped at
      `item_count`).

  Yields:
    (triples, i_features, j_features): an array of [u, i, j] rows and the
    vertically stacked feature rows of the test item and the candidates.
    Users for whom no valid triple exists produce no batch.
  """
  user_pool = list(test_ratings)  # random.sample() needs a sequence
  for u in random.sample(user_pool, min(num_users, len(user_pool))):
    i = test_ratings[u]

    # Cold-start mode keeps only test items with fewer than cold_start_thresh
    # raters. Outside cold-start mode no rater-count filter is applied.
    if (cold_start and len(item_ratings[i]) > cold_start_thresh-1):
      continue

    # without features for the test item no triple can be built for this user
    if i not in advanced_features:
      continue

    t = [] 
    ilist = []
    jlist = []
    sample_size = max(0, min(num_candidates, item_count))
    for j in random.sample(range(1, item_count + 1), sample_size):
      # keep only items that are neither the test item nor rated by the user
      if j == i or j in train_ratings[u]:
        continue
      if j not in advanced_features:
        continue

      t.append([u,i,j])
      ilist.append(advanced_features[i])
      jlist.append(advanced_features[j])

    if(len(ilist) == 0):
      continue
    
    yield numpy.asarray(t), numpy.vstack(tuple(ilist)), numpy.vstack(tuple(jlist))
        

# 5.BPR模型以及其变型⭐
BPR, VBPR, NVBPR, Hybrid

In [0]:
def abpr(user_count, item_count, advanced_features, bpr_extension= True,
         hidden_dim=10, hidden_img_dim=10,
         l2_regulization=0.1,
         bias_regulization=0.01,
         embed_regulization = 0,
         image_regulization =0.1,
         visual_bias_regulization = 0.01):
    """
    Build the TensorFlow graph for BPR, optionally extended with item features.

    user_count: total number of users (embedding tables get user_count + 1 rows)
    item_count: total number of items (embedding tables get item_count + 1 rows)
    advanced_features: per-item feature vectors; only advanced_features[1] is
        read here, to obtain the feature dimension, so index 1 must exist
    bpr_extension: when true the score also includes the feature-based terms;
        when false only the item bias and the latent-factor dot product are used
    hidden_dim: number of latent factors of the matrix factorization (MF)
    hidden_img_dim: number of hidden dimensions for the visual / non-visual
        feature factors
    l2_regulization, bias_regulization, embed_regulization,
    image_regulization, visual_bias_regulization: weights of the individual
        squared-norm penalty terms summed into the loss

    Returns (u, i, j, iv, jv, loss, auc, train_op): the five input
    placeholders (user ids, positive item ids, negative item ids, positive
    item features, negative item features), the loss tensor, the fraction of
    triples in the batch with xuij > 0, and the Adam training op.
    """
    # feature dimension is taken from the entry with index 1
    advanced_feat_dim = len(advanced_features[1])
    u = tf.placeholder(tf.int32, [None])
    i = tf.placeholder(tf.int32, [None])
    j = tf.placeholder(tf.int32, [None])
    iv = tf.placeholder(tf.float32, [None, advanced_feat_dim])
    jv = tf.placeholder(tf.float32, [None, advanced_feat_dim])

    # model parameters -- LEARN THESE
    # latent factors
    user_emb_w = tf.get_variable("user_emb_w", [user_count + 1, hidden_dim],
                                 initializer=tf.random_normal_initializer(0, 0.1))
    item_emb_w = tf.get_variable("item_emb_w", [item_count + 1, hidden_dim],
                                 initializer=tf.random_normal_initializer(0, 0.1))

    # UxD visual factors for users
    user_img_w = tf.get_variable("user_img_w", [user_count + 1, hidden_img_dim],
                                 initializer=tf.random_normal_initializer(0, 0.1))
    # this is E, the embedding matrix
    img_emb_w = tf.get_variable("img_emb_w", [hidden_img_dim, advanced_feat_dim],
                                initializer=tf.random_normal_initializer(0, 0.1))

    # one weight per raw feature dimension, shared by all users
    visual_bias = tf.get_variable("visual_bias", [1, advanced_feat_dim], initializer=tf.random_normal_initializer(0, 0.1))

    # biases
    item_b = tf.get_variable("item_b", [item_count + 1, 1], initializer=tf.constant_initializer(0.0))

    # pull out the respective latent factor vectors for a given user u and items i & j
    u_emb = tf.nn.embedding_lookup(user_emb_w, u)
    i_emb = tf.nn.embedding_lookup(item_emb_w, i)
    j_emb = tf.nn.embedding_lookup(item_emb_w, j)
    # pull out the visual factor, 1 X D for user u
    u_img = tf.nn.embedding_lookup(user_img_w, u)
    # get the respective biases for items i & j
    i_b = tf.nn.embedding_lookup(item_b, i)
    j_b = tf.nn.embedding_lookup(item_b, j)


    # project the raw item features into the hidden feature space
    # MF predict: u_i > u_j
    theta_i = tf.matmul(iv, img_emb_w, transpose_b=True)  # (f_i * E), eq. 3
    theta_j = tf.matmul(jv, img_emb_w, transpose_b=True)  # (f_j * E), eq. 3
    if bpr_extension:
        # score = item bias + <u_emb, item_emb> + <u_img, theta> + <visual_bias, raw features>
        xui = i_b + tf.reduce_sum(tf.multiply(u_emb, i_emb), 1, keep_dims=True) + tf.reduce_sum(tf.multiply(u_img, theta_i), 1, keep_dims=True) \
                                                                            + tf.reduce_sum(tf.multiply(visual_bias, iv), 1, keep_dims=True) 
        xuj = j_b + tf.reduce_sum(tf.multiply(u_emb, j_emb), 1, keep_dims=True) + tf.reduce_sum(tf.multiply(u_img, theta_j), 1, keep_dims=True) \
                                                                            + tf.reduce_sum(tf.multiply(visual_bias, jv), 1, keep_dims=True) 
    else:
        # score = item bias + <u_emb, item_emb> only
        xui = i_b + tf.reduce_sum(tf.multiply(u_emb, i_emb), 1, keep_dims=True)
        xuj = j_b + tf.reduce_sum(tf.multiply(u_emb, j_emb), 1, keep_dims=True) 
    
    # pairwise preference: positive score minus negative score
    xuij = xui - xuj

    # fraction of triples in the batch where item i is scored above item j
    auc = tf.reduce_mean(tf.to_float(xuij > 0))

    # weighted sum of squared norms; the feature-related terms are included
    # regardless of bpr_extension
    l2_norm = tf.add_n([
        l2_regulization * tf.reduce_sum(tf.multiply(u_emb, u_emb)),
        image_regulization * tf.reduce_sum(tf.multiply(u_img, u_img)),
        l2_regulization * tf.reduce_sum(tf.multiply(i_emb, i_emb)),
        l2_regulization * tf.reduce_sum(tf.multiply(j_emb, j_emb)),
        embed_regulization * tf.reduce_sum(tf.multiply(img_emb_w, img_emb_w)),
        bias_regulization * tf.reduce_sum(tf.multiply(i_b, i_b)),
        bias_regulization * tf.reduce_sum(tf.multiply(j_b, j_b)),
        visual_bias_regulization * tf.reduce_sum(tf.multiply(visual_bias, visual_bias))
    ])

    # penalty minus the mean log-sigmoid of the score difference
    loss = l2_norm - tf.reduce_mean(tf.log(tf.sigmoid(xuij)))
    # Adam with its default settings
    train_op = tf.train.AdamOptimizer().minimize(loss)
    return u, i, j, iv, jv, loss, auc, train_op

# 6.tensorflow运行🏃‍
将上面的步骤运行..从加载数据，到训练数据，到测试数据..

In [0]:
def session_run(num_iter, bpr_extension, advanced_features, review_nvf):
    """Train the model for `num_iter` epochs and evaluate after every epoch.

    The review table is parsed again here via load_data_hybrid() (same
    arguments as the notebook cell that prepares the image features), one test
    item per user is held out with generate_test(), and a fresh graph/session
    is created for the run.

    Args:
        num_iter: number of training epochs.
        bpr_extension: passed through to abpr(); enables the feature terms.
        advanced_features: mapping item index -> feature vector, keyed by the
            item indices that load_data_hybrid() assigns.
        review_nvf: review table accepted by load_data_hybrid().

    Returns:
        (auc_train, auc_test, auc_test_cs): per-epoch mean AUC on the training
        batches, on the sampled test users, and on the cold-start test users
        (cold_start_thresh = 10). An epoch without any batch records NaN.
    """
    auc_train = []
    auc_test = []
    auc_test_cs = []
    
    user_count, item_count, users, items, user_ratings, item_ratings, brands, prices, prod_desc = load_data_hybrid(review_nvf, min_items=4, min_users=0, sampling= True, sample_size = 0.5)
    user_ratings_test = generate_test(user_ratings)
    with tf.Graph().as_default(), tf.Session() as session:
        with tf.variable_scope('abpr'):
            u, i, j, iv, jv, loss, auc, train_op = abpr(user_count, item_count, advanced_features, bpr_extension=bpr_extension)

        session.run(tf.global_variables_initializer())

        def run_batches(batches, train):
            """Run every batch; return (mean loss, mean auc), NaN when empty."""
            fetches = [loss, auc, train_op] if train else [loss, auc]
            total_loss = 0.0
            auc_values = []
            for d, _iv, _jv in batches:
                results = session.run(fetches, feed_dict={u: d[:, 0], i: d[:, 1], j: d[:, 2], iv: _iv, jv: _jv})
                total_loss += results[0]
                auc_values.append(results[1])
            # no batch at all (e.g. no cold-start item sampled): avoid dividing by zero
            if not auc_values:
                return float('nan'), float('nan')
            return total_loss / len(auc_values), numpy.mean(auc_values)

        # Each message is printed as ONE pre-formatted string so the output is
        # the same whether print is a statement (Python 2) or a function.
        for epoch in range(1, num_iter+1):
            print("epoch  %s" % epoch)

            train_loss, train_auc = run_batches(
                uniform_sample_batch(user_ratings, user_ratings_test, item_count, advanced_features),
                True)
            print("train_loss: %s train auc:  %s" % (train_loss, train_auc))
            auc_train.append(train_auc)

            test_loss, test_auc = run_batches(
                test_batch_generator_by_user(user_ratings, user_ratings_test, item_ratings, item_count, advanced_features, cold_start = False),
                False)
            print("test_loss:  %s test auc:  %s" % (test_loss, test_auc))
            auc_test.append(test_auc)

            cs_loss, cs_auc = run_batches(
                test_batch_generator_by_user(user_ratings, user_ratings_test, item_ratings, item_count, advanced_features, cold_start = True, cold_start_thresh = 10),
                False)
            print("cold start test_loss:  %s cold start auc:  %s" % (cs_loss, cs_auc))
            auc_test_cs.append(cs_auc)
        return auc_train, auc_test, auc_test_cs

# 7.开始实际操作..🖱
## 7.1 运行加载数据

In [0]:
# CSV with the reviews and their non-visual features.
data_path = 'reviews_desc.csv'
# Read the table once; the following cells pass `review_nvf` on.
review_nvf = load_nvf(data_path)

  if self.run_code(code, result):


reviews with non_visual_features loaded..


In [0]:
# Parse the reviews to obtain `items` (raw product id -> item index), which
# selects and keys the image features loaded below. session_run() parses the
# table again with the same arguments; load_data_hybrid() re-seeds `random`
# with 0 on every call, which keeps the sampled rows and indices identical.
user_count, item_count, users, items, user_ratings, item_ratings, brands, prices, prod_desc = load_data_hybrid(review_nvf, min_items=4, min_users=0, sampling= True, sample_size = 0.5)

print("start loading vf..")
images_path = "movies_visual_features"
# `f` maps item index -> visual feature vector; it is the `advanced_features`
# argument of the session_run() calls below.
f = load_image_features(images_path, items)

('uid: ', 1, 'iid', 1)
('uid: ', 4870, 'iid', 488)
('uid: ', 9570, 'iid', 733)
('uid: ', 13932, 'iid', 809)
('uid: ', 17995, 'iid', 906)
('uid: ', 22129, 'iid', 981)
('uid: ', 25925, 'iid', 1173)
('uid: ', 29843, 'iid', 1309)
('uid: ', 33646, 'iid', 1400)
('uid: ', 37387, 'iid', 1501)
('uid: ', 41436, 'iid', 1606)
('uid: ', 45191, 'iid', 1668)
('uid: ', 48751, 'iid', 1725)
('uid: ', 52284, 'iid', 1797)
('uid: ', 55918, 'iid', 1883)
('uid: ', 59413, 'iid', 1933)
('uid: ', 62887, 'iid', 2016)
('uid: ', 66748, 'iid', 2121)
('uid: ', 70605, 'iid', 2160)
('uid: ', 74505, 'iid', 2206)
('uid: ', 78059, 'iid', 2224)
('uid: ', 81432, 'iid', 2260)
('uid: ', 84921, 'iid', 2308)
('uid: ', 88329, 'iid', 2371)
('uid: ', 91846, 'iid', 2466)
('uid: ', 95162, 'iid', 2506)
('uid: ', 98376, 'iid', 2573)
('uid: ', 101649, 'iid', 2612)
('uid: ', 104958, 'iid', 2709)
('uid: ', 108338, 'iid', 2778)
('uid: ', 111418, 'iid', 2856)
('uid: ', 114619, 'iid', 2920)
('uid: ', 117490, 'iid', 3013)
('uid: ', 120378, 

  del sys.path[0]


load vf finished..


In [0]:
# 20 epochs with bpr_extension=False: the score uses only the item bias and the latent factors.
bpr_auc_train, bpr_auc_test, bpr_auc_test_cold_start = session_run(20, False, f, review_nvf)

('uid: ', 1, 'iid', 1)
('uid: ', 4870, 'iid', 488)
('uid: ', 9570, 'iid', 733)
('uid: ', 13932, 'iid', 809)
('uid: ', 17995, 'iid', 906)
('uid: ', 22129, 'iid', 981)
('uid: ', 25925, 'iid', 1173)
('uid: ', 29843, 'iid', 1309)
('uid: ', 33646, 'iid', 1400)
('uid: ', 37387, 'iid', 1501)
('uid: ', 41436, 'iid', 1606)
('uid: ', 45191, 'iid', 1668)
('uid: ', 48751, 'iid', 1725)
('uid: ', 52284, 'iid', 1797)
('uid: ', 55918, 'iid', 1883)
('uid: ', 59413, 'iid', 1933)
('uid: ', 62887, 'iid', 2016)
('uid: ', 66748, 'iid', 2121)
('uid: ', 70605, 'iid', 2160)
('uid: ', 74505, 'iid', 2206)
('uid: ', 78059, 'iid', 2224)
('uid: ', 81432, 'iid', 2260)
('uid: ', 84921, 'iid', 2308)
('uid: ', 88329, 'iid', 2371)
('uid: ', 91846, 'iid', 2466)
('uid: ', 95162, 'iid', 2506)
('uid: ', 98376, 'iid', 2573)
('uid: ', 101649, 'iid', 2612)
('uid: ', 104958, 'iid', 2709)
('uid: ', 108338, 'iid', 2778)
('uid: ', 111418, 'iid', 2856)
('uid: ', 114619, 'iid', 2920)
('uid: ', 117490, 'iid', 3013)
('uid: ', 120378, 

In [0]:
# 20 epochs with bpr_extension=True: the score additionally uses the item feature terms.
vbpr_auc_train, vbpr_auc_test, vbpr_auc_test_cold_start = session_run(20, True, f, review_nvf)

('uid: ', 1, 'iid', 1)
('uid: ', 4870, 'iid', 488)
('uid: ', 9570, 'iid', 733)
('uid: ', 13932, 'iid', 809)
('uid: ', 17995, 'iid', 906)
('uid: ', 22129, 'iid', 981)
('uid: ', 25925, 'iid', 1173)
('uid: ', 29843, 'iid', 1309)
('uid: ', 33646, 'iid', 1400)
('uid: ', 37387, 'iid', 1501)
('uid: ', 41436, 'iid', 1606)
('uid: ', 45191, 'iid', 1668)
('uid: ', 48751, 'iid', 1725)
('uid: ', 52284, 'iid', 1797)
('uid: ', 55918, 'iid', 1883)
('uid: ', 59413, 'iid', 1933)
('uid: ', 62887, 'iid', 2016)
('uid: ', 66748, 'iid', 2121)
('uid: ', 70605, 'iid', 2160)
('uid: ', 74505, 'iid', 2206)
('uid: ', 78059, 'iid', 2224)
('uid: ', 81432, 'iid', 2260)
('uid: ', 84921, 'iid', 2308)
('uid: ', 88329, 'iid', 2371)
('uid: ', 91846, 'iid', 2466)
('uid: ', 95162, 'iid', 2506)
('uid: ', 98376, 'iid', 2573)
('uid: ', 101649, 'iid', 2612)
('uid: ', 104958, 'iid', 2709)
('uid: ', 108338, 'iid', 2778)
('uid: ', 111418, 'iid', 2856)
('uid: ', 114619, 'iid', 2920)
('uid: ', 117490, 'iid', 3013)
('uid: ', 120378, 

## 7.2 运行几个模型

## 7.3 结果对比分析