In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# 准备数据

In [2]:
animes_df = pd.read_csv('E:/DataMining/final/anime.csv')
ratings_df = pd.read_csv('E:/DataMining/final/rating.csv')
animes_df = animes_df.dropna()
ratings_df = ratings_df.dropna()

In [3]:
animes_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
ratings_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [5]:
animes_df = animes_df.rename(columns={'rating':'anime_rating'})
ratings_df = ratings_df.rename(columns={'rating':'user_rating'})
ratings_df = ratings_df[ratings_df['user_id']<=1000]
animes_df['anime_row'] = animes_df.index
uratings_df = pd.merge(ratings_df, animes_df, on='anime_id')

In [6]:
uratings_df = uratings_df[['user_id', 'anime_row', 'user_rating']]

In [7]:
uratings_df.head()

Unnamed: 0,user_id,anime_row,user_rating
0,1,841,-1
1,3,841,8
2,5,841,6
3,6,841,-1
4,10,841,-1


# 构建动画评分矩阵urating和评分记录矩阵record

In [8]:
userNo = uratings_df['user_id'].max()+1
animeNo = uratings_df['anime_row'].max()+1

In [9]:
urating = np.zeros((animeNo, userNo))

flag = 0
uratings_df_length = np.shape(uratings_df)[0]

for index, row in uratings_df.iterrows():
    urating[int(row['anime_row']), int(row['user_id'])] = row['user_rating']
    flag += 1
    if flag % 5000 == 0:
        print('processed %d, %d left' % (flag, uratings_df_length-flag))

processed 5000, 91479 left
processed 10000, 86479 left
processed 15000, 81479 left
processed 20000, 76479 left
processed 25000, 71479 left
processed 30000, 66479 left
processed 35000, 61479 left
processed 40000, 56479 left
processed 45000, 51479 left
processed 50000, 46479 left
processed 55000, 41479 left
processed 60000, 36479 left
processed 65000, 31479 left
processed 70000, 26479 left
processed 75000, 21479 left
processed 80000, 16479 left
processed 85000, 11479 left
processed 90000, 6479 left
processed 95000, 1479 left


In [10]:
urating

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 8., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
record = urating>0
record = np.array(record, dtype=int)
print(record)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# 建立模型

In [12]:
# 对矩阵进行归一化
def normalizeRatings(rating, record):
    m, n = rating.shape
    rating_mean = np.zeros((m, 1))
    rating_norm = np.zeros((m, n))
    for i in range(m):
        idx = record[i, :] !=0
        rating_mean[i] = np.mean(rating[i, idx])
        rating_norm[i, idx] -= rating_mean[i]
    return rating_norm, rating_mean

In [13]:
rating_norm, rating_mean = normalizeRatings(urating, record)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [14]:
# 使用0代替数组x中的nan元素，使用有限的数字代替inf元素
rating_norm = np.nan_to_num(rating_norm)
rating_mean = np.nan_to_num(rating_mean)

In [15]:
# 变量初始化
num_features = 10
X_parameters = tf.Variable(tf.random_normal([animeNo, num_features], stddev=0.35))
Theta_paramters = tf.Variable(tf.random_normal([userNo, num_features], stddev=0.35))
loss = 1/2 * tf.reduce_sum(((tf.matmul(X_parameters, Theta_paramters, transpose_b=True) - rating_norm)*record)**2) + \
    1/2 * (tf.reduce_sum(X_parameters**2) + tf.reduce_sum(Theta_paramters**2))
optimizer = tf.train.AdamOptimizer()
train = optimizer.minimize(loss)

# 训练模型

In [16]:
tf.summary.scalar('loss', loss)
summaryMerged = tf.summary.merge_all()
filename = './anime_tensorboard'
writer = tf.summary.FileWriter(filename)

In [17]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

In [18]:
penalty = animeNo*userNo

for i in range(5000):
    l, _, anime_summary = sess.run([loss, train, summaryMerged])
    if i%100 == 0:
        Current_X_parameters, Current_Theta_parameters = sess.run([X_parameters, Theta_paramters])
        predicts = np.dot(Current_X_parameters,Current_Theta_parameters.T) + rating_mean
        errors = np.mean((predicts - urating)**2)
        print('step:', i, ' train loss:%.5f' % (l/penalty), ' test loss:%.5f' % errors)
    writer.add_summary(anime_summary, i)

step: 0  train loss:0.19934  test loss:20.05331
step: 100  train loss:0.19348  test loss:19.93365
step: 200  train loss:0.16847  test loss:18.73147
step: 300  train loss:0.12297  test loss:15.65239
step: 400  train loss:0.07452  test loss:11.63015
step: 500  train loss:0.03756  test loss:7.97349
step: 600  train loss:0.01663  test loss:5.42218
step: 700  train loss:0.00750  test loss:3.94906
step: 800  train loss:0.00409  test loss:3.17102
step: 900  train loss:0.00283  test loss:2.75935
step: 1000  train loss:0.00233  test loss:2.53069
step: 1100  train loss:0.00211  test loss:2.39559
step: 1200  train loss:0.00201  test loss:2.31068
step: 1300  train loss:0.00195  test loss:2.25407
step: 1400  train loss:0.00191  test loss:2.21419
step: 1500  train loss:0.00187  test loss:2.18463
step: 1600  train loss:0.00185  test loss:2.16167
step: 1700  train loss:0.00182  test loss:2.14311
step: 1800  train loss:0.00180  test loss:2.12755
step: 1900  train loss:0.00177  test loss:2.11414
step: 2

In [19]:
Current_X_parameters, Current_Theta_parameters = sess.run([X_parameters, Theta_paramters])
predicts = np.dot(Current_X_parameters,Current_Theta_parameters.T) + rating_mean
errors = np.mean((predicts - urating)**2)
print(errors)

1.9988971548139247


# 动漫推荐系统

In [20]:
user_id = input('您要向哪位用户进行推荐？请输入用户编号：')

sortedResult = predicts[:, int(user_id)].argsort()[::-1]

idx = 0
print('为该用户推荐的评分最高的20部动画是：'.center(80, '='))
for i in sortedResult:
    print('评分：%.2f, 动画名：%s' % (predicts[i, int(user_id)], animes_df.iloc[i]['name']))
    idx += 1
    if idx == 20: break

您要向哪位用户进行推荐？请输入用户编号：73
评分：8.98, 动画名：Dirty Pair
评分：6.37, 动画名：Tokimeki Tonight
评分：6.37, 动画名：Tobe! Isami
评分：6.35, 动画名：Ring ni Kakero 1: Nichibei Kessen-hen
评分：6.34, 动画名：Uchuu Senkan Yamato: Kanketsu-hen
评分：6.34, 动画名：Lady Georgie
评分：6.34, 动画名：Ginga Tetsudou 999: Kimi wa Haha no You ni Aiseru ka!!
评分：6.34, 动画名：Ai Shoujo Pollyanna Story
评分：6.34, 动画名：Pucca
评分：6.33, 动画名：Ro-Kyu-Bu!: Tomoka no Ichigo Sundae
评分：6.33, 动画名：Ougon Bat
评分：6.33, 动画名：Shinkon Gattai Godannar!! 2nd Season
评分：6.33, 动画名：Sekai Meisaku Douwa: Hakuchou no Mizuumi
评分：6.33, 动画名：Shoukoushi Cedie
评分：6.32, 动画名：Hello! Sandybell
评分：6.32, 动画名：Hoshi no Oujisama Petit Prince
评分：6.31, 动画名：Usavich
评分：6.31, 动画名：Miracle Giants Doumu-kun
评分：6.30, 动画名：Bananya
评分：6.29, 动画名：Bosco Daibouken
