## **電影推薦應用實作**
--------------------------------

In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
import tabulate
import matplotlib
import math

from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

pd.set_option("display.precision", 1)

### **\# 資料集來源: 自己**  

In [2]:
df = pd.read_excel("data.xlsx")
left_aligned_df = df.style.set_properties(**{'text-align': 'left'})
left_aligned_df = left_aligned_df.set_table_styles([dict(selector = 'th', props=[('text-align', 'left')])])
display(left_aligned_df)

Unnamed: 0,ID,電影名稱,類別,user1,user2,user3,user4,user5,x1 (comedy),x2 (Animation),x3 (Science fiction)
0,1,讓子彈飛,comedy,5,3,1,2,1,0.9,0.1,0.1
1,2,九品芝麻官,comedy,4,3,1,2,1,0.9,0.1,0.1
2,3,殭屍100,Animation,1,2,4,3,4,0.7,0.9,0.1
3,4,奧術,Animation,1,2,4,3,4,0.3,0.9,0.5
4,5,獵巫行動,Science fiction,1,5,2,1,3,0.1,0.1,0.9
5,6,侏儸紀公園,Science fiction,1,5,2,1,3,0.1,0.5,0.9
6,7,世界異戰,Science fiction,1,5,2,1,3,0.1,0.1,0.9
7,8,終末女武神,Animation,1,2,4,3,4,0.2,0.9,0.1
8,9,航海王,Animation,1,2,4,3,4,0.5,0.9,0.1


--------------------------------------------

### **\#根據上方資料，可得出以下資訊用於計算cost function:**
##### **觀眾數量 : num_users = 5**
##### **電影數量 : num_movies = 9**
##### **類別數量 : num_features = 3**
##### **$\mathbf{X}$ 代表每部電影的類別分數，例如【讓子彈飛】: $\mathbf{x}^{(1)}$=[0.90, 0.1, 0.1]**
##### **$\mathbf{Y}$ 代表每部電影對應於各使用者的評分，例如【奧術】: $\mathbf{y}^{(4)}$=[1, 2, 4, 3, 4]**
##### **$\mathbf{R}$ = $r$ (num_movies, num_users)，代表user是否評分過該電影，例如【讓子彈飛】: $r(1,)$=[1, 1, 1, 1, 1]**
##### **$\mathbf{W}$ = $w$ (num_users, num_features)，為亂數產生，需透過訓練校正**
##### **$\mathbf{b}$ = $b$ (1, num_users)，為亂數產生，，需透過訓練校正**

In [3]:
import random

num_users=5
num_movies=9
num_features=3
X=[]
Y=[]
R=[]
W=[]
b=[]

df_numpy=df.to_numpy()
for i in range(num_movies):
    X.append([])
    Y.append([])
    R.append([])
    for j in range(11):
        if j >=3 and j <=7:
            Y[i].append(float(df_numpy[i,j]))
            if df_numpy[i,j]>0:
                R[i].append(1.0)
            else:
                R[i].append(0.0)
        if j>=8 and j<11:
            X[i].append(df_numpy[i,j])
b.append([])
for i in range(num_users):
    W.append([])
    b[0].append(random.uniform(-10,10))
    for j in range(num_features):
        W[i].append(random.uniform(-10,10))
        

In [4]:
sum=0
for j in range(num_users):
    if R[0][j]==1:
        sum+=Y[0][j]
tsmean = sum/num_users
print(f"所有用戶對於第一部電影的平均評價:{tsmean:0.1f}顆星" )

所有用戶對於第一部電影的平均評價:2.4顆星


-------------------------------------------------------------
### **\#定義 Cost function**
<a name="4.1"></a>
#### **collaborative filtering cost function**
$$J({\mathbf{x}^{(0)},...,\mathbf{x}^{(n_m-1)},\mathbf{w}^{(0)},b^{(0)},...,\mathbf{w}^{(n_u-1)},b^{(n_u-1)}})= \left[ \frac{1}{2}\sum_{(i,j):r(i,j)=1}(\mathbf{w}^{(j)} \cdot \mathbf{x}^{(i)} + b^{(j)} - y^{(i,j)})^2 \right]
+ \underbrace{\left[
\frac{\lambda}{2}
\sum_{j=0}^{n_u-1}\sum_{k=0}^{n-1}(\mathbf{w}^{(j)}_k)^2
+ \frac{\lambda}{2}\sum_{i=0}^{n_m-1}\sum_{k=0}^{n-1}(\mathbf{x}_k^{(i)})^2
\right]}_{regularization}
\tag{1}$$
**The first summation in (1) is "for all $i$, $j$ where $r(i,j)$ equals $1$" and could be written:**

$$
= \left[ \frac{1}{2}\sum_{j=0}^{n_u-1} \sum_{i=0}^{n_m-1}r(i,j)*(\mathbf{w}^{(j)} \cdot \mathbf{x}^{(i)} + b^{(j)} - y^{(i,j)})^2 \right]
+\text{regularization}
$$

In [5]:
def cofi_cost_func(X, W, b, Y, R, lambda_):
    J = 0 
    for i in range(num_users):
        w=W[i][:]
        for k in range(num_movies):
            x=X[k][:]
            y=Y[k][i]
            r=R[k][i]
            J += np.square(r * (np.dot(w,x) + b[0][i] - y ) )
        J +=lambda_* (np.sum(np.square(W)) + np.sum(np.square(X)))
    J=J/2         
    return J

In [6]:
# Evaluate cost function with regularization 
J = cofi_cost_func(X, W, b, Y, R, 1.5);
print(f"Cost (with regularization): {J:0.2f}")

Cost (with regularization): 3812.80


#### **下面這是Tensorflow提供的計算方法**

In [7]:
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R
    J = 0.5 * tf.reduce_sum(j**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return J

------------------------------
# **開始模擬**
### **1.假設有一位新用戶加入，先將num_users+1**

In [8]:
# 亂數產生 W、b
tf.random.set_seed(1234) # for consistent results
W = tf.Variable(tf.random.normal((num_users+1,  num_features),dtype=tf.float64),  name='W')
b = tf.Variable(tf.random.normal((1,num_users+1),dtype=tf.float64) ,  name='b')

### **2.將該用戶目前對每部電影的評分資訊，新增至Y、R中**
  
####    **\#假設他已將【奧術】評價5分，【侏儸紀】3分，【九品芝麻官】1分，其他電影尚未觀看=0分**

In [9]:
Y[1].append(1.0)
Y[3].append(5.0)
Y[5].append(3.0)

#其他尚未觀看都評為0分
for i in range(num_movies):
    if i not in(1,3,5):
        Y[i].append(0.0)
        R[i].append(0.0)
    else:
        R[i].append(1.0)

In [10]:
#正規化Y
Y_mean=[]
minY=min(map(min, Y))
maxY=max(map(max, Y))
for i in range(num_movies):
    mean=0
    for j in range(num_users):
        Y[i][j]=(Y[i][j]-minY)/(maxY-minY)
        mean+=Y[i][j]
    Y_mean.append([mean/num_users])

### **3.開始訓練**

In [11]:
X = tf.Variable(tf.convert_to_tensor(X, dtype=tf.float64),  name='X')
# Instantiate an optimizer.
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

iterations=140
lambda_=1
for iter in range(iterations):
    # Use TensorFlow’s GradientTape
    # to record the operations used to compute the cost 
    with tf.GradientTape() as tape:
        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func_v(X, W, b, Y, R, lambda_)
    
    cost_value=tf.convert_to_tensor(cost_value,dtype=tf.float64)
    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient(cost_value, [X,W,b] ,unconnected_gradients=tf.UnconnectedGradients.ZERO)
    # Run one step of gradient descent by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients(zip(grads, [X,W,b]) )

    # Log periodically.
    if iter % 20 == 0:
        print(f"Training loss at iteration {iter}: {cost_value:0.1f}")

Training loss at iteration 0: 105.8
Training loss at iteration 20: 10.1
Training loss at iteration 40: 5.7
Training loss at iteration 60: 3.9
Training loss at iteration 80: 3.5
Training loss at iteration 100: 3.4
Training loss at iteration 120: 3.4


### **4.推薦**
#### **訓練完可得參數 W、b，還有類別分數X**
#### **藉此可得出矩陣p，大小為(9x6)，代表每部電影對每位使用者的推薦程度，數值越高表示越適合該用戶**

In [12]:
p=np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()
p=p+Y_mean
print(p)

[[0.92223045 1.15511704 0.93031895 0.87261235 0.99794646 3.13618808]
 [1.07583134 1.19625804 0.69661854 0.75830814 0.7703216  2.13748997]
 [0.86791026 1.18376756 1.14156697 1.00100087 1.20737294 3.85042271]
 [0.6627894  1.09820356 1.34653679 1.07946185 1.40604994 4.86469037]
 [0.84642914 1.1327141  1.00022807 0.89560184 1.06894056 3.47478946]
 [0.84544631 1.13243386 1.00112789 0.89589402 1.06986561 3.4789893 ]
 [0.84642914 1.1327141  1.00022807 0.89560184 1.06894056 3.47478946]
 [0.86763798 1.18362589 1.14185667 1.00112435 1.20765377 3.85167843]
 [0.86782746 1.18373045 1.14165129 1.00103448 1.20745635 3.85078965]]


-----------------------
### **5.印證結果可得 Animation(動畫)較適合優先推薦給該用戶**

In [13]:
results=[]
for i in range(num_movies):
    if i not in(1,3,5):
        results.append([df['ID'][i],df['電影名稱'][i],df['類別'][i],str(round(p[i][5],1))])
result_df=pd.DataFrame(results, columns=['ID', '電影名稱','類別','推薦度(new)'])
result_df

Unnamed: 0,ID,電影名稱,類別,推薦度(new)
0,1,讓子彈飛,comedy,3.1
1,3,殭屍100,Animation,3.9
2,5,獵巫行動,Science fiction,3.5
3,7,世界異戰,Science fiction,3.5
4,8,終末女武神,Animation,3.9
5,9,航海王,Animation,3.9
