In [1]:
import pandas as pd
import numpy as np

# Predicting movie ratings

In [88]:
rating = np.array([
    [5,5,0,0],
    [5,np.NAN,np.NAN,0],
    [np.NAN,4,0,np.NAN],
    [0,0,5,4],
    [0,0,5,np.NAN]
    ])

feature = np.array([
    [0.9, 0],
    [1, 0.01],
    [0.99, 0],
    [0.1, 1],
    [0, 0.9]
    ])

rating_df = pd.DataFrame(rating, columns=['Alice(0)', 'Bob(1)', 'Carol(2)', 'Dave(3)'])
feature_df = pd.DataFrame(feature, columns=['x_1(romance)', 'x_2(action)'])
table_df = pd.concat([rating_df, feature_df], axis=1)
table_df

Unnamed: 0,Alice(0),Bob(1),Carol(2),Dave(3),x_1(romance),x_2(action)
0,5.0,5.0,0.0,0.0,0.9,0.0
1,5.0,,,0.0,1.0,0.01
2,,4.0,0.0,,0.99,0.0
3,0.0,0.0,5.0,4.0,0.1,1.0
4,0.0,0.0,5.0,,0.0,0.9


In this table:

**Notaion:**
- $n_u$ = no. of users = 4
- $n_m$ = no. of movies = 5
- $r(i, j)$ = 1 if user $j$ has rated movie $i$ (0 otherwise) = $r(1,1) = 1, r(3,1) = 0$
- $y^{(i, j)}$ = rating given by user $j$ on movie $i$ (if defined) = $y^{(1, 1)} = 5, y^{(4, 2)} = 0$

for user $j$ and movie $i$, yict rating: 

$$w^{(j)} * x^{(i)} + b^{(j)}$$

In [4]:
# Let's say we have learn parameter w
w_1 = np.array([5,0])
w_2 = np.array([5,0])

x_2 = feature[1]
x_3 = feature[2]

b_1 = 0
b_2 = 0

In [5]:
y_20 = np.dot(x_3, w_1.T) + b_1
y_11 = np.dot(x_2, w_2.T) + b_2

print(f'yiction for movie 2 rated by Alice(0) is: {y_20}')
print(f'yiction for movie 1 rated by Bob(1) is: {y_11}')

yiction for movie 2 rated by Alice(0) is: 4.95
yiction for movie 1 rated by Bob(1) is: 5.0


Let's loop whole table

In [6]:
# Let's say we have learn parameter w

n_u, n_m = rating.shape
n = feature.shape[1]

w = np.zeros([n_u, n])
w[0] = np.array([5,0])
w[1] = np.array([5,0])
w[2] = np.array([0,5])
w[3] = np.array([0,4])

X = feature

b = np.zeros(n_u)

In [8]:
def linear_regression(X, w, b):
    
    pred = np.dot(X, w.T) + b

    return pred

In [9]:
def plot_prediction(input, X, w, b, pred_func):
    
    m = input.shape[0]
    n = input.shape[1]
    
    # Copy input
    pred = input[:]
    
    # Get all {np.NaN} in {input}
    non_nan_idx = np.argwhere(np.isnan(input))

    # Replace {np.NaN} in {table_pred_df} with prediction
    for i,j in non_nan_idx:
        pred[i,j] = pred_func(w[j], X[i], b[j])
        
    return pred

In [10]:
table_pred = plot_prediction(input=rating, X=feature, w=w, b=b, pred_func=linear_regression)
pd.DataFrame(table_pred, columns=['Alice(0)', 'Bob(1)', 'Carol(2)', 'Dave(3)'])

Unnamed: 0,Alice(0),Bob(1),Carol(2),Dave(3)
0,5.0,5.0,0.0,0.0
1,5.0,5.0,0.05,0.0
2,4.95,4.0,0.0,0.0
3,0.0,0.0,5.0,4.0
4,0.0,0.0,5.0,3.6


## Cost Function: Learn from given Users' rating

**Notation:**
- $r(i, j)$ = 1 if user $j$ has rated movie $i$ (0 otherwise)
- $y^{(i, j)}$ = rating given by user $j$ on movie $i$ (if defined)
- $w^{(j)}, b^{(j)}$ = parameters for user $j$
- $x^{(i)}$ = feature vector for movie $i$
- $n$ = no. of features
- $n_u$ = no. of users
- $n_m$ = no. of movies

for user $j$ and movie $i$, predict rateing: $w^{(j)} * x^{(i)} + b^{(j)}$
- $m^{(j)}$ = no. movies rated by user $j$ to learn $w^{(j)}$, $b^{(j)}$

To learn $w^{(j)}, b^{(j)}$

$$ \displaystyle\min_{w^{(j)}, b^{(j)}}  J_(w^{(j)}, b^{(j)}) =\frac{1}{2 \cancel{m^{(j)}}} \sum_{i:r(i,j)=1} (w^{(j)} * x^{(i)} + b^{(j)} - y^{(i, j)})^2 \textcolor{orange}{ + \frac{\lambda}{2 \cancel{m^{(j)}}} \sum^n_{k=1} (w^{(j)}_k)^2} $$

it's ok to eliminate $m^{(j)}$

$$ \displaystyle\min_{w^{(j)}, b^{(j)}} J_(w^{(j)}, b^{(j)}) = \frac{1}{2} \sum_{i:r(i,j)=1} (w^{(j)} * x^{(i)} + b^{(j)} - y^{(i, j)})^2 \textcolor{orange}{ + \frac{\lambda}{2} \sum^n_{k=1} (w^{(j)}_k)^2} $$

sum only movie $i$ has rated

In [12]:
def linear_cost_single(input, X, w, b, lambda_, pred_func):
    '''
    w np.array(1,n) = Vector of learning parameter
    '''
    
    # Find index of non np.NAN
    non_nan_idx = np.argwhere(np.invert(np.isnan(input)))[:,0]
    
    # Create empty shape
    pred = np.empty(input.shape)
    reg_wk = 0
    reg_w = 0
    error_sum = 0
    
    # no. of users = 1
    n_u = 1
    # no. of movies
    n_m = input.shape[0]
    # no. of features
    n = X.shape[1]

    for i in range(n_m):
        pred[i] = pred_func(X[i], w=w, b=b)
        
    for k in range(n):
        reg_wk += (w[k] ** 2)
    reg_w += reg_wk 

    for i in non_nan_idx:
        error = (pred[i] - input[i])
        error_sum += error
    
    cost = (error_sum + (lambda_ * reg_w)) / 2
    
    # print(pred)

    return cost

In [13]:
linear_cost_single(input=rating[:,0], X=feature, w=w[0], b=b[0], lambda_=0.001, pred_func=linear_regression)

0.0125

### Cost function: Summary

To learn parameter $w^{(j)}, b^{(j)}$ for user $j$:

$$ J_(w^{(j)}, b^{(j)}) = \frac{1}{2} \sum_{i:r(i,j)=1} (w^{(j)} * x^{(i)} + b^{(j)} - y^{(i, j)})^2 \textcolor{orange}{+ \frac{\lambda}{2} \sum^n_{k=1} (w^{(j)}_k)^2} $$

<br>

To learn parameter $w^{(1)}, b^{(1)}, w^{(2)}, b^{(2)}, \dots, w^{(n_u)}, b^{(n_u)}$ for all users: <br>
<span style="color:grey">In other word sum(user) of sum(cost)</span>

$$ J\binom{w^{(1)}, \dots, w^{(n_u)}} {b^{(j)}, \dots, b^{(n_u)}} = \frac{1}{2}  \sum^{n_u}_{j=1} \sum_{i:r(i,j)=1} (w^{(j)} * x^{(i)} + b^{(j)} - y^{(i, j)})^2 \textcolor{orange}{+ \frac{\lambda}{2} \sum^{n_u}_{j=1} \sum^n_{k=1} (w^{(j)}_k)^2} $$


In [82]:
rating = np.array([
    [5,5,0,0],
    [5,np.NAN,np.NAN,0],
    [np.NAN,4,0,np.NAN],
    [0,0,5,4],
    [0,0,5,np.NAN]
    ])

feature = np.array([
    [0.9, 0],
    [1, 0.01],
    [0.99, 0],
    [0.1, 1],
    [0, 0.9]
    ])

# Lambda for regularization
lambda_ = 0.001

b = np.ones(rating.shape[1])

In [83]:
def cofi_cost_func(input, X, W, b, lambda_):

    n_m, n_u = input.shape
    
    J = 0
    cost = 0
    
    R = np.isfinite(input)
    
    Y = np.nan_to_num(input, nan=0)

    for j in range(n_u):
        # Slice only j user need
        w = W[j,:]
        b_j = b[j]
        for i in range(n_m):
            x = X[i,:]
            y = Y[i,j]
            r = R[i,j]
            J += np.square(r * (np.dot(w, x) + b_j - y))
    
    reg = (lambda_/2) * (np.sum(np.square(W)) + np.sum(np.square(X)))
    cost = J / 2 + reg

    return cost

In [84]:
cofi_cost_func(input=rating, X=feature, W=w, b=b, lambda_=lambda_)

9.114855100000002

In [85]:
def cofi_cost_func_v(input, X, W, b, lambda_):
    # R = np.isfinite(input)
    J = np.square(np.matmul(X, W.T) + b - input)
    J_sum = np.sum(np.nan_to_num(J, nan=0))
    reg = (lambda_/2) * (np.sum(np.square(X)) + np.sum(np.square(W)))
    
    return J_sum /2 + reg

In [86]:
b = np.ones([1, n_u])
cofi_cost_func_v(input=rating, X=feature, W=w, b=b, lambda_=0.001)

9.1148551

## Collaborative filtering algorithm

Problem: If we have paramters learned w,b but not X feature. How could we train it?

In [87]:
rating = np.array([
    [5,5,0,0],
    [np.NAN,np.NAN,np.NAN,0],
    [np.NAN,4,0,np.NAN],
    [0,0,5,4],
    [0,0,5,np.NAN]
    ])

# Create {np.NaN} array
feature = np.ones([5,2])

rating_df = pd.DataFrame(rating, columns=['Alice(0)', 'Bob(1)', 'Carol(2)', 'Dave(3)'])
feature_df = pd.DataFrame(feature, columns=['x_1(romance)', 'x_2(action)'])
table_df = pd.concat([rating_df, feature_df], axis=1)
table_df

Unnamed: 0,Alice(0),Bob(1),Carol(2),Dave(3),x_1(romance),x_2(action)
0,5.0,5.0,0.0,0.0,1.0,1.0
1,,,,0.0,1.0,1.0
2,,4.0,0.0,,1.0,1.0
3,0.0,0.0,5.0,4.0,1.0,1.0
4,0.0,0.0,5.0,,1.0,1.0


Let's say we have learned paremeter $w^{(j)}$, now let's find $x^{(i)}$


In [None]:
# Let's say we have learn parameter w
w = np.empty([4,2])
w[0] = np.array([1,0])
w[1] = np.array([1,0])
w[2] = np.array([0,1])
w[3] = np.array([0,1])

for user $j$ and movie $i$, predict rating: 

$$ w^{(j)} * x^{(i)} + \cancel{b^{(j)}} $$


to find $x^{(i)}$

$ w^{(1)} * x^{(1)} \approx 5 \rightarrow x^{(i)} = \begin{bmatrix} 1 \\ 0 \end{bmatrix} $ <br>

$ w^{(2)} * x^{(1)} \approx 5 \rightarrow x^{(i)} = \begin{bmatrix} 1 \\ 0 \end{bmatrix} $ <br>

$ w^{(3)} * x^{(1)} \approx 0 \rightarrow x^{(i)} = \begin{bmatrix} 0 \\ 1 \end{bmatrix} $ <br>

$ w^{(4)} * x^{(1)} \approx 0 \rightarrow x^{(i)} = \begin{bmatrix} 0 \\ 1 \end{bmatrix} $ <br>

## Collaborative Filtering

gather data from multiple users to predict (Collaborative)

<br>

Given $ w^{(1)}, b^{(1)},w^{(2)}, b^{(2)}, ..., w^{(n_u)}, b^{(n_u)} $

to learn $x^{(i)}$ :

$$ J(x^{(i)}) = \frac{1}{2} \sum_{j:r(i,j)=1} (w^{(j)} * x^{(i)} + b^{(j)} - y^{(i, j)})^2 \textcolor{orange}{+ \frac{\lambda}{2} \sum^n_{k=1} (x^{(i)}_k)^2} $$

<br>

To learn parameter $ x^{(1)}, x^{(2)}, ..., x^{(n_m)} $ : <br>
<span style="color:grey">In other word sum(user) of sum(cost)</span>

$$ J\binom{w^{(1)}, \dots, w^{(n_u)}} {b^{(j)}, \dots, b^{(n_u)}} = \frac{1}{2}  \sum^{n_u}_{i=1} \sum_{j:r(i,j)=1} (w^{(j)} * x^{(i)} + b^{(j)} - y^{(i, j)})^2 \textcolor{orange}{+ \frac{\lambda}{2} \sum^{n_m}_{i=1} \sum^n_{k=1} (x^{(i)}_k)^2} $$

put it together:

$$ \displaystyle\min_{ \substack{ w^{(1)}, \dots, w^{(n_u)} \\ {b^{(1)}, \dots, b^{(n_u)}} \\ {x^{(1)}, \dots, x^{(n_m)}} } } J(w, b, x) =  \frac{1}{2} \sum_{(i,j):r(i,j)=1} (w^{(j)} * x^{(i)} + b^{(j)} - y^{(i, j)})^2 \textcolor{orange}{ + \frac{\lambda}{2} \sum^{n_u}_{j=1} \sum^n_{k=1} (w^{(j)}_k)^2 + \frac{\lambda}{2}  \sum^{n_m}_{i=1} \sum^n_{k=1} (x^{(i)}_k)^2 } $$
    

In [None]:
def collaborative_cost(input, X, w, b, lambda_, pred_func):
    '''
    w np.array(m,n) = matrix of learning parameter
    '''
    
    # Find index of non np.NAN
    non_nan_idx = np.argwhere(np.invert(np.isnan(input)))
    
    # Create empty shape
    pred = np.empty(input.shape)
    reg_wk = 0
    reg_w = 0
    reg_Xk = 0
    reg_X = 0 
    error_sum = 0
    
    # no. of users
    n_u = input.shape[1]
    # no. of movies
    n_m = input.shape[0]
    # no. of features
    n = X.shape[1]

    for j in range(n_u):
        # Our learning depend on user so we loop movies each user first
        for i in range(n_m):
            pred[i,j] = pred_func(X[i], w=w[j], b=b[j])
            
        for k in range(n):
            reg_wk += (w[j,k] ** 2)
            reg_Xk += (X[j,k] ** 2)
        reg_w += reg_wk
        reg_X += reg_Xk 

    for i,j in non_nan_idx:
        error = (pred[i,j] - input[i,j])
        error_sum += error
        
    reg = lambda_ * (reg_w + reg_X)
    cost = (error_sum + reg) / 2
    
    # print(pred)
    
    return cost

In [None]:
collaborative_cost(input=rating, X=feature, w=w, b=b, lambda_=lambda_, pred_func=linear_regression)

-6.99985

To minimized cost
## Gradient Descent

$$ w_k^{(j)} = w_k^{(j)} - \alpha \frac{\delta}{\delta w_k^{(j)}} J(w, b, x) $$

$$ b^{(j)} = b^{(j)} - \alpha \frac{\delta}{\delta b^{(j)}} J(w, b, x) $$

$$ x_k^{(i)} = x_k^{(i)} - \alpha \frac{\delta}{\delta x_k^{(i)}} J(w, b, x) $$

$$ J(w, b, x) =  \frac{1}{2} \sum_{(i,j):r(i,j)=1} (w^{(j)} * x^{(i)} + b^{(j)} - y^{(i, j)})^2 \textcolor{orange}{ + \frac{\lambda}{2} \sum^{n_u}_{j=1} \sum^n_{k=1} (w^{(j)}_k)^2 + \frac{\lambda}{2}  \sum^{n_m}_{i=1} \sum^n_{k=1} (x^{(i)}_k)^2 } $$

$$ \frac{\delta}{\delta w_i^{(j)}}J(w, b, x) = \sum_{(i,j):r(i,j)=1} (w^{(j)} * x^{(i)} + b^{(j)} - y^{(i, j)})w^{(j)} \textcolor{orange}{ + \lambda[\sum^{n_u}_{j=1} \sum^n_{k=1} (w^{(j)}_k) + \sum^{n_m}_{i=1} \sum^n_{k=1} (x^{(i)}_k)]] } $$

In [None]:
def gradient_descent(input, X, w, b, iter, lambda_, alpha, pred_func, cost_func):
    
    # no. of users
    n_u = input.shape[1]
    # no. of movies
    n_m = input.shape[0]
    # no. of features
    n = X.shape[1]
    
    # Get all index all {np.NaN} in {rating}
    non_nan_idx = np.argwhere(np.invert(np.isnan(input)))
    
    # Create empty shape
    pred = np.empty(input.shape)
    
    cost_hist = []
    
    for x in range(iter):
        
        cost_j = 0
        reg_wk = 0
        reg_w = 0
        reg_Xk = 0
        reg_X = 0
    
        for j in range(n_u):
            for i in range(n_m):
                pred[i,j] = pred_func(X[i], w=w[j], b=b[j])
          
            for k in range(n):
                reg_wk += w[j,k] * w[j,k]
            reg_w += reg_wk
            
        for i in range(n_m):
            for k in range(n):
                reg_Xk += X[i,k] * X[i,k]
            reg_X += reg_Xk
        
        reg = lambda_ * (reg_w + reg_X)
        
        for i,j in non_nan_idx:
            error = (pred[i,j] - input[i,j])
            error_sum += error

        for i,j in non_nan_idx:
            w[j] = w[j] - alpha * ((cost_j * X[i]))
            b[j] = b[j] - alpha * (cost_j)
            X[i] = X[i] - alpha * ((cost_j * w[j]))
        
        cost = cost_func(input, X, y, w, b, lambda_)
        cost_hist.append(cost)
        
        # if len(cost_hist) > 1:
        #     if cost_hist[x] - cost <  0.00001:
        #         break
            
    return w, b, X, cost, cost_hist

In [None]:


w_out, b_out, X_out = gradient_descent(input=rating, X=feature, w=w_init, b=b_init, iter=iter, lambda_=lambda_, alpha=alpha, pred_func=linear_regression, cost_func=collaborative_cost)

NameError: name 'w_init' is not defined

In [None]:
table_pred = plot_prediction(input=rating, X=X_out, w=w_out, b=b_out, pred_func=linear_regression)
pd.concat([pd.DataFrame(table_pred, columns=['Alice(0)', 'Bob(1)', 'Carol(2)', 'Dave(3)']), pd.DataFrame(X_out, columns=['x_1(romance)', 'x_2(action)'])], axis=1)

Unnamed: 0,Alice(0),Bob(1),Carol(2),Dave(3),x_1(romance),x_2(action)
0,5.0,5.0,0.0,0.0,0.3,
1,,,,0.0,,
2,,4.0,0.0,,,
3,0.0,0.0,5.0,4.0,0.0,
4,0.0,0.0,5.0,,,


Wow we got 0.31 cost, and we fill all missing data in the table.

## Binary Label

For yes or no, 0 or 1 question.

$$ g(w^{(j)} * x^{(i)} + b^{(j)}) $$

In [None]:
def logistic_regression(X, w, b):
    
    z = np.dot(w, X) + b
    g = 1/(1 + np.exp(-z))
    
    pred = g

    return pred

In [None]:
table_pred = plot_prediction(rating, X_out, w_out, b_out, logistic_regression)
pd.DataFrame(table_pred, columns=['Alice(0)', 'Bob(1)', 'Carol(2)', 'Dave(3)'])

Unnamed: 0,Alice(0),Bob(1),Carol(2),Dave(3)
0,5.0,5.0,0.0,0.0
1,,,,0.0
2,,4.0,0.0,
3,0.0,0.0,5.0,4.0
4,0.0,0.0,5.0,


In [None]:
def logistic_cost(input, X, y, w, b):

    j_wbx = 0
    
    # Get all index all {np.NaN} in {rating}
    non_nan_idx = np.argwhere(np.invert(np.isnan(input)))
    
    for i,j in non_nan_idx:
        z = np.dot(w[j], X[i]) + b[j]
        g = 1/(1 + np.exp(-z))
        pred = g
        
        loss = -y[i,j] * np.log(pred) -(1 - y[i,j]) * np.log(1 - pred)
        j_wbx += loss 
        
    return j_wbx

In [None]:
# logistic_cost(rating, X_out, target, w_out, b_out)

In [None]:
# bi_rating = np.array([
#     [0,1,0,np.NaN],
#     [1,0,np.NaN,0],
#     [1,1,0,np.NaN],
#     [np.NaN,0,1,1],
#     [np.NaN,0,np.NaN,1]])

# n_m =rating.shape[0]
# n_u = rating.shape[1]
# n = feature.shape[1]

# bi_feature = np.zeros([n_m, 2])

# bi_target = np.array([
#     [0,1,0,0],
#     [1,0,1,0],
#     [1,1,0,0],
#     [0,0,1,1],
#     [1,0,1,1]])