In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [2]:
df = pd.read_csv('https://files.grouplens.org/datasets/movielens/ml-100k/u.data', delimiter=r'\t',
                 names=['user_id', 'item_id', 'rating', 'timestamp']) 
 
r = df.pivot(index='user_id', columns='item_id', values='rating').values

  df = pd.read_csv('https://files.grouplens.org/datasets/movielens/ml-100k/u.data', delimiter=r'\t',


In [3]:
print(f"total nan element: {np.count_nonzero(np.isnan(r))} total not nan element: {np.count_nonzero(~np.isnan(r))}")

total nan element: 1486126 total not nan element: 100000


# Functions


In [40]:
def find_text_indexes(data:np.ndarray,ratio:float=0.20)->np.ndarray:
    """
    data: numpy array
    ratio: float, ratio of the data to be splitted
    return: indexes of test data
     """
    not_nan_elements_indexes=np.argwhere(~np.isnan(r))
    split_number = int(len(not_nan_elements_indexes)*ratio)
    idx = np.random.choice(len(not_nan_elements_indexes), split_number, replace=False)
    return not_nan_elements_indexes[idx]

In [41]:
def train_data_maker(data:np.ndarray, indexes:np.ndarray)->np.ndarray:
    """
    r: numpy array
    indexes: indexes of test data
    return: train data
    """
    return_data=data.copy()
    for index in indexes:
        return_data[index[0]][index[1]] = np.nan
    
    return return_data

In [69]:
def part_1_function(raw_data:np.ndarray,train_data:np.ndarray,test_indexes:np.ndarray,alpha:float=0.001,epoch:int=500):
  """
  raw_data: numpy array
  train_data: numpy array
  test_indexes :numpy array
  alpha: float

  raw_data for validation total error
  train_data for training
  test_indexes for test data indexes
  alpha is learning rate

  """
  m,n=train_data.shape
  #initilize user_b and item_b
  user_b= np.random.random(m) 
  item_b=np.random.random(n)
  row,col=np.where(~np.isnan(train_data))
  with trange(epoch) as epoch_size:
    for _ in epoch_size:
      total_error=0
      test_error=0
      prev_user_b=user_b.copy()
      prev_item_b=item_b.copy()
      for i,j in zip(row,col):
        y_pred=user_b[i]+item_b[j]
        e=train_data[i][j]-y_pred
        #gradients for user_b and item_b
        g_user=-e
        g_item=-e
        #update user_b and item_b
        user_b[i]-=alpha*g_user
        item_b[j]-=alpha*g_item
        total_error+=e**2
        if np.linalg.norm(user_b - prev_user_b) < (alpha / 10) and np.linalg.norm(item_b - prev_item_b) < (alpha / 10):  
          print(f"I do early stoping at iteration {_}")
          break
      for test_index in test_indexes:
            test_pred=user_b[test_index[0]]+item_b[test_index[1]]
            test_error+=raw_data[test_index[0]][test_index[1]]-test_pred
      epoch_size.set_description(f'Total Square Error: {total_error:.2f} Validation Square Error: {test_error:.2f}')
   
   
  


In [None]:
def part_2_function(raw_data:np.ndarray,train_data:np.ndarray,test_indexes:np.ndarray,alpha:float=0.001,lambd:float=0.1,epoch:int=500)->np.ndarray:
  """
  raw_data: numpy array
  train_data: numpy array
  test_indexes :numpy array
  alpha: float
  lambd: float

  raw_data for validation total error
  train_data for training
  test_indexes for test data indexes
  alpha is learning rate
  lambd for regulurazation to avoid overfit

  """
  m,n=train_data.shape
  test_error=0
  #initilize user_b and item_b
  user_b= np.random.random(m) 
  item_b=np.random.random(n)
  row,col=np.where(~np.isnan(train_data))
  with trange(epoch) as epoch_size:
    for _ in epoch_size:
      total_error=0
      test_error=0
      prev_user_b=user_b.copy()
      prev_item_b=item_b.copy()
      for i,j in zip(row,col):
        y_pred=user_b[i]+item_b[j]
        e=train_data[i][j]-y_pred
        #gradients for user_b and item_b
        g_user=-e+ lambd* user_b[i]
        g_item=-e + lambd *item_b[j]
        #update user_b and item_b
        user_b[i]-=alpha*g_user
        item_b[j]-=alpha*g_item
        total_error+=e**2
        if np.linalg.norm(user_b - prev_user_b) < (alpha / 10) and np.linalg.norm(item_b - prev_item_b) < (alpha / 10):  
          print(f"I do early stoping at iteration {_}")
          break
      for test_index in test_indexes:
            test_pred=user_b[test_index[0]]+item_b[test_index[1]]
            test_error+=raw_data[test_index[0]][test_index[1]]-test_pred
      epoch_size.set_description(f'Total Square Error: {total_error:.2f} Validation Square Error: {test_error:.2f}')
  return test_error

In [42]:
test_indexes=find_text_indexes(r)
train_data=train_data_maker(r,test_indexes)


In [11]:
?part_1_function

[1;31mSignature:[0m
[0mpart_1_function[0m[1;33m([0m[1;33m
[0m    [0mraw_data[0m[1;33m:[0m [0mnumpy[0m[1;33m.[0m[0mndarray[0m[1;33m,[0m[1;33m
[0m    [0mtrain_data[0m[1;33m:[0m [0mnumpy[0m[1;33m.[0m[0mndarray[0m[1;33m,[0m[1;33m
[0m    [0mtest_indexes[0m[1;33m:[0m [0mnumpy[0m[1;33m.[0m[0mndarray[0m[1;33m,[0m[1;33m
[0m    [0malpha[0m[1;33m:[0m [0mfloat[0m [1;33m=[0m [1;36m0.001[0m[1;33m,[0m[1;33m
[0m    [0mepoch[0m[1;33m:[0m [0mint[0m [1;33m=[0m [1;36m500[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m [1;33m->[0m [0mnumpy[0m[1;33m.[0m[0mndarray[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
raw_data: numpy array
train_data: numpy array
test_indexes :numpy array
alpha: float

raw_data for validation total error
train_data for training
test_indexes for test data indexes
alpha is learning rate
[1;31mFile:[0m      c:\users\pain\appdata\local\temp\ipykernel_7884\3522702363.py
[1;31mType:[0m      function


In [59]:
part_1_function(raw_data=r,train_data=train_data,test_indexes=test_indexes)

Total Square Error: 41609.90 Validation Square Error: 94.15: 100%|██████████| 500/500 [04:47<00:00,  1.74it/s]  


In [65]:
val_error=part_2_function(raw_data=r,train_data=train_data,test_indexes=test_indexes)

Total Square Error: 43300.77 Validation Square Error: 2246.16: 100%|██████████| 500/500 [05:06<00:00,  1.63it/s]


# Search for best lambda 

In [64]:
val_errors=[]
lambdas=[0.1,0.2,0.4,0.5]
for lambd in lambdas:
    val_error=part_2_function(raw_data=r,train_data=train_data,test_indexes=test_indexes,lambd=lambd)
    print(val_error)
    val_errors.append(val_error)
print(f"Best lambda value is : {lambdas[val_errors.index(min(val_errors))}")]