In [1]:
import numpy as np
import pandas as pd

In [2]:
small_data_pathname = '/Users/maxperozek/ML-CP341/3/ml-100k/u.data'

small_data = pd.read_csv(small_data_pathname, sep='\t', header=None)
small_data.columns = ['user_id','item_id','rating','timestamp']

In [3]:
small_data

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [4]:
np.unique(small_data.loc[:,['item_id']].to_numpy()).shape

(1682,)

In [131]:
# map 1-5 values to 0-1
def normalize(x, old_range, new_range=(0,1)):
    new = ((x - old_range[0])/(old_range[1] - old_range[0])) * (new_range[1] - new_range[0]) + new_range[0]
    return new

def build_rating_vecs(rating_df, normalize_rating=False):
    user_ids = np.unique(rating_df.loc[:,['user_id']].to_numpy())
    movie_ids = np.unique(rating_df.loc[:,['item_id']].to_numpy())
    
    # placeholer
    embeddings = np.full((movie_ids.shape[0], user_ids.shape[0]), np.nan)
    
    for _, row in rating_df.iterrows():
        movie = np.where(movie_ids == row['item_id'])[0][0]
        user = np.where(user_ids == row['user_id'])[0][0]
        
        rating = row['rating']
        
        # if normalize_rating:
        #     rating = normalize(rating, old_range=(1,5))
        
        embeddings[movie,user] = rating
    
    # return embeddings
    X = embeddings

    avg_ratings = np.nanmean(X, axis=1)
    no_ratings = np.argwhere(np.isnan(X))

    for i in range(no_ratings.shape[0]):
        X[no_ratings[i,0],no_ratings[i,1]] = avg_ratings[no_ratings[i,0]]
        
    return normalize(X,old_range=(1,5))

In [132]:
X = build_rating_vecs(small_data, normalize_rating=True)

In [134]:
np.where(X == 0)

(array([   0,    0,    0, ..., 1677, 1677, 1677]),
 array([ 14, 198, 330, ..., 940, 941, 942]))

In [128]:
# X = build_rating_vecs(small_data, normalize_rating=True)

# avg_ratings = np.nanmean(X, axis=1)
# avg_ratings[np.where(avg_ratings == 0)] = 2.5
# no_ratings = np.argwhere(np.isnan(X))

# for i in range(no_ratings.shape[0]):
#     X[no_ratings[i,0],no_ratings[i,1]] = avg_ratings[no_ratings[i,0]]

In [23]:
from scipy.spatial.distance import cdist

In [24]:
all_dists = cdist(X,X)

In [25]:
def euc(a, b): 
    return np.sqrt(np.sum((a-b) ** 2))

In [26]:
a = np.zeros((900,))
b = np.zeros((900,))

a[1] = 1
b[800] = 1

euc(a,b)

1.4142135623730951

In [27]:
euc(X[0,:],X[1,:])

14.874474780643517

In [28]:
all_dists

array([[ 0.        , 14.87447478, 15.3785565 , ..., 16.07404741,
        16.05654072, 16.05654072],
       [14.87447478,  0.        ,  7.74596669, ...,  6.89202438,
         6.86931583,  6.86931583],
       [15.3785565 ,  7.74596669,  0.        , ...,  5.61248608,
         5.62916512,  5.58457698],
       ...,
       [16.07404741,  6.89202438,  5.61248608, ...,  0.        ,
         0.55901699,  0.55901699],
       [16.05654072,  6.86931583,  5.62916512, ...,  0.55901699,
         0.        ,  0.70710678],
       [16.05654072,  6.86931583,  5.58457698, ...,  0.55901699,
         0.70710678,  0.        ]])

In [29]:
dists_no_self = (np.eye(all_dists.shape[0]) * 1e10) + all_dists

In [30]:
np.where(dists_no_self == 0)

(array([ 313,  313,  313, ..., 1677, 1677, 1677]),
 array([ 436,  438,  598, ..., 1658, 1660, 1670]))

In [31]:
dists_no_self.shape

(1682, 1682)

In [32]:
k = 100
idx_list = np.argsort(dists_no_self.reshape((-1,)))[:k]

In [33]:
idx_list

array([2640631, 2647347, 2647346, 2647344, 2647342, 2647333, 2647331,
       2252981, 2647279, 2647278, 2647271, 2671000, 2671001, 2647217,
       2671003, 2647193, 2297361, 2647159, 2647158, 2647151, 2647348,
       2647349, 2647350, 2647351, 2647372, 2647371, 2647369, 2647368,
       2647367, 2647366, 2647365, 2647364, 2647362, 2647149, 2647361,
       2297423, 2297422, 2647357, 2647356, 2647355, 2647354, 2647353,
       2647352, 2297415, 2647360, 2647148, 2647144, 2647139, 2297272,
       2646637, 2297270, 2646615, 2297269, 2297268, 2646569, 2297263,
       2646384, 2646643, 2297258, 2646222, 2646099, 2297254, 2644099,
       2297249, 2644092, 2297238, 2297237, 2673989, 2646224, 2647386,
       2297277, 2297281, 2647137, 2297337, 2647134, 2647133, 2647128,
       2647126, 2647125, 2647124, 2647119, 2297278, 2647114, 2647105,
       2297303, 2297302, 2647094, 2647093, 2297293, 2297292, 2297288,
       2297283, 2647110, 2755011, 2647403, 2647411, 2648251, 2648066,
       2781711, 2647

In [34]:
def get_idx(idx, col_len):
    row = int(idx/col_len)
    col = idx % col_len
    return row, col

In [35]:
r, c = get_idx(idx_list[0], dists_no_self.shape[0])

In [36]:
dists_no_self[r,c]

0.0

In [37]:
# sanity check

movie_id_list = []
for i in range(10):
    r, c = get_idx(idx_list[i], dists_no_self.shape[0])
    movie_id_list.append((r,c))

In [38]:
# https://stackoverflow.com/questions/15956169/parsing-a-pipe-delimited-file-in-python
info_pathname = '/Users/maxperozek/ML-CP341/3/ml-100k/u.item'

fileHandle = open(info_pathname, 'r', encoding="ISO-8859-1")

movie_info = []
for line in fileHandle:
    fields = line.split('|')
    movie_info.append(fields)

fileHandle.close()

movie_info = np.array(movie_info)

In [39]:
movie_info.shape

(1682, 24)

In [40]:
for row, col in movie_id_list:
    print(movie_info[row-1,1])
    print(movie_info[col-1,1], '\n')

Vie est belle, La (Life is Rosey) (1987)
Spirits of the Dead (Tre passi nel delirio) (1968) 

Spirits of the Dead (Tre passi nel delirio) (1968)
Tigrero: A Film That Was Never Made (1994) 

Spirits of the Dead (Tre passi nel delirio) (1968)
Clean Slate (Coup de Torchon) (1981) 

Spirits of the Dead (Tre passi nel delirio) (1968)
Aparajito (1956) 

Spirits of the Dead (Tre passi nel delirio) (1968)
Condition Red (1995) 

Spirits of the Dead (Tre passi nel delirio) (1968)
Show, The (1995) 

Spirits of the Dead (Tre passi nel delirio) (1968)
Frankie Starlight (1995) 

Stefano Quantestorie (1993)
Milk Money (1994) 

Spirits of the Dead (Tre passi nel delirio) (1968)
Modern Affair, A (1995) 

Spirits of the Dead (Tre passi nel delirio) (1968)
Window to Paris (1994) 



In [41]:
all_mins = np.amin(dists_no_self, axis=0)
largest_min = np.amax(all_mins)
dissimilar_movie_id = np.where(all_mins == largest_min)
movie_name = movie_info[dissimilar_movie_id[0] -1,1][0]

movie_name

'Secrets & Lies (1996)'

In [42]:
np.unique(np.argsort(all_dists)[:,0]).shape

(1575,)

# Test Predictions

In [135]:
train_pathname = '/Users/maxperozek/ML-CP341/3/ml-100k/u1.base'
test_pathname = '/Users/maxperozek/ML-CP341/3/ml-100k/u1.test'

train_df = pd.read_csv(train_pathname, sep='\t', header=None)
test_df = pd.read_csv(test_pathname, sep='\t', header=None)
train_df.columns = ['user_id','item_id','rating','timestamp']
test_df.columns = ['user_id','item_id','rating','timestamp']

train_mat = train_df.to_numpy()

train_data = build_rating_vecs(train_df,normalize_rating=True)

In [136]:
train_mat

array([[        1,         1,         5, 874965758],
       [        1,         2,         3, 876893171],
       [        1,         3,         4, 878542960],
       ...,
       [      943,      1188,         3, 888640250],
       [      943,      1228,         3, 888640275],
       [      943,      1330,         3, 888692465]])

In [137]:
dists = cdist(train_data,train_data)
dists = (np.eye(dists.shape[0]) * 1e10) + dists

In [138]:
dists.shape

(1650, 1650)

In [139]:
test_X = test_df.loc[:,['user_id','item_id']].to_numpy()
test_y = np.squeeze(test_df.loc[:,['rating']].to_numpy())

In [140]:
test_X

array([[  1,   6],
       [  1,  10],
       [  1,  12],
       ...,
       [459, 934],
       [460,  10],
       [462, 682]])

In [141]:
def weighted_average(ratings, distances):
    
    # return arr.mean()
    # print('old',arr.mean())

    total_dist = distances.sum()

    if total_dist == 0:
        total_dist = 1e-10
    weights_pcts = (distances / total_dist)
    weights_pcts = weights_pcts[np.flip(np.argsort(weights_pcts))]

    # print('new',np.sum(ratings * weights_pcts))
    return np.sum(ratings * weights_pcts)
    


def test_preds(train_mat, dists, test_X, test_y, k):
    
    error = np.empty((test_X.shape[0],))
    # for each rating in the held out test dataset
    for i in range(test_X.shape[0]):
        this_user = test_X[i,0]
        target_movie = test_X[i,1]
        
        # the movies this user has rated:
        user_rated_movies = train_mat[np.where(train_mat[:,0] == this_user),1]
        
        # get the k nearest movies
        dist_2_target = np.squeeze(dists[target_movie, user_rated_movies])
        KNN = np.argsort(dist_2_target)[:k]
        
        # get average rating of KNN
        pred = weighted_average(train_mat[KNN,2], dist_2_target[KNN])
        
        error[i] = (pred - test_y[i]) ** 2
        
    return error 


In [142]:
error = test_preds(train_mat, dists, test_X, test_y, 5)

In [143]:
np.mean(error)

1.6616075500686143