In [1]:
# Task:
# An SVD Recommender that predicts the rating a user will give to a movie
# based on the user's own ratings and other users' rating data.

# Use only 'rating' as the data, avoid 'tags' and 'genre'

# 80/20, train/test split. Additionally, do a temporal split. 


In [2]:
# imports
import pandas as pd
from numpy.linalg import svd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
import math
import random

In [3]:
# read data
movies = 'data/movielens-latest-small/movies.csv'
ratings = 'data/movielens-latest-small/ratings.csv'

# to dataframes
df_movies = pd.read_csv(movies)
df_ratings = pd.read_csv(ratings)

# inspect them
display('Movies')
display(df_movies.head())
display('Ratings')
display(df_ratings.head())

'Movies'

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


'Ratings'

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# 80/20, train/test split
df_ratings_x = df_ratings[['userId', 'movieId', 'timestamp']]
df_ratings_y = df_ratings[['rating', 'timestamp']]

x_train, x_test, y_train, y_test = train_test_split(df_ratings_x, df_ratings_y, test_size=0.2, random_state=1)
print(f"Training rows = {x_train.shape[0]}")
print(f"Testing rows = {x_test.shape[0]}")

#display(x_train.head())
#display(x_test.head())
#display(y_train.head())
#display(y_test.head())

# temporal split
tscv = TimeSeriesSplit(n_splits=2, test_size=20000)
for i, (train_index, test_index) in enumerate(tscv.split(x_train, y_train)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

print(f"Training rows temporal split = {train_index.shape[0]}")
print(f"Testing rows temporal split = {test_index.shape[0]}")

Training rows = 80668
Testing rows = 20168
Fold 0:
  Train: index=[    0     1     2 ... 40665 40666 40667]
  Test:  index=[40668 40669 40670 ... 60665 60666 60667]
Fold 1:
  Train: index=[    0     1     2 ... 60665 60666 60667]
  Test:  index=[60668 60669 60670 ... 80665 80666 80667]
Training rows temporal split = 60668
Testing rows temporal split = 20000


In [5]:
# Consider reviews from users with more than 50 reviews
#usercount = df_ratings[['movieId','userId']].groupby('userId').count()
#display(usercount.head())

In [6]:
# Source for SVD stuff: https://machinelearningmastery.com/using-singular-value-decomposition-to-build-a-recommender-system/
# Build a pivot table with movieIds as columns 
# and users and their ratings as rows
rating_matrix = df_ratings.pivot(index="userId", columns="movieId", values="rating").fillna(0)
display(rating_matrix.head())
matrix = rating_matrix.values
display(matrix)


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


array([[4. , 0. , 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [7]:
# Singular value decomposition
u, s, vh = np.linalg.svd(matrix, full_matrices=False)
# We know that the columns of vh are movies
# The rows of u are users

In [8]:
def cosine_similarity(v,u):
    return (v @ u)/ (np.linalg.norm(v) * np.linalg.norm(u))

In [9]:
# Generate similarities to all others users for a single user
def get_similarities(user_index, u):
    similarities = []
    
    for row in range(0, u.shape[0]):
        #if row != user_index:
        similarity  = cosine_similarity(u[user_index, :], u[row, :])
        similarities.append(similarity)
    
    return similarities

similarities = get_similarities(0, u)
#display(similarities)

# Predict a rating using the U matrix from the SVD operation
def predict_rating(user_index, movie_index, ratings_matrix, u, k=5):
    # Get the ratings for the target user and the movie
    target_user_ratings = ratings_matrix[user_index, :]
    movie_ratings = ratings_matrix[:, movie_index]

    # Calculate cosine similarity between the target user and all other users
    # 1-d matrix of length users
    similarities  = get_similarities(user_index, u)
    
    # Sort the similarities and get the indices of the nearest neighbors
    nearest_neighbors = np.argsort(similarities)[::-1][1::]  # Exclude the target user
    
    # Calculate weighted average of ratings from nearest neighbors
    weighted_ratings = 0
    total_similarity = 0
    for neighbor_index in nearest_neighbors:
        neighbor_similarity = similarities[neighbor_index]
        neighbor_rating = ratings_matrix[neighbor_index, movie_index]
        if neighbor_rating != 0:  # Ignore if neighbor hasn't rated the movie
            weighted_ratings += neighbor_similarity * neighbor_rating
            total_similarity += neighbor_similarity
    
    # Predict the rating for the target user
    if total_similarity != 0:
        predicted_rating = weighted_ratings / total_similarity
    else:
        predicted_rating = 0  # In case none of the nearest neighbors have rated the movie
    
    return predicted_rating

predicted_rating = predict_rating(user_index = 0, movie_index = 0, ratings_matrix = matrix, u = u)
print("Predicted rating:", predicted_rating)

Predicted rating: 1.062015503875969


In [18]:
# K-means RMSE Clustering 
# select k random data points (user indexes in u) as initial cluster centers C_1, ..., C_k
k = 5
random.seed(1)

def initialize_centroids(data_points, k):
    # Select k random indexes from the data points list
    centroid_indexes = random.sample(range(data_points.shape[0]), k)
    
    # Construct the list of centroids using the selected indexes
    centroids = [data_points[i] for i in centroid_indexes]
    
    return centroids

def assign_point_to_centroid(point, closest_centroid, assignments):
    # convert centroid to tuple for hashability
    closest_centroid_tuple = tuple(closest_centroid)
    
    display(closest_centroid)
    display(assignments)
    if closest_centroid_tuple not in assignments:
        assignments[closest_centroid_tuple ] = [point]
    else:
        assignments[closest_centroid_tuple ].append(point)

def update_centroid_position(centroid, new_position):
    centroid[:] = new_position
    
initial_centroids = initialize_centroids(u, k)
assignments = {}

# for each p (user) in u, map p_i to its nearest cluster center C_j 
for i, p in enumerate(u):
    # Find the closest centroid to the current data point
    closest_centroid = max(initial_centroids, key=lambda centroid: cosine_similarity(p, centroid))
    
    # Assign the point to the closest centroid    
    assign_point_to_centroid(p, closest_centroid, assignments)
    break

for centroid in initial_centroids:
    # Update centroid position by taking the mean of assigned data points
    
    # convert centroid to tuple for hashability
    centroid_tuple = tuple(centroid)
    
    assigned_points = assignments[centroid_tuple]  # get all data points assigned to a centroid
    if assigned_points:
        new_position = mean(assigned_points)
        
         # calculate Euclidean distance
        difference = np.linalg.norm(centroid - new_position)
        display(difference)
        
        update_centroid_position(centroid, new_position)
        break
        
       

array([-8.58065297e-03,  5.91195617e-03,  1.22308431e-02, -1.13904883e-02,
       -2.34989304e-02, -7.28491110e-03,  1.23433603e-02, -2.13440229e-02,
        2.80708852e-02,  1.55837541e-03,  2.22658774e-03,  9.38638279e-03,
       -8.94429705e-03,  1.27634859e-02,  1.57755362e-02,  9.06190097e-03,
        1.22891899e-02,  9.98297431e-03, -5.96683857e-03,  1.28640381e-02,
        1.53656035e-02,  1.19677609e-02,  3.98036514e-03,  7.81239794e-03,
       -4.79812652e-03, -4.89367945e-03, -8.11135089e-03,  1.25321189e-02,
        7.52847666e-03,  1.87589569e-02,  1.75420479e-02, -1.22629739e-02,
       -3.49279690e-03,  1.42268216e-02, -1.40639639e-02, -1.86853195e-02,
        6.93307812e-03, -1.62249453e-03,  4.84139367e-04, -1.81760651e-02,
        1.23724982e-02, -8.01954942e-03,  3.32582804e-03,  9.08077896e-03,
       -1.05897594e-02, -9.54093819e-04, -3.07715724e-03, -3.61065794e-03,
       -1.77501433e-02,  4.57186085e-03, -4.81890639e-03, -6.27942728e-03,
       -1.58136741e-02,  

{}

KeyError: (-0.0018288086744643725, -0.0021490221680925915, -0.003172909059712625, -0.0017594331713546606, 0.00011317166677794145, 0.000443520168753731, 0.0022473445936203786, -0.0010652385590969836, -0.002645763924895561, 0.00544033281487763, -4.679681680276613e-05, 0.0005802457720369058, -0.005152682754894517, 0.0003809002527677763, -0.0015549816290862358, 0.0017334989354834292, -0.0036127634769211967, 0.002348353228834049, -0.0023735408468744134, -0.006205691601139467, -0.002324729617467638, 0.009088688588650544, -0.0009527461229227132, 0.003226851193006082, -0.0010767574957852755, -0.0057991495583760745, -0.0012456310233366936, -0.0014081808778273436, -0.00470886675100656, 0.0005877087249213844, 0.003479487149897314, 0.008131858765849535, -0.0029134975431994465, 0.007016994151983856, -0.00025525612906256764, 0.001177190932352785, 0.0035700311713014994, -0.00132428163316986, -0.003718701439533457, 0.001725648423079484, 0.0031272729978018728, -0.005559343511407475, 0.002614736987686156, 0.0016161819296590294, 0.0017271181725503184, -0.002820898304116757, -0.01154358035585255, 0.008096394080220667, 0.0011234557885217056, 0.007286422560779679, -0.0005028759883890672, 0.002557403572064832, -0.007106871373008422, -0.004095465051061535, -0.00030358353520686717, 0.006173505231667629, 0.004246659961167984, -0.005258560346520148, 0.01025450528534236, -0.00755829590407844, 0.0008642085675647096, -0.0016153393608904412, -0.0005061830359734927, -0.0019846303493439577, -0.0035049937505164225, -0.0069339435564938785, 0.003197780480354903, 0.0015193773568733262, -0.00591346472254492, -0.01244409239948892, -0.0008714782498061261, -0.00635648394441648, -0.000657970736518907, -0.0001719486717577743, 0.004769810684957875, 0.0020105037704303045, 0.0089915708386838, 0.010700409714474996, 0.007837878519381608, 0.006250113858843569, -0.0008654465678653369, 0.0036519961741827674, 0.007215042093707004, 0.013078383037232924, 0.008337263754836094, 0.00514231580509952, 0.006491937607584904, 0.003051533121381329, -0.008225328623067248, -0.011003786075976735, 0.015261659306873846, 0.005861448176725054, 0.014197404427498183, -0.009348477784056751, 0.001508748897124968, 0.004832983771132771, 0.015660431554031247, 0.0027102730593902085, 0.008814471265298851, 0.0007124494481142226, -0.01980562149113306, -0.011705193974497596, -0.021088500554057664, 0.008029139026329303, 0.007906100148785447, -0.0050293677390145005, 0.004598818605690714, 0.01288197282373305, 0.0019985818494857362, -0.0009610838120021366, -0.009884054655126604, 0.0024357022213709134, 0.012948717323964892, 0.0005663188640924737, -0.012759916947888595, -0.0125181007098604, -0.0043214253406490925, 0.004463060299518863, -0.004011736857181069, 0.010519836135952572, 0.0011312114534223422, 0.0004696140363753536, 0.0003432172172384536, -0.0077895743099177985, -0.015070138537692564, -0.0006129757454745727, -0.010407455998208678, 0.006784417813883505, 0.0009305428516245453, -0.0029993475366289803, 0.014552672430512845, -0.006068566337220161, -0.0026182109946998296, 0.009921109261454581, 0.0035912564717283345, 0.017623934426679325, -0.016931131426021255, -0.0043476963315355836, -0.01688649578193005, -0.002444122707634422, 0.012788084711917957, 0.017191789017538976, -0.005217402291109259, -0.006889272867086969, 0.019999272023281212, 0.011462162705687104, 0.004388466801683122, 0.007426238767580121, 0.009693019589262868, -0.01698260422724512, 0.004961352447389226, -4.024703891638507e-05, 0.02526911056096486, 0.0011874797288272186, -0.00652823518249153, 0.013111833622945443, -0.024168031950550214, 0.02146749264215489, 0.01054356551881949, 0.010416879189972857, -0.007799658692672023, 0.007270533283876622, -0.005979523975559018, -0.0031663654474651717, -0.009802149196306157, -0.00797487724796218, 0.006275740056634165, 0.004689365206756456, 0.010981897570200634, 0.017571244474902444, 3.3613603858849254e-05, 0.01941032046932667, -0.018410830787732414, 0.03603207035154927, -0.02494755615112228, -0.008868679672866121, -0.002588540999618809, -0.0032309658742549737, 0.004436203767315611, -0.012707646525359833, 0.0026990307518849933, 0.023034414494979005, -0.022135450074177943, -0.016979783684242713, 0.016060598417249976, -0.0029703222901084894, 0.0006365588173920672, -0.014924524686725017, -0.021614276760543137, -0.01214719880174953, 0.005516411743448418, -0.023438791352094614, 0.016783556902721125, -0.00322509757826041, -0.007795322717673661, 0.010440146884110876, -0.014520936063575417, -0.0118869638959479, -0.017054619393173768, 0.016773963057388826, 0.0006731465835554864, 0.019056026533546308, 0.00468788118294802, -0.010888085873588535, 0.010956682962005656, -0.02307044704854138, 0.012623388766282601, -0.012586291065178268, 0.01591570552440746, -0.000233571574782246, 0.009394078670161178, -0.02016003601832443, 0.007925410257709417, -0.015500731852179667, -0.017403707149775294, 0.019638395320660505, -0.008226983313941467, 0.0072563856601428525, 0.00789731254705759, -0.009525867434355068, 0.0015214722427233315, -0.019907870854394983, 0.014759525578048539, 0.03146478444173347, -0.00232257221934462, -0.0037064575767984643, -0.0037026581651368974, 0.007257552308694369, 0.0023714866962917178, 0.008347531928318148, -0.01874337856546803, -0.028189805796073705, -0.0033943927961430998, -0.008705630325120273, 0.014063820176241042, -0.050022659310577894, -0.001701481564983834, 0.011205324695797662, -0.0071037323966818625, -0.012065907338695739, 0.018058317705523366, 0.020641054420682155, 0.0033267728062627774, 0.018513632066651242, -0.030076865528637982, -0.0171186412436265, 0.013046125114233791, -0.021724701991916187, -0.006226325893615631, 0.02699987736840614, -0.0035116095017259184, -0.009919669162252042, -0.025037841253673972, 0.010259294109324454, 0.025075723755315957, 0.012166292908286618, -0.03950490566683949, -0.00987296313969903, 0.001508513336771715, 0.016002706798865625, 0.0065495219125794426, -0.012518796307050767, 0.007877498277655128, -0.0218499315124444, 0.025668788979001628, 0.01923144933539747, 0.017533610211501568, 0.03190261690486691, -0.0068151094067382696, -0.0025793902988600933, 0.0009353954524340103, -0.03245322409009152, -0.016703503394383402, 0.000747762007928335, 0.009722132098117446, 0.01155620604426229, -0.011592467310062363, -0.010140085386050122, -0.004730583223919572, 0.035280387473601786, -0.017124372956140152, 0.021100080180435642, 0.012485198193918537, 0.0010331205541178395, -0.048177194507895774, 0.021135593327259096, 0.0167661946454859, 0.018093937333955624, -0.007226991007942414, -0.009580007082119152, -0.009322856842918854, -0.03403064964420109, -0.012959733698117173, -0.017869477267979734, 0.02666835070458147, -0.027976637598990925, 0.0031500054084905357, -0.04844543703229595, -0.018273565784308544, 0.024793701947967775, 0.03973971356294459, 0.036933556783367655, -0.027301894603775204, -0.019468115336918546, 0.03388020028031232, 0.03482632750160883, -0.03694222530437738, 0.03398493012792609, -0.01346836714450172, 0.013873765029362035, -0.01617435008303074, -0.016969099182114845, -0.007698336656984198, 0.015262314215634218, 0.0013259853495684792, 0.022189946279485662, -0.013403731417569786, 0.034990559638285346, 0.010302993120541153, 0.019983148966121453, -0.022826724436323075, 0.0135499177477591, 0.002036440770802875, -0.021140062245084867, -0.010547790496281809, -0.05666024382311188, -0.027134702380110245, 0.05622420250194107, 0.0032969753682722466, -0.0159199927438294, -0.008728771974739976, 0.004929981409840573, -0.0320833428989742, 0.021131864821391153, 0.011404288922190605, -0.013564718662228522, 0.07448263933445722, -0.006635187820011188, -0.03415446874185546, 0.011488823332784826, 0.05844935513319829, -0.020400591538024274, -0.03271875273282063, -0.0037985201492102064, 0.0075340231824826895, -0.025253267723510612, -0.0035058074555014236, 0.04345763691534728, -0.05527112365205787, -0.05626178997309323, -0.002118715483350705, -0.005462966525836668, -0.05707399782926463, -0.0597654592235524, 0.0072316948452640535, 0.021531367036489057, 0.06080826650243089, -0.0562748964223031, 0.02727076387669744, 0.004867991303020668, 0.019084508492258172, 0.08381430661922655, -0.09212346858050223, -0.013015813780533788, -0.025726083983199068, -0.029305674005748863, 0.0019509359289666523, -0.045315367117557154, -0.07062150026444641, -0.02788092428021234, 0.010894681171528612, 0.004922234692353628, -0.07454951783628512, 0.007634125202918139, -0.004745002999548935, -0.0735448727581704, -0.03782536055575199, -0.04357796072686308, -0.05503511469661398, 0.01187989787821144, -0.03340761241722627, 0.017832060497246515, -0.045843152801978794, -0.06656471128515709, -0.03838328745153723, -0.09509605839049033, 0.03634769245007256, -0.018439699133157152, -0.04286080043854474, 0.05424565438605581, 0.040151468490446277, -0.009674393707547281, -0.024854770723697378, -0.03476717590245413, 0.02699735146292918, -0.07414922056899809, -0.007409662147161723, -0.0741974308544868, -0.0065916614019449525, 0.016192821342796017, -0.0007771281476389091, -0.13375619996156082, -0.09848873918315944, 0.13670019959170823, -0.017492876026354274, 0.028250864752273046, -0.03320906267449991, -0.015529747633466137, -0.0033054038810249337, 0.045910322034598966, -0.0233699410368189, -0.1057025264877957, 0.009168820139385681, 0.021084212534880926, 0.0691368189368438, -0.07673331092598178, -0.027978419719556606, -0.10235578842286953, 0.08593239372444465, -0.0884594909245443, -0.04192715508410661, -0.014111149641420687, 0.007323104233759866, 0.08269921197311009, -0.03619433823112763, 0.10698471550171434, -0.023002459031582514, -0.15322625041760782, -0.08482364993421386, 0.10429642187775168, -0.02238860343911359, 0.03442605778189992, 0.07899224836892367, 0.0386164155279867, -8.108563555170572e-05, 0.15751110629013607, 0.014906863810798593, -0.11244607381463313, -0.03021308014485382, -0.10253395652299113, -0.05861777101172079, -0.0840327413936648, -0.09113338231962392, -0.21690648135170235, -0.08337213452240036, -0.01823454170047136, -0.051680320579441406, -0.018120931212862165, -0.13083018879649672, 0.08308420799296277, -0.017938534472066222, 0.05006219049866714, -0.09173522360863609, 0.004222144303931435, -0.16146182764004482, 0.19628169750959135, 0.023474649732004756, -0.05245815852479045, 0.07820528752268015, -0.06095817672499148, 0.03982224448455945, 0.013310700749029018, -0.017873486146654766, 0.012246046633033827, -0.014421543154054682, -0.11893972489585664, 0.09636160821595836, -0.1639061934609323, -0.0444074563820653, 0.01134964934349562, 0.09451689372284067, 0.09860703497121685, 0.0744735024954077, -0.07285432184914942, -0.006603611076126149, 0.18753297697747523, 0.08470906704564399, -0.062217935350841545, -0.10522439031448921, 0.07154346450218342, 0.13420676879199714, 0.0030220910375012237, 0.02551136861676226, -0.10513687606400267, 0.031832529794623574, -0.02860208942259642, 0.05332371606501931, 0.016456757497521658, -0.0887751605299686, 0.09694246094738061, 0.03593696408157788, 0.05693094709198879, -0.033845725982502524, -0.07787575951764142, -0.0010128096475896199, -0.05909988147166385, -0.1003900293574391, 0.07184917561411815, 0.065655138710126, 0.11504923573262857, 0.050249354964049094, 0.09252471016153341, 0.04903338672047832, -0.05762120619699891, 0.019164397898404566, -0.027924234883771316, -0.15131971154300253, -0.05510496054257936, -0.054484335438590784, -0.012316566195131917, -0.023112143247906353, 0.054089264548379895, 0.004259346933833301, -0.05072416683191316, -0.023314429185383262, -0.004358714746457026, 0.03789909250561671, -0.04427883750355917, 0.0013075534874205466, 0.048753896375418576, -0.016555293190348083, 0.09355012141833431, -0.03409080869361756, -0.06519578687618621, -0.09434295456192665, -0.04007383052043677, 0.10922885784416643, 0.009181143941754722, 0.0669894892249737, 0.01666679023371679, -0.030750139996297318, -0.033936453326726505, 0.043749402239256906, 0.011175586512049161, -0.0025842853260800436, 0.019557750726634404, -0.013136157275237008, 0.049908869534993086, 0.0303256939374257, -0.021971093727821165, -0.029448525549400712, -0.052361512476182794, 7.916935578694301e-05, 0.027896071084466352, 0.006424860229679335, -0.0241315751662448, -0.09636755839878186, -0.0196485432204383, 0.013734266267531717, -0.0076118436288361395, 0.016354539272454633, -0.011545236270854919, -0.001335212773033651, -0.0308760254104943, 0.016057317349913842, -0.039050849688028355, -0.03118751743736183, -0.013101851124853793, -0.03338303644197081, -0.009959532302760473, 0.02705830677365968, -0.02472080789207564, 0.06112733186581912, -0.012831284702012197, -0.015384339320674931, 0.003583808258295328, 0.0027780750058533877, 0.0044368887061811985, -0.009612972682931568, -0.022908859528074848, -0.015557695281663338, 0.01587994815149446, -0.009369992544073358, -0.03351488064031897, -0.006154503314337546, 0.009602837203876162, -0.024384855421565862, -0.005004444203890751, -0.0224494500234454, 0.026838294637197405, -0.025813884297304725, 0.03604017042473609, -0.010194242121692555, -0.030024039389982694, -0.031247277188347296, 0.00963406175955688, -0.021513143824950887, -0.01633963911000383, 0.002172368797039211, -0.005343571492873998, -0.008958191216376364, -0.011226658768235169, -0.007883381446107842, -0.012332381077692973, -0.0019506662810327147, -0.01208117240572394, -0.004988568388620383, 0.02124517792344231, 0.014074054688875771, 0.0041661643221091674, 0.01224857102608028, 0.0046258036009597945, 0.0035772056832539794, -0.0018569014035671291, 0.005085059826239759, -0.009653024685908259, 0.012889188202861078, -0.0008072616896703752, -0.0022681965527318448, 0.006213361860612752)

In [11]:
# Evaluate rmse

# for each user, predict rating for every movie
# check distance from actual, if review exists
total_error_squared = 0

for row in range(0, u.shape[0]):
    user_total_error_squared = 0
    user_evaluations = 0
    
    for col in range(0, u.shape[1]):
        predicted_rating = predict_rating(user_index = row, movie_index = col, ratings_matrix = matrix, u = u)
        actual_rating = matrix[row][col]
        if actual_rating != 0:
            user_evaluations += 1
            user_total_error_squared += (predicted_rating - actual_rating)**2
    
    user_total_error = math.sqrt(user_total_error_squared)
    print("User total error:", user_total_error)
    print("User evaluations:", user_evaluations)
    #break
total_error = math.sqrt(total_error_squared)
print("Total error:", total_error)

User total error: 16.05560880831214
User evaluations: 42
User total error: 4.013240739342899
User evaluations: 2
User total error: 6.988303485138862
User evaluations: 5


KeyboardInterrupt: 

In [11]:
# Find the highest similarity

lowest_similarity = np.inf
highest_similarity = -np.inf
highest_sim_col = -1
for col in range(1,vh.shape[1]):
    similarity = cosine_similarity(vh[:,0], vh[:,col])
    if similarity > highest_similarity:
        highest_similarity = similarity
        highest_sim_col = col
    if similarity < lowest_similarity:
        lowest_similarity = similarity

print("highest_similarity is %s" % (highest_similarity))
print("lowest_similarity is %s" % (lowest_similarity))

print("Column %d (movie id %s) is most similar to column 0 (movie id %s)" %
        (highest_sim_col, rating_matrix.columns[col], rating_matrix.columns[0])
)

highest_similarity is 0.08144239149393877
lowest_similarity is -0.06146904339534752
Column 542 (movie id 193609) is most similar to column 0 (movie id 1)


In [12]:
# Predict a rating
def predict_rating(user_id, movie_id, ratings_matrix, k=5):
    # Get the ratings for the target user and the movie
    target_user_ratings = ratings_matrix[user_id, :]
    movie_ratings = ratings_matrix[:, movie_id]
    
    predicted_rating = 0
    total_similarity = 0
    
     # Calculate cosine similarity between the target user and all other users
    for i, user_ratings in enumerate(ratings_matrix):
        if i == user_id:
            continue  # Skip the target user
        
        similarity = cosine_similarity(target_user_ratings, user_ratings)
        neighbor_rating = ratings_matrix[i, movie_id]
        
        if neighbor_rating != 0:  # Ignore if neighbor hasn't rated the movie
            predicted_rating += similarity * neighbor_rating
            total_similarity += similarity

    if total_similarity != 0:
        predicted_rating /= total_similarity
    
    return predicted_rating

# Example usage
# Assuming ratings_matrix is your pivot matrix where rows represent users and columns represent movies
# and user_id and movie_id are valid indices in your matrix
predicted_rating = predict_rating(user_id = 0, movie_id = 0, ratings_matrix = matrix)
print("Predicted rating:", predicted_rating)

Predicted rating: 3.9086166688351893
