In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the whitespace-separated ratings file (one rating per line:
# user id, movie id, rating, timestamp) directly into a typed DataFrame.
# The timestamp is parsed as an integer so that sorting on it is numeric
# rather than a character-by-character string comparison.
rating_df = (
    pd.read_csv(
        "./u.data",
        sep=r"\s+",
        header=None,
        names=['user_id', 'movie_id', 'rating', 'timestamp'],
        dtype={
            'user_id': 'int64',
            'movie_id': 'int64',
            'rating': 'float64',
            'timestamp': 'int64',
        },
    )
    # Within each (user, movie) pair the most recent rating ends up last,
    # which is what the later de-duplication step (keep='last') relies on.
    .sort_values(by=['user_id', 'movie_id', 'timestamp'])
    .reset_index(drop=True)
)
rating_df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,5.0,874965758
1,1,2,3.0,876893171
2,1,3,4.0,878542960
3,1,4,3.0,876893119
4,1,5,3.0,889751712
...,...,...,...,...
99995,943,1067,2.0,875501756
99996,943,1074,4.0,888640250
99997,943,1188,3.0,888640250
99998,943,1228,3.0,888640275


In [3]:
def keep_movies_rated_by_at_least(df, perc):
    """Keep only the rows of movies that have enough ratings.

    A movie is kept when its number of rows in ``df`` is at least
    ``perc`` times the number of distinct ``user_id`` values in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings in long format; must contain ``user_id`` and ``movie_id``.
    perc : float
        Fraction of the distinct users (e.g. ``0.33``) a movie must reach.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows, in their original order and
        with their original index labels. ``df`` itself is not modified.
    """
    # The threshold does not depend on the movie, so compute it once instead
    # of once per group.
    min_ratings = perc * df['user_id'].nunique()
    # Per-row count of how many rows share that row's movie_id.
    ratings_per_movie = df.groupby('movie_id')['movie_id'].transform('size')
    # .copy() hands back an independent frame so that later in-place edits by
    # the caller cannot trigger chained-assignment warnings.
    return df[ratings_per_movie >= min_ratings].copy()

# Keep only movies whose number of rating rows is at least 0.33 x the number
# of distinct users. NOTE: rating_df is rebound here, so re-running this cell
# filters the already-filtered frame again.
rating_df = keep_movies_rated_by_at_least(rating_df, 0.33)

In [4]:
# If the same (user_id, movie_id) pair appears in more than one row, keep only
# the last of those rows; with rows ordered by timestamp inside each pair that
# is the rating with the largest timestamp.

def remove_duplicates(df):
    """Return one rating per (user_id, movie_id) pair, without the timestamp.

    For every repeated pair the *last* row in ``df``'s current row order is
    kept, so ``df`` is expected to be ordered by ``timestamp`` within each
    pair (the load cell sorts by user_id, movie_id, timestamp).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``movie_id`` and ``timestamp`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with duplicates removed and ``timestamp`` dropped.
        ``df`` is left untouched, so the calling cell can be re-run safely
        (the in-place version failed on a second run because ``timestamp``
        had already been removed from the caller's frame).
    """
    deduplicated = df.drop_duplicates(subset=['user_id', 'movie_id'], keep='last')
    return deduplicated.drop(columns='timestamp')

In [5]:
# Collapse repeated (user_id, movie_id) rows and drop the timestamp column,
# then display the result.
data = remove_duplicates(rating_df)
data

Unnamed: 0,user_id,movie_id,rating
0,1,1,5.0
6,1,7,4.0
49,1,50,5.0
55,1,56,4.0
68,1,69,3.0
...,...,...,...
99882,943,181,4.0
99896,943,204,3.0
99898,943,210,4.0
99913,943,237,4.0


In [6]:
# Users x movies rating matrix; (user, movie) pairs without a rating become 0.
temp = data.pivot_table(values='rating', index='user_id', columns='movie_id').fillna(0)
# Rebuild the frame from the raw arrays so that the row/column labels are
# plain arrays of user ids / movie ids without the pivot's axis names.
df = pd.DataFrame(
    temp.to_numpy(),
    index=temp.index.to_numpy(),
    columns=temp.columns.to_numpy(),
)

In [7]:
# Display the user x movie rating matrix (0 marks a missing rating).
df

Unnamed: 0,1,7,50,56,69,79,98,100,117,121,...,237,258,269,286,288,294,300,313,405,748
1,5.0,4.0,5.0,4.0,3.0,4.0,4.0,5.0,3.0,4.0,...,2.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,4.0,3.0,4.0,4.0,3.0,1.0,4.0,5.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0
4,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,4.0,5.0,5.0,0.0,0.0,0.0
5,4.0,0.0,4.0,0.0,1.0,3.0,3.0,5.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
940,0.0,4.0,4.0,5.0,2.0,0.0,4.0,3.0,0.0,0.0,...,0.0,5.0,4.0,3.0,0.0,4.0,5.0,5.0,0.0,0.0
941,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,4.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0
942,0.0,0.0,5.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,...,0.0,4.0,2.0,0.0,0.0,0.0,5.0,3.0,0.0,0.0


In [8]:
# Hold out 20% of the rows of df (one row per user) as the test set;
# random_state=42 fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, random_state=42)



In [9]:
from scipy.linalg import svd

def do_svd(mat, k=0, option=False):
    """Rank-``k`` truncated singular value decomposition of ``mat``.

    Parameters
    ----------
    mat : array-like, shape (m, n)
        Matrix to decompose.
    k : int, default 0
        Number of leading singular triplets to keep. With the default of 0
        nothing is kept and the results are empty. Values above ``min(m, n)``
        are effectively capped, since there are only ``min(m, n)`` singular
        values.
    option : bool, default False
        Selects what is returned (see below).

    Returns
    -------
    numpy.ndarray
        When ``option`` is true: the ``k`` largest singular values, in
        descending order.
    tuple of (pandas.DataFrame, pandas.DataFrame)
        When ``option`` is false: ``(U, VT)`` where ``U`` holds the first
        ``k`` left singular vectors as columns (m x k) and ``VT`` the first
        ``k`` right singular vectors as rows (k x n).
    """
    # Reduced SVD: only the first min(m, n) singular vectors are ever sliced
    # below, so there is no point in building the full m x m / n x n bases.
    U, Sigma, VT = svd(mat, full_matrices=False)
    if option:
        # Skip building the DataFrames when only the spectrum is requested.
        return Sigma[:k]
    return pd.DataFrame(U[:, :k]), pd.DataFrame(VT[:k, :])

In [10]:
# Number of leading singular values/vectors to retain (passed as k to do_svd).
keep = 18

In [11]:
# First `keep` left singular vectors of the training matrix, one per column.
# NOTE: every do_svd call recomputes the decomposition of `train`.
U = do_svd(train, k=keep)[0]

In [12]:
# First `keep` right singular vectors of the training matrix, one per row;
# column j corresponds to the j-th column of `train`.
VT = do_svd(train, k=keep)[1]

In [13]:
# Display the truncated right-singular-vector matrix.
VT

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,-0.207064,-0.179968,-0.295864,-0.202478,-0.16856,-0.185094,-0.212677,-0.229168,-0.171266,-0.180173,...,-0.164315,-0.190886,-0.100618,-0.147258,-0.151271,-0.142706,-0.134403,-0.136905,-0.142502,-0.09848
1,0.004588,0.009227,-0.022963,-0.151704,-0.152534,-0.17475,-0.162916,0.074983,0.042576,0.005726,...,0.097371,0.344364,0.223591,0.312783,0.238151,0.22512,0.356782,0.319942,0.003302,0.217539
2,-0.254198,-0.134728,-0.077911,0.203755,0.089568,0.057691,0.179281,0.021444,-0.303422,-0.311076,...,-0.19792,0.061211,0.319518,0.378339,0.041396,-0.061048,-0.018047,0.199469,-0.263521,-0.072002
3,0.076278,0.189425,0.06118,0.072093,-0.076617,-0.093722,0.061496,0.554561,0.02661,-0.035,...,0.170733,-0.164712,0.134095,0.188772,-0.086192,-0.210374,-0.208459,-0.259831,-0.03906,-0.18071
4,-0.08604,0.261388,-0.48715,0.280386,0.06639,0.086691,0.240239,0.193361,0.140278,0.046278,...,0.130913,-0.069345,-0.070495,-0.047916,0.289719,0.105145,0.032362,0.009477,0.110658,-0.004406
5,0.217389,-0.20181,-0.1728,-0.266974,0.175131,-0.026453,-0.168677,0.02739,0.016996,0.118402,...,0.276236,-0.190607,0.300086,0.348833,-0.439744,-0.058809,0.028636,0.063068,0.115198,-0.039903
6,-0.012554,0.259314,0.085628,-0.026036,-0.220352,-0.147432,-0.066982,0.092174,-0.170912,-0.067069,...,-0.271525,0.418107,0.201525,0.020398,-0.00756,-0.136239,-0.201219,-0.247809,-0.107701,0.011314
7,0.057355,0.104437,-0.019871,-0.085221,-0.119655,-0.121597,-0.122836,0.10959,0.024975,0.05565,...,-0.100643,-0.181676,0.059652,-0.224383,0.163631,-0.216625,-0.315639,0.733418,0.039164,-0.150596
8,-0.284191,0.010722,-0.164783,0.132607,-0.041327,0.225756,-0.063001,0.030942,0.218798,0.231594,...,-0.032852,0.392041,0.14521,-0.091088,-0.301092,-0.488523,0.064342,0.106989,0.221793,-0.076881
9,-0.042957,-0.086715,-0.053961,-0.048791,-0.165942,0.115462,-0.120746,-0.039465,0.074814,0.06191,...,-0.108076,-0.206342,0.646207,-0.416758,0.202493,0.088516,0.23174,-0.229585,0.061898,0.007055


In [14]:
# The `keep` largest singular values of the training matrix (1-D array).
sigma = do_svd(train, k=keep, option=True)

In [15]:
# Turn the 1-D singular values into a diagonal matrix.
# NOTE: np.diag applied to a 2-D array extracts its diagonal, so running this
# cell a second time turns `sigma` back into a 1-D array.
sigma = np.diag(sigma)

In [16]:
# Sanity check: expected to be (keep, keep) after the np.diag step.
sigma.shape

(18, 18)

In [17]:
def rmse(true, pred):
    """Root-mean-square error between two equally shaped arrays.

    Computes ``sqrt(mean((true - pred) ** 2))`` over all elements.
    """
    residuals = true - pred
    mean_squared_error = np.mean(residuals * residuals)
    return np.sqrt(mean_squared_error)

In [18]:
def test_(U, VT, test):
    predictions = []
    actual = []
    recc = []
    for i, user in test.iterrows():
        for movie, rating in user.items():
            if rating != 0 and movie in VT.T.columns:
                # print(user.values.reshape(1, -1).shape, VT.T.shape, sigma.shape)
                U_user = user.values.reshape(1, -1) @ VT.T @ np.linalg.inv(sigma)
                predicted_rating = np.dot(U_user, np.dot(sigma, VT.T.loc[movie]))[0]
                predictions.append(predicted_rating)
                actual.append(rating)
                recc.append((rating, predicted_rating, i))
                recc.sort(key=lambda x: x[0], reverse=True)
            
    predictions = np.array(predictions)
    actual = np.array(actual)
    return rmse(predictions, actual), recc
        

In [19]:
# e: RMSE on the held-out users; r: list of (actual, predicted, user) tuples.
e, r = test_(U, VT, test)

In [20]:
# Re-order in place, descending by (user label, actual rating, predicted rating).
r.sort(key=lambda x: (x[2], x[0], x[1]), reverse=True)

In [21]:
# Display the RMSE over the rated test entries.
e

2.0957667704672907

In [22]:
# Display every (actual, predicted, user) tuple; the list can be long.
r

[(4.0, 4.835539899226424, 938),
 (4.0, 3.841601533591482, 938),
 (3.0, 0.8141189112539091, 929),
 (3.0, 2.3841257720525597, 923),
 (5.0, 1.0258611993086915, 922),
 (5.0, 2.73913231536277, 893),
 (5.0, 4.209078074863237, 887),
 (4.0, 2.0479841433862274, 887),
 (5.0, 4.912698749956151, 886),
 (4.0, 3.029326790371479, 886),
 (5.0, 3.0937656919371257, 885),
 (3.0, 3.4102046502905243, 885),
 (3.0, 1.0429194260378596, 872),
 (5.0, 5.022388777140469, 868),
 (4.0, 4.96946643601361, 868),
 (4.0, 4.796266860526114, 854),
 (3.0, 4.3305745691810404, 854),
 (5.0, 4.934912947877928, 838),
 (5.0, 2.889440596623904, 838),
 (5.0, 5.3330846056660395, 831),
 (4.0, 3.8532074122565994, 831),
 (4.0, 3.2876738528867477, 817),
 (4.0, -0.1993244651844217, 817),
 (4.0, 1.1525431420991232, 777),
 (4.0, 5.023001764734895, 764),
 (4.0, 3.571169856168205, 764),
 (5.0, 4.861012118814106, 758),
 (4.0, 2.452095877397948, 742),
 (3.0, 4.862589551021109, 742),
 (5.0, 3.6394818963103113, 738),
 (4.0, 1.837237775862229, 7