In [None]:
import numpy as np
import pandas as pd
import sklearn
import numpy.ma as ma
import random
# Seed both random number generators used by this notebook.
# `random.seed` only affects the standard-library generator; the fold shuffles
# (`DataFrame.sample` without `random_state`) and the factor initialisation
# (`np.random.rand`) draw from NumPy's global generator, which must be seeded
# separately for the results to be reproducible.
random.seed(3)
np.random.seed(3)

In [None]:
# Movie table from the ml-1m folder. Column names are supplied explicitly via
# `names`; the multi-character '::' separator is handled by the Python parser.
movies = pd.read_csv(
    filepath_or_buffer='Assignment_1/ml-1m/movies.dat',
    engine='python',
    encoding='latin-1',
    sep="::",
    names=['MovieID', 'Title', 'Genres'],
)

# Preview the first rows.
movies.head()

In [None]:
# Ratings table from the ml-1m folder: one (UserID, MovieID, Rating, Timestamp)
# record per row. Column names are supplied explicitly via `names`; the
# multi-character '::' separator is handled by the Python parser.
ratings = pd.read_csv(
    filepath_or_buffer='Assignment_1/ml-1m/ratings.dat',
    engine='python',
    encoding='latin-1',
    sep="::",
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
)

# Preview the first rows.
ratings.head()

In [None]:
# User table from the ml-1m folder. Column names are supplied explicitly via
# `names`; the multi-character '::' separator is handled by the Python parser.
users = pd.read_csv(
    filepath_or_buffer='Assignment_1/ml-1m/users.dat',
    engine='python',
    encoding='latin-1',
    sep="::",
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
)

# Preview the first rows.
users.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn.linear_model import LinearRegression


In [None]:
## Global rating method
# Baseline: every held-out rating is predicted with the mean rating of the
# training folds. Evaluated with 5-fold cross-validation over shuffled ratings.

shuffled = ratings.sample(frac = 1)
folds = np.array_split(shuffled, 5)

rmses = []
maes = []

for held_out, test_fold in enumerate(folds):
    train_fold = pd.concat([fold for pos, fold in enumerate(folds) if pos != held_out])
    # Constant prediction minus the true ratings of the held-out fold.
    residuals = train_fold['Rating'].mean() - test_fold['Rating']
    rmses.append(np.sqrt((residuals ** 2).mean()))
    maes.append(residuals.abs().mean())

In [None]:
## Per user rating
# Baseline: predict each held-out rating with the mean rating its user gave in
# the training folds; users that do not occur in the training folds fall back
# to the global training mean. Evaluated with 5-fold cross-validation.

x = ratings.sample(frac = 1)
X = np.array_split(x, 5)

rmses = []
maes = []

for i in range(5):
    X_test = X[i]
    X_train = pd.concat([X[j] for j in range(5) if j!=i])

    # Fallback value, computed once per fold instead of once per unseen user.
    average_score = X_train['Rating'].mean()
    # One grouped pass replaces a full boolean-mask scan per unique user.
    rat = X_train.groupby('UserID')['Rating'].mean()

    # Look up each test row's user mean; unseen users map to NaN and are filled.
    reccomendations = X_test['UserID'].map(rat).fillna(average_score).to_numpy()
    errors = reccomendations - X_test['Rating'].to_numpy()

    rmse = np.sqrt(np.mean(errors**2))
    rmses += [rmse]
    mae = np.mean(np.abs(errors))
    maes += [mae]

In [None]:
## Per item rating
# Baseline: predict each held-out rating with the mean rating its movie received
# in the training folds; movies that do not occur in the training folds fall
# back to the global training mean. Evaluated with 5-fold cross-validation.

x = ratings.sample(frac = 1)
X = np.array_split(x, 5)

rmses = []
maes = []

for i in range(5):
    X_test = X[i]
    X_train = pd.concat([X[j] for j in range(5) if j!=i])

    # Fallback value, computed once per fold instead of once per unseen movie.
    average_score = X_train['Rating'].mean()
    # One grouped pass replaces a full boolean-mask scan per unique movie.
    rat = X_train.groupby('MovieID')['Rating'].mean()

    # Look up each test row's movie mean; unseen movies map to NaN and are filled.
    reccomendations = X_test['MovieID'].map(rat).fillna(average_score).to_numpy()
    errors = reccomendations - X_test['Rating'].to_numpy()

    rmse = np.sqrt(np.mean(errors**2))
    rmses += [rmse]
    mae = np.mean(np.abs(errors))
    maes += [mae]

In [None]:
##Linear regression
# Predict a rating as a linear combination of the movie's training average and
# the user's training average, fitted once without and once with an intercept.
# Evaluated with 5-fold cross-validation.
###TO DO:
# ADJUST FOR >5 OR <1
# MAKE ENTRIES INTEGER?

x = ratings.sample(frac = 1)
X = np.array_split(x, 5)
simrmses = [] #simple = no intercept
simmaes = []
rmses = []
maes = []

for i in range(5):
    X_test = X[i]
    X_train = pd.concat([X[j] for j in range(5) if j!=i])

    train=X_train.pivot(index='MovieID', columns='UserID', values='Rating') #Training set into matrix form
    colavg=train.mean() #User average
    rowavg=train.mean(axis=1) #Movie average
    average_score = np.mean(X_train['Rating']) #Average score (in case missing values for test set)

    # One (Movie_avg, User_avg, Rating) row per training rating, built with
    # vectorised lookups instead of one `.loc` access per rating.
    df = pd.DataFrame({
        'x1': X_train['MovieID'].map(rowavg).to_numpy(),
        'x2': X_train['UserID'].map(colavg).to_numpy(),
        'y': X_train['Rating'].to_numpy(),
    })
    simplereg=LinearRegression(fit_intercept=False).fit(df[['x1', 'x2']], df['y']) #Linear regression without intercept
    reg=LinearRegression().fit(df[['x1', 'x2']], df['y']) #Linear regression with intercept

    # Test-set features. Movies/users with no training average map to NaN and
    # get the global average instead. This covers every id, including the
    # largest MovieID/UserID, which the former `range(max)` fill loops skipped.
    movie_avg_test = X_test['MovieID'].map(rowavg).fillna(average_score).to_numpy()
    user_avg_test = X_test['UserID'].map(colavg).fillna(average_score).to_numpy()
    actual = X_test['Rating'].to_numpy()

    #Recommendation computed with linear regression without intercept
    simplereccom = simplereg.coef_[0]*movie_avg_test + simplereg.coef_[1]*user_avg_test
    #Recommendation computed with linear regression with intercept
    reccom = reg.intercept_ + reg.coef_[0]*movie_avg_test + reg.coef_[1]*user_avg_test

    simrmse = np.sqrt(np.mean((simplereccom - actual)**2))
    simrmses += [simrmse]
    simmae = np.mean(np.abs(simplereccom - actual))
    simmaes += [simmae]

    rmse = np.sqrt(np.mean((reccom - actual)**2))
    rmses += [rmse]
    mae = np.mean(np.abs(reccom - actual))
    maes += [mae]

In [None]:
#MATRIX FACTORIZATION
# Google Colab: Runtime > Change runtime type > GPU/TPU
# Learns num_factors latent factors per movie (rows of M) and per user
# (columns of U) by stochastic gradient descent on the training ratings, then
# scores the held-out fold.


def _mf_test_errors(movie_factors, user_factors, movie_pos, user_pos, known, actual, fallback):
    """Return prediction minus actual rating for every test row.

    Rows flagged in `known` are predicted with the dot product of their movie
    and user factors; all other rows are predicted with `fallback`.
    """
    preds = np.full(actual.shape, fallback, dtype=float)
    preds[known] = np.sum(
        movie_factors[movie_pos[known]] * user_factors[:, user_pos[known]].T, axis=1
    )
    return preds - actual


x = ratings.sample(frac = 1)
X = np.array_split(x, 5)
learn_rate=0.005
reg=0.05
num_factors=10
num_iter=75

rmses = []
maes = []

for i in range(1):  # only the first fold is trained and evaluated
    X_test = X[i]
    X_train = pd.concat([X[j] for j in range(5) if j!=i])
    train=X_train.pivot(index='MovieID', columns='UserID', values='Rating')

    # Factor matrices are kept as plain arrays during training; per-element
    # `.loc` access on DataFrames dominated the runtime of the update loop.
    M_arr = np.random.rand(train.shape[0], num_factors)
    U_arr = np.random.rand(num_factors, train.shape[1])

    # Training triples as integer positions into M_arr / U_arr, visited in
    # (MovieID, UserID) order like the stacked rating matrix.
    ordered = X_train.sort_values(['MovieID', 'UserID'])
    movie_pos = train.index.get_indexer(ordered['MovieID'])
    user_pos = train.columns.get_indexer(ordered['UserID'])
    train_ratings = ordered['Rating'].to_numpy(dtype=float)

    # Test triples; get_indexer returns -1 for ids absent from the training
    # folds. Those rows are predicted with the global training mean, as in the
    # baseline cells, instead of being dropped from the metrics.
    test_movie_pos = train.index.get_indexer(X_test['MovieID'])
    test_user_pos = train.columns.get_indexer(X_test['UserID'])
    known = (test_movie_pos >= 0) & (test_user_pos >= 0)
    test_ratings = X_test['Rating'].to_numpy(dtype=float)
    global_avg = train_ratings.mean()

    iterat=1
    rmse=100
    newrmse=99

    # Keep training until the iteration budget is used up or the held-out RMSE
    # rises by 0.01 or more between consecutive epochs.
    # NOTE(review): stopping on the test fold leaks test information into
    # training; a separate validation split would avoid that.
    while (iterat < num_iter and newrmse-rmse < 0.01 ):
        print('Set: ' + str(i) + ', iteration: ' + str(iterat))
        iterat += 1
        for m, u, rating in zip(movie_pos, user_pos, train_ratings):
            # Copies make both updates use the pre-update factor values.
            movie = M_arr[m].copy()
            user = U_arr[:, u].copy()
            error = rating - np.dot(movie, user)
            M_arr[m] = movie + learn_rate * (2*error*user - reg*movie)
            U_arr[:, u] = user + learn_rate * (2*error*movie - reg*user)

        rmse=newrmse
        test_errors = _mf_test_errors(
            M_arr, U_arr, test_movie_pos, test_user_pos, known, test_ratings, global_avg
        )
        # Averaged over individual test ratings, not over per-user means.
        newrmse=np.sqrt(np.mean(test_errors**2))

    # Expose the learned factors and the dense reconstruction as labelled frames.
    M = pd.DataFrame(M_arr, index=train.index)
    U = pd.DataFrame(U_arr, columns=train.columns)
    factoriz = pd.DataFrame(M_arr @ U_arr, index=train.index, columns=train.columns)

    # Recomputed here so the MAE is defined even if the loop above never ran.
    test_errors = _mf_test_errors(
        M_arr, U_arr, test_movie_pos, test_user_pos, known, test_ratings, global_avg
    )
    rmses += [newrmse]
    mae = np.mean(np.abs(test_errors))
    maes += [mae]

In [None]:
# Mean of the entries of `train` that are neither NaN nor infinite.
ma.masked_invalid(train).mean()

In [None]:
# Average RMSE over the folds for the no-intercept linear regression
# (`simrmses` is filled by the linear regression cell).
np.mean(simrmses)

In [None]:
# Average MAE over the folds for the no-intercept linear regression
# (`simmaes` is filled by the linear regression cell).
np.mean(simmaes)

In [None]:
# Average of `rmses`. Several cells above reset and refill this list, so the
# value shown belongs to whichever of those cells was executed last.
np.mean(rmses)

In [None]:
# Average of `maes`. Several cells above reset and refill this list, so the
# value shown belongs to whichever of those cells was executed last.
np.mean(maes)

In [None]:
# Per-fold RMSE values from whichever cell filled `rmses` last.
rmses

In [None]:
# Per-fold MAE values of the no-intercept linear regression.
simmaes