In [84]:
from collections import defaultdict
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import myfm
from myfm import RelationBlock
from scipy import sparse as sps

from sklearn.model_selection import train_test_split
import pandas as pd

def load_cil(dataset="split", data_dir="../data"):
    """Load a CIL ratings CSV as a frame with ``user``, ``movie``, ``rating`` columns.

    Parameters
    ----------
    dataset : str
        ``"test"`` reads ``sampleSubmission.csv`` and returns it as the
        validation frame. ``"train"`` reads ``data_train.csv`` and returns it
        as the training frame. Any other value (default ``"split"``) reads
        ``data_train.csv`` and splits it 80/20 with ``random_state=42``.
    data_dir : str
        Directory that contains the CSV files.

    Returns
    -------
    tuple
        ``(train_data, val_data, user_num, movie_num)``. Whichever frame does
        not apply to the requested ``dataset`` is ``None``. ``user_num`` and
        ``movie_num`` are the fixed constants 10000 and 1000.
    """
    file = "data_train" if dataset != "test" else "sampleSubmission"
    raw = pd.read_csv(f'{data_dir}/{file}.csv', index_col=0)

    # Each row label is "<prefix char><user>_<prefix char><movie>": split on
    # "_" and drop the first character of each part to get the integer ids.
    id_parts = raw.index.str.split('_')

    # Build a fresh frame rather than slicing and re-assigning columns, so the
    # result owns its data and later column assignments by callers do not
    # raise SettingWithCopyWarning.
    data = pd.DataFrame(
        {
            'user': id_parts.str[0].str[1:].astype('int32').to_numpy(),
            'movie': id_parts.str[1].str[1:].astype('int32').to_numpy(),
            'rating': raw['Prediction'].astype('uint8').to_numpy(),
        },
        index=raw.index,
    )

    user_num = 10000  # int(data['user'].max() + 1)
    movie_num = 1000  # int(data['movie'].max() + 1)
    print("User num: {}, Movie num: {}".format(user_num, movie_num))

    train_data = val_data = None
    if dataset == "test":
        val_data = data
    elif dataset == "train":
        train_data = data
    else:
        train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

    return train_data, val_data, user_num, movie_num

In [85]:
# Load the whole training file without a hold-out split; in "train" mode
# load_cil returns None for the validation frame.
train_data, val_data, user_num, movie_num = load_cil(dataset="train")

User num: 10000, Movie num: 1000


In [86]:
# Load the sample-submission rows as the frame to predict on; in "test" mode
# the first return value is None, so it is discarded.
_, val_data, user_num, movie_num = load_cil(dataset="test")

User num: 10000, Movie num: 1000


In [87]:
# index "0" is reserved for unknown ids.
# Known ids are numbered 1..n in sorted order; any other id falls back to 0.
user_to_index = defaultdict(
    lambda: 0,
    ((uid, position) for position, uid in enumerate(np.unique(train_data.user), start=1)),
)
movie_to_index = defaultdict(
    lambda: 0,
    ((mid, position) for position, mid in enumerate(np.unique(train_data.movie), start=1)),
)
# One extra slot per mapping for the reserved index 0.
USER_ID_SIZE = 1 + len(user_to_index)
MOVIE_ID_SIZE = 1 + len(movie_to_index)

In [88]:
# Switches for the optional implicit-feedback feature groups.
use_iu = True  # add the implicit user feature (movies rated by the user)
use_ii = True  # add the implicit item feature (users who rated the movie)

# movie id -> users that rated it, and user id -> movies they rated,
# collected from the training ratings in row order.
movie_vs_watched = {}
user_vs_watched = {}
for user_id, movie_id in zip(train_data.user, train_data.movie):
    movie_vs_watched.setdefault(movie_id, []).append(user_id)
    user_vs_watched.setdefault(user_id, []).append(movie_id)

# No per-rating feature matrix is used; these are passed as X to fit/predict.
X_date_train = None
X_date_test = None

In [89]:
# given user/movie ids, add additional infos and return it as sparse
def augment_user_id(user_ids):
    """Build the sparse user-side feature rows for ``user_ids``.

    Every row contains a one-hot encoding of the user over ``USER_ID_SIZE``
    columns (column 0 for ids missing from ``user_to_index``). When ``use_iu``
    is true it is followed by ``MOVIE_ID_SIZE`` columns in which each movie
    listed for the user in ``user_vs_watched`` is set to
    ``1 / sqrt(number of listed movies)``.

    Parameters
    ----------
    user_ids : sequence
        User ids, one output row per entry.

    Returns
    -------
    scipy.sparse CSR matrix
        Shape ``(len(user_ids), USER_ID_SIZE [+ MOVIE_ID_SIZE])``.
    """
    n_rows = len(user_ids)

    X_uid = sps.lil_matrix((n_rows, USER_ID_SIZE))
    for row, user_id in enumerate(user_ids):
        # .get() rather than [] so that looking up an unseen id does not
        # insert it into the defaultdict and silently grow the mapping.
        X_uid[row, user_to_index.get(user_id, 0)] = 1
    Xs = [X_uid]

    if use_iu:
        X_iu = sps.lil_matrix((n_rows, MOVIE_ID_SIZE))
        for row, user_id in enumerate(user_ids):
            watched_movies = user_vs_watched.get(user_id, [])
            # max(..., 1) keeps the normalizer finite for users with no movies.
            normalizer = 1 / max(len(watched_movies), 1) ** 0.5
            for movie_id in watched_movies:
                X_iu[row, movie_to_index.get(movie_id, 0)] = normalizer
        Xs.append(X_iu)

    return sps.hstack(Xs, format='csr')

def augment_movie_id(movie_ids):
    """Build the sparse movie-side feature rows for ``movie_ids``.

    Every row contains a one-hot encoding of the movie over ``MOVIE_ID_SIZE``
    columns (column 0 for ids missing from ``movie_to_index``). When
    ``use_ii`` is true it is followed by ``USER_ID_SIZE`` columns in which
    each user listed for the movie in ``movie_vs_watched`` is set to
    ``1 / sqrt(number of listed users)``.

    Parameters
    ----------
    movie_ids : sequence
        Movie ids, one output row per entry.

    Returns
    -------
    scipy.sparse CSR matrix
        Shape ``(len(movie_ids), MOVIE_ID_SIZE [+ USER_ID_SIZE])``.
    """
    n_rows = len(movie_ids)

    X_movie = sps.lil_matrix((n_rows, MOVIE_ID_SIZE))
    for row, movie_id in enumerate(movie_ids):
        # .get() rather than [] so that looking up an unseen id does not
        # insert it into the defaultdict and silently grow the mapping.
        X_movie[row, movie_to_index.get(movie_id, 0)] = 1
    Xs = [X_movie]

    if use_ii:
        X_ii = sps.lil_matrix((n_rows, USER_ID_SIZE))
        for row, movie_id in enumerate(movie_ids):
            watched_users = movie_vs_watched.get(movie_id, [])
            # max(..., 1) keeps the normalizer finite for movies with no users.
            normalizer = 1 / max(len(watched_users), 1) ** 0.5
            for user_id in watched_users:
                X_ii[row, user_to_index.get(user_id, 0)] = normalizer
        Xs.append(X_ii)

    return sps.hstack(Xs, format='csr')

In [90]:
# Feature rows are built once per distinct id; the inverse index returned by
# np.unique maps every rating row back to its row in that matrix.

# Training side.
train_uid_unique, train_uid_index = np.unique(train_data['user'], return_inverse=True)
train_mid_unique, train_mid_index = np.unique(train_data['movie'], return_inverse=True)
user_data_train = augment_user_id(train_uid_unique)
movie_data_train = augment_movie_id(train_mid_unique)

# Prediction side.
test_uid_unique, test_uid_index = np.unique(val_data['user'], return_inverse=True)
test_mid_unique, test_mid_index = np.unique(val_data['movie'], return_inverse=True)
user_data_test = augment_user_id(test_uid_unique)
movie_data_test = augment_movie_id(test_mid_unique)

In [91]:
# Each RelationBlock pairs the per-rating inverse index with the matrix of
# feature rows for the distinct user (or movie) ids it points into.
block_user_train = RelationBlock(train_uid_index, user_data_train)
block_movie_train = RelationBlock(train_mid_index, movie_data_train)
block_user_test = RelationBlock(test_uid_index, user_data_test)
block_movie_test = RelationBlock(test_mid_index, movie_data_test)

In [93]:
# Rating columns as plain NumPy arrays: the regression target and the
# reference values used by the metrics cell.
y_train = train_data['rating'].to_numpy()
y_test = val_data['rating'].to_numpy()

In [94]:
# Passed as `rank` to myfm.MyFMRegressor.
FM_RANK = 10
# Passed as `n_iter` to fit().
N_ITER = 512
# Passed as `n_kept_samples` to fit().
N_KEPT_SAMPLES = 200

In [95]:
# One group per feature block, in the column order produced by
# augment_user_id followed by augment_movie_id. The implicit-feature groups
# only exist when their flag is enabled, so the list is derived from the
# flags instead of hard-coding all four groups.
group_shapes = (
    [USER_ID_SIZE]
    + ([MOVIE_ID_SIZE] if use_iu else [])
    + [MOVIE_ID_SIZE]
    + ([USER_ID_SIZE] if use_ii else [])
)

# X_date_train is None: all features come from the two relation blocks.
fm_rb = myfm.MyFMRegressor(rank=FM_RANK).fit(
    X_date_train, y_train,
    X_rel=[block_user_train, block_movie_train],
    group_shapes=group_shapes,
    n_iter=N_ITER, n_kept_samples=N_KEPT_SAMPLES
)

alpha = 1.13 w0 = 3.61 : 100%|██████████| 512/512 [03:55<00:00,  2.17it/s]


In [98]:
# Predict for every row of val_data using the fitted model and the
# prediction-side relation blocks (X_date_test is None).
test_prediction = fm_rb.predict(
    X_date_test,
    X_rel=[block_user_test, block_movie_test]
)
# Store the predictions next to the ids; this adds a column to val_data in place.
val_data['Prediction'] = test_prediction

In [83]:
# Root-mean-square and mean-absolute error of the predictions against y_test.
# NOTE(review): y_test comes from whatever val_data currently holds - confirm
# it contains real ratings before trusting these numbers.
rmse = np.mean((y_test - test_prediction) ** 2) ** 0.5
mae = np.mean(np.abs(y_test - test_prediction))
print(f'rmse={rmse}, mae={mae}')

rmse=0.973083165848429, mae=0.7761185158641162


In [None]:
# 512, 50 ->  rmse=0.9774482247691249
# 512, 10 -> rmse=0.975...

In [99]:
# Write the Prediction column, together with the frame's index, to CSV.
val_data['Prediction'].to_csv("../lightning_logs/BFM/predictions_bfm_plus_plus_grouped.csv")

# Investigate std of predictions

In [129]:
# get all predictions

def get_prediction_matrix(fm_rb, n_users=10000, n_movies=1000):
    """Return the dense user x movie prediction grid for every kept sample.

    The model is read as having exactly ``n_users + n_movies`` features: the
    first ``n_users`` rows of ``w_samples[s]`` / ``V_samples[s]`` belong to
    users and the remaining ``n_movies`` rows to movies. For each sample the
    grid is ``w0 + user_bias + movie_bias + V_user @ V_movie.T``.

    Parameters
    ----------
    fm_rb : object
        Fitted model exposing ``w0_samples``, ``w_samples`` and ``V_samples``.
    n_users, n_movies : int
        Number of user and movie feature rows.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_samples, n_users, n_movies)``. With the
        default sizes this is very large, since every sample holds
        ``n_users * n_movies`` float64 values.

    Raises
    ------
    ValueError
        If the sampled parameters do not have ``n_users + n_movies`` feature rows.
    """
    w0_samples = np.asarray(fm_rb.w0_samples)
    w_samples = np.asarray(fm_rb.w_samples)
    V_samples = np.asarray(fm_rb.V_samples)
    number_of_samples = w0_samples.shape[0]
    n_features = n_users + n_movies

    # Validate before allocating the (potentially huge) output array.
    if number_of_samples and (
        w_samples.shape[1:2] != (n_features,) or V_samples.shape[1:2] != (n_features,)
    ):
        raise ValueError(
            "expected {} feature rows (n_users + n_movies), got w_samples {} "
            "and V_samples {}".format(n_features, w_samples.shape, V_samples.shape)
        )

    prediction_matrix_samples = np.zeros((number_of_samples, n_users, n_movies))
    for sample in range(number_of_samples):
        w = w_samples[sample]
        V = V_samples[sample]

        # Broadcasting replaces the explicit n_users x n_movies copies of the
        # global bias, the user biases (column) and the movie biases (row).
        user_bias = w[:n_users, None]
        movie_bias = w[None, n_users:]
        interaction = np.dot(V[:n_users], V[n_users:].T)

        prediction_matrix_samples[sample] = (
            w0_samples[sample] + user_bias + movie_bias + interaction
        )
    return prediction_matrix_samples

# get single prediction
def make_prediction(fm_rb, user, movie, watched_movies, interested_users):
    predictions = []
    for i in range(2): # range(fm_rb.w0_samples.shape[0]):
        w0 = fm_rb.w0_samples[i]
        user_bias = fm_rb.w_samples[i][user]
        movie_bias = fm_rb.w_samples[i][10001 + 1001 + movie]
        u = fm_rb.V_samples[i][user]
        v = fm_rb.V_samples[i][10001 + 1001 + movie]

        movie_sum = 0
        for j in watched_movies:
            v_j = fm_rb.V_samples[i][10001 + j]
            movie_sum += np.dot(v, v_j)
        movie_sum /= np.sqrt(len(watched_movies))

        user_sum = 0
        for j in interested_users:
            u_j = fm_rb.V_samples[i][10001 + 1001 + 1001 + j]
            user_sum += np.dot(u, u_j)
        user_sum /= np.sqrt(len(interested_users))

        prediction = w0 + user_bias + movie_bias + np.dot(u, v) + movie_sum + user_sum
        predictions.append(prediction)
    prediction = np.mean(predictions)
    std = np.std(predictions)
    return prediction, std

# This is more complicated now, because:
# w0_samples is of shape (200,)
# w_samples is of shape (200, 22004)
# V_samples is of shape (200, 22004, 10)
# TODO: find out what exactly the Bayesian prediction formula is

In [116]:
# Look at the first row by position. The frame is indexed by the CSV's id
# labels, so plain [0] would rely on the deprecated positional fallback.
val_data['user'].iloc[0], val_data['movie'].iloc[0], val_data['rating'].iloc[0], val_data['Prediction'].iloc[0]

(37, 1, 3, 3.279220736846226)

In [130]:
item = 0
# Non-zero column indices of this rating's user / movie feature row.
# Assumes the first non-zero column is the one-hot id itself, which [1:]
# drops - TODO confirm. Subtracting the width of the one-hot group turns the
# remaining columns into indices inside the implicit-feature group.
watched_movies = user_data_test[test_uid_index[item]].nonzero()[1][1:] - USER_ID_SIZE
interested_users = movie_data_test[test_mid_index[item]].nonzero()[1][1:] - MOVIE_ID_SIZE
# .iloc selects by position; the frame's index holds id labels, not 0..n-1.
make_prediction(fm_rb, val_data['user'].iloc[item], val_data['movie'].iloc[item], watched_movies, interested_users)

(2.1946875576300475, 0.42387251073583576)