In [1]:
import os

import numpy as np
from sklearn.preprocessing import OneHotEncoder
import myfm

# Load data

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd

def load_cil(dataset="split", data_dir="../data"):
    """Load a CIL ratings CSV as zero-based (user, movie, rating) rows.

    The CSV's first column is used as the index and must hold ids made of two
    '_'-separated parts, each a one-character prefix followed by a 1-based
    number (user first, movie second). The ``Prediction`` column holds the
    rating.

    Parameters
    ----------
    dataset : str
        ``"test"`` reads ``sampleSubmission.csv`` and returns it as the
        validation frame. ``"train"`` reads ``data_train.csv`` and returns it
        whole as the training frame. Any other value (default ``"split"``)
        reads ``data_train.csv`` and splits it 80/20 with a fixed seed.
    data_dir : str or path-like
        Directory that contains the CSV files. Defaults to ``"../data"``.

    Returns
    -------
    tuple
        ``(train_data, val_data, user_num, movie_num)``. Each frame has the
        columns ``user``, ``movie`` (both zero-based) and ``rating`` (uint8)
        and keeps the original id index; a frame that does not apply to the
        requested ``dataset`` is ``None``.
    """
    file = "sampleSubmission" if dataset == "test" else "data_train"
    raw = pd.read_csv(f'{data_dir}/{file}.csv', index_col=0)

    # Build the result frame in one step rather than mutating a column
    # subset of the raw frame (no inplace rename, no assignment to a slice).
    id_parts = raw.index.str.split('_')
    data = pd.DataFrame(
        {
            # Drop the one-character prefix, then shift ids to start at 0.
            'user': id_parts.str[0].str[1:].astype('int32').to_numpy() - 1,
            'movie': id_parts.str[1].str[1:].astype('int32').to_numpy() - 1,
            'rating': raw['Prediction'].astype('uint8').to_numpy(),
        },
        index=raw.index,
    )
    print("Subtracted {} from user and movie".format(1))

    # Fixed problem size; deliberately not derived from the loaded file so
    # that every dataset mode reports the same dimensions.
    user_num = 10000
    movie_num = 1000
    print("User num: {}, Movie num: {}".format(user_num, movie_num))

    if dataset == "test":
        return None, data, user_num, movie_num
    if dataset == "train":
        return data, None, user_num, movie_num

    train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
    return train_data, val_data, user_num, movie_num

In [3]:
# "split" mode: read the training CSV and hold part of it out for validation.
train_data, val_data, user_num, movie_num = load_cil(dataset="split")

Subtracted 1 from user and movie
User num: 10000, Movie num: 1000


# Bayesian Factorization Machine (BFM)

In [4]:
FM_RANK = 10  # passed as `rank` to the factorization machine

FEATURE_COLUMNS = ['user', 'movie']

# Pin the one-hot layout to the full id ranges instead of learning it from
# whichever ids happen to land in the training split. Column u is then always
# user u and column user_num + m is always movie m, which is what the
# parameter slicing further down relies on.
ohe = OneHotEncoder(
    categories=[np.arange(user_num), np.arange(movie_num)],
    handle_unknown='ignore',
)

X_train = ohe.fit_transform(train_data[FEATURE_COLUMNS])
X_test = ohe.transform(val_data[FEATURE_COLUMNS])
y_train = train_data.rating.values
y_test = val_data.rating.values

In [11]:
# Fit the factorization machine on the one-hot features. `group_shapes`
# carries the width of each one-hot block (users, then movies).
# NOTE(review): presumably this lets myfm treat users and movies as separate
# feature groups - confirm against the myfm documentation.
fm_grouped = myfm.MyFMRegressor(rank=FM_RANK, random_seed=42)
fm_grouped.fit(
    X_train,
    y_train,
    n_iter=200,
    n_kept_samples=200,
    group_shapes=list(map(len, ohe.categories_)),
)

alpha = 1.14 w0 = 3.79 : 100%|██████████| 200/200 [00:33<00:00,  5.90it/s]


<myfm.gibbs.MyFMGibbsRegressor at 0x136907610>

In [12]:
# Predict the validation rows and clamp the predictions to the interval [1, 5].
prediction_grouped = np.clip(fm_grouped.predict(X_test), 1, 5)

In [13]:
# Root-mean-square error and mean absolute error of the clipped predictions
# against the validation ratings.
rmse = np.mean(np.square(y_test - prediction_grouped)) ** 0.5
mae = np.mean(np.abs(y_test - prediction_grouped))
print(f'rmse={rmse}, mae={mae}')

rmse=0.9797415528913666, mae=0.7833792304512732


# Decompose the fitted model into biases and latent factors U and V

In [None]:
# Reconstruct the dense prediction matrix of every kept posterior sample:
#   pred[s, u, m] = w0[s] + w[s, u] + w[s, n_users + m]
#                   + dot(V[s, u], V[s, n_users + m])
# WARNING: the result holds n_samples * n_users * n_movies float64 values,
# i.e. about 16 GB for 200 samples of a 10000 x 1000 matrix.
n_users, n_movies = (len(categories) for categories in ohe.categories_)
# Row u / column m only mean "user u / movie m" if the encoder saw every id.
assert (n_users, n_movies) == (user_num, movie_num), \
    "one-hot layout does not cover every user/movie id"

# Read the sample arrays once instead of on every loop iteration.
intercept_samples = fm_grouped.w0_samples
linear_samples = fm_grouped.w_samples
factor_samples = fm_grouped.V_samples

number_of_samples = intercept_samples.shape[0]
prediction_matrix_samples = np.zeros((number_of_samples, n_users, n_movies))
for sample in range(number_of_samples):
    linear = linear_samples[sample]
    factors = factor_samples[sample]

    # (n_users, 1) + (1, n_movies) broadcasts to the full grid, so no
    # full-size ones/repeat/tile temporaries are needed. The per-sample
    # names also no longer overwrite the w0 / user_bias / movie_bias arrays
    # used by the export cells below.
    biases = intercept_samples[sample] + linear[:n_users, None] + linear[None, n_users:]
    interaction = factors[:n_users] @ factors[n_users:].T

    prediction_matrix_samples[sample] = biases + interaction

In [14]:
# Split the sampled parameters into their user and movie parts. The boundary
# is the width of the user one-hot block, taken from the encoder rather than
# hard-coded, so the slices always follow the actual column layout.
n_users = len(ohe.categories_[0])

w0 = fm_grouped.w0_samples                      # (n_samples,)
user_bias = fm_grouped.w_samples[:, :n_users]   # (n_samples, n_users)
movie_bias = fm_grouped.w_samples[:, n_users:]  # (n_samples, n_movies)
U = fm_grouped.V_samples[:, :n_users]           # (n_samples, n_users, rank)
V = fm_grouped.V_samples[:, n_users:]           # (n_samples, n_movies, rank)

In [15]:
# Show the shape of each array that is about to be exported, one per line.
print(*(array.shape for array in (w0, user_bias, movie_bias, U, V)), sep="\n")

(200,)
(200, 10000)
(200, 1000)
(200, 10000, 10)
(200, 1000, 10)


In [16]:
# save as npy
np.save("bfm_export/w0.npy", w0)
np.save("bfm_export/user_bias.npy", user_bias)
np.save("bfm_export/movie_bias.npy", movie_bias)
np.save("bfm_export/U.npy", U)
np.save("bfm_export/V.npy", V)