In [11]:
import math
import os
import time
from functools import wraps
from time import gmtime, strftime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

## Constants

In [17]:
# Detects whether the notebook runs from its location inside the GitLab repo or from a
# standalone development copy; the result selects the data path, timestamped exports and verbose output.
def is_development_env():
    """Return False only when the working directory is ``.../playground/michal_salasek``.

    Any other working directory is treated as a development environment.
    """
    path_parts = os.getcwd().split(os.sep)
    # De Morgan form of "not (last == ... and parent == ...)"
    return path_parts[-1] != 'michal_salasek' or path_parts[-2] != 'playground'

# True unless the notebook is executed from the `playground/michal_salasek` directory
DEVELOPMENT = is_development_env()

# location of the input ratings CSV; the relative root differs between the two environments
data_relative_path = './data/' if DEVELOPMENT else '../../data/'
ratings_path = os.path.abspath(data_relative_path + 'user_ratings.csv')

# NOTE(review): not referenced anywhere else in the visible notebook; presumably the intended
# export location of the factor matrices, confirm before relying on it
matrices_export_path = os.path.abspath('../../app/db')
# default for `save_dfs`: append a timestamp to exported file names in development runs
ADD_TIMESTAMP = DEVELOPMENT

# column names for convenience
ITEM = 'BGGId'
GAME = ITEM
USER = 'Username'
RATING = 'Rating'

SHOW = DEVELOPMENT  # print info, charts etc.

# default minimum number of ratings a user / game needs to survive filtering
CUTOFF_THRESHOLD = 10
# number of columns (latent factors) in the user and item factor matrices
k_latent_factors = 3
# upper bound on training epochs
n_epochs = 30
# step size applied to every minibatch update
learning_rate = 0.005
# regularization coefficient applied to the factors in every update
_lambda = 0.02
n_epochs_per_RMSE = 1  # RMSE is evaluated on epochs whose 0-based index is a multiple of this value, and on the last epoch
# number of rating rows per minibatch
chunk_size = 32 * 2
epoch_improvement_threshold = 0.005 # early stopping triggers when validation RMSE improves by less than this value between two consecutive evaluations
EARLY_STOP = True # apply early stopping

## Data processing

In [13]:
# for each Username and board game, keep only the last rating (if a user has rated a game multiple times)

def keep_last_rating(df):
    """Return `df` with one row per (Username, BGGId) pair, keeping the last occurrence.

    The input frame is not modified.
    """
    return df.drop_duplicates(subset=['Username', 'BGGId'], keep='last')

In [14]:
# keep only the rows whose value in `col` (a user or a game) occurs at least `threshold` times,
# i.e. drop users / games with fewer than `threshold` ratings
def drop_less_than_n(df, col, threshold):
    """Return the rows of `df` whose `col` value appears at least `threshold` times in `df`."""
    occurrences = df[col].value_counts()
    # map every row to the number of rows sharing its `col` value
    rows_to_keep = df[col].map(occurrences) >= threshold
    return df[rows_to_keep]

In [15]:
def get_processed_dataframe(threshold=CUTOFF_THRESHOLD):
    """Load the ratings CSV and return the filtered ratings frame.

    Steps: read `ratings_path`, keep the last rating per (Username, BGGId), drop users with
    fewer than `threshold` ratings, then drop games with fewer than `threshold` ratings, and
    finally store `Rating` as float32.

    Args:
        threshold: minimum number of ratings required for both users and games.

    Returns:
        A new DataFrame with the filtered rows.
    """
    users_threshold = threshold
    games_threshold = threshold
    ratings = pd.read_csv(ratings_path)
    ratings = keep_last_rating(ratings)
    ratings = drop_less_than_n(ratings, 'Username', users_threshold)
    # NOTE(review): games are filtered after users, so removing games can push some users
    # back below the threshold; the filters are applied once, not until a fixed point
    ratings = drop_less_than_n(ratings, 'BGGId', games_threshold)
    # `assign` builds a new frame; writing the column in place on a boolean-filtered slice
    # is the pattern that raises pandas' SettingWithCopyWarning
    return ratings.assign(Rating=ratings['Rating'].astype('float32'))

In [16]:
# load and filter the ratings once; the following cells derive the id lists and the data split from this frame
df = get_processed_dataframe()

In [18]:
if SHOW:
    # DataFrame.info() writes the summary to stdout itself and returns None,
    # so it is called directly; print(df.info()) would add a stray "None" line
    df.info()

In [19]:
# distinct game / user ids in order of first appearance; passed to `fit` later,
# where they become the row labels of the factor matrices
unique_games = df['BGGId'].unique()
unique_users = df['Username'].unique()

if SHOW:
    # size of the full user x game rating matrix
    rows = len(unique_users)
    cols = len(unique_games)
    print(f'rows (=users): {rows}')
    print(f'cols (=games): {cols}')
    print(f'number of cells = {rows} x {cols} = {rows*cols}')

In [20]:
def get_occurrence(df, col):
    """Count the rows per distinct value of `col`, largest count first.

    Returns a Series indexed by the `col` values.
    """
    per_value = df.groupby([col])[col]
    counts = per_value.count()
    return counts.sort_values(ascending=False)

def get_user_occurrence(df):
    """Number of rating rows per user (USER column), most active users first."""
    return get_occurrence(df, col=USER)

def get_game_occurrence(df):
    """Number of rating rows per game (ITEM column), most rated games first."""
    return get_occurrence(df, col=ITEM)

In [21]:
def plot_barchart(series, title, y_label='games', head=True):
    """Draw one bar per entry of `series` on a new figure and print some of its labels.

    Args:
        series: Series of counts; the index holds the labels (usernames / BGGIds).
        title: figure title; the number of entries is appended to it.
        y_label: entity name interpolated into the y-axis label.
        head: print the first 10 index labels when True, the last 10 otherwise.
    """
    # print 10 labels (usernames / BGGIds): the first ten, or the last ten when head is False
    labels = series.head(10) if head else series.tail(10)
    print(labels.index.tolist())

    n_items = len(series)

    x = np.arange(n_items)
    y = series.values

    # draw on the explicitly created axes instead of the implicit pyplot "current figure"
    _, ax = plt.subplots()
    ax.set_title(title + f'_{n_items}')
    # NOTE(review): x is the position of the entry within `series` and y is its count, so
    # these two labels do not appear to describe the plotted data; confirm the intended wording
    ax.set_xlabel('# of ratings')
    ax.set_ylabel(f'# of {y_label} with given rating')
    ax.bar(x, y)


In [22]:
# > split all data into training+validation and test set (80/20)
# > split the training+validation set into training and validation set (80/20 or 75/25)

# we use 'BGGId' column as 'stratify' for train_test_split to get a rating for every game
# that way we get proportionally many ratings for game 'X' in train / validation / test set
# the same can be done for 'Username'

def train_val_test_split(df, stratify_col=None, train_val_size=0.8, train_size=0.8, random_state=None):
    """Split `df` into training, validation and test frames.

    `df` is first split into (train+validation, test); the train+validation part is then
    split into (train, validation).

    Args:
        df: frame to split.
        stratify_col: optional column whose values are kept proportional across the splits.
        train_val_size: fraction of `df` assigned to train+validation (the rest is the test set).
        train_size: fraction of train+validation assigned to training (the rest is validation).
        random_state: forwarded to both `train_test_split` calls; pass an int for a
            reproducible split. The default None keeps the split random on every call.

    Returns:
        The list [X_train, X_val, X_test].
    """
    def _split(frame, fraction):
        # one stratified split of `frame` into (fraction, 1 - fraction)
        stratify = frame[[stratify_col]] if stratify_col else None
        return train_test_split(frame, train_size=fraction, test_size=1 - fraction,
                                stratify=stratify, random_state=random_state)

    X_train_val, X_test = _split(df, train_val_size)
    X_train, X_val = _split(X_train_val, train_size)

    return [X_train, X_val, X_test]

# stratify on the game id so that every game's ratings are spread proportionally over the three sets
X_train, X_val, X_test = train_val_test_split(df, GAME)

In [23]:
# just a quick verification that data is split properly (as described above)
def plot_charts_for_data_split(X_train, X_val, X_test, occurrence=GAME):
    """Plot, for each split, the 50 most and the 50 least frequent games (or users).

    Args:
        X_train, X_val, X_test: the three rating frames to compare.
        occurrence: GAME to count ratings per game; any other value counts per user.
    """
    count_rows = get_game_occurrence if occurrence == GAME else get_user_occurrence

    # compute all three count series before drawing anything
    split_counts = [
        (split_name, count_rows(split))
        for split_name, split in (('X_train', X_train), ('X_val', X_val), ('X_test', X_test))
    ]

    # all "top" charts first, then all "bottom" charts
    for split_name, counts in split_counts:
        plot_barchart(counts.head(50), f"{split_name}_{occurrence}_top")

    for split_name, counts in split_counts:
        plot_barchart(counts.tail(50), f"{split_name}_{occurrence}_bottom", head=False)

# visual check of the split, only in verbose mode; uses the default per-game occurrence
if SHOW:
    plot_charts_for_data_split(X_train, X_val, X_test)


In [24]:
def get_unique_count(df, col):
    """Return how many distinct values column `col` of `df` holds."""
    distinct_values = df[col].unique()
    return len(distinct_values)

def print_unique_count(X_train, X_val, X_test, col):
    """Print the number of distinct `col` values found in each of the three splits."""
    named_splits = (('X_train', X_train), ('X_val', X_val), ('X_test', X_test))
    # count everything first, then print, so nothing is printed if a count fails
    unique_counts = [(split_name, get_unique_count(split, col)) for split_name, split in named_splits]

    for split_name, n_unique in unique_counts:
        print(f'{split_name}_unique_{col}: {n_unique}')

# compare the number of distinct games / users in the full data with each split,
# which shows whether any game or user is missing from a split
if SHOW:
    print(f'[ total unique games: {get_unique_count(df, GAME)} ]')
    print_unique_count(X_train, X_val, X_test, GAME)
    print()
    print(f'[ total unique users: {get_unique_count(df, USER)} ]')
    print_unique_count(X_train, X_val, X_test, USER)
    
    

In [25]:
if SHOW:
    # DataFrame.info() prints its summary itself and returns None, so it is called
    # directly; wrapping it in print() would add a stray "None" after every summary
    print("\n" + 20 * "=" + "[ X_train ]" + 20 * "=")
    X_train.info()
    print("\n" + 20 * "=" + "[ X_val ]" + 20 * "=")
    X_val.info()
    print("\n" + 20 * "=" + "[ X_test ]" + 20 * "=")
    X_test.info()

In [26]:
def add_timestamp(name):
    """Return `name` followed by an underscore and the current UTC time (YYYY-MM-DD_HH-MM-SS)."""
    utc_stamp = strftime("%Y-%m-%d_%H-%M-%S", gmtime())
    return name + "_" + utc_stamp


def timer_function(func):
    """Decorator that prints how long each call of `func` took.

    The wrapper forwards all arguments and the return value unchanged. `functools.wraps`
    keeps the wrapped function's name and docstring on the decorated function.
    """
    @wraps(func)
    def wrap_func(*args, **kwargs):
        # perf_counter is a monotonic high-resolution clock intended for measuring intervals
        t1 = time.perf_counter()
        result = func(*args, **kwargs)
        t2 = time.perf_counter()
        print(f'Function {func.__name__!r} executed in {(t2-t1):.4f}s')
        return result
    return wrap_func

In [27]:
def save_df(df, name, method='pickle', timestamp=True):
    """Persist `df` in the current working directory.

    Args:
        df: DataFrame to store.
        name: key inside 'store.h5' (method 'h5') or file stem of '<name>.pkl' (method 'pickle').
        method: 'h5' or 'pickle'.
        timestamp: append the current UTC timestamp to `name` first.

    Raises:
        ValueError: if `method` is neither 'h5' nor 'pickle' (nothing is written).
    """
    # fail loudly instead of returning without having saved anything
    if method not in ('h5', 'pickle'):
        raise ValueError(f"unsupported method {method!r}; expected 'h5' or 'pickle'")

    if timestamp:
        name = add_timestamp(name)

    if method == 'h5':
        # the context manager closes the store even when the write fails
        with pd.HDFStore('store.h5') as store:
            store[name] = df
    else:
        df.to_pickle(f'{name}.pkl')
    
    
def load_df(name, method='pickle'):
    """Load a DataFrame previously written by `save_df` from the current working directory.

    Args:
        name: key inside 'store.h5' (method 'h5') or file stem of '<name>.pkl' (method 'pickle').
        method: 'h5' or 'pickle'.

    Returns:
        The stored DataFrame.

    Raises:
        ValueError: if `method` is neither 'h5' nor 'pickle'.
    """
    if method == 'h5':
        # read-only mode does not create an empty 'store.h5' when the file is missing,
        # and the context manager closes the store after the read
        with pd.HDFStore('store.h5', mode='r') as store:
            return store[name]

    if method == 'pickle':
        # NOTE: unpickling can execute arbitrary code; only load files this notebook produced
        return pd.read_pickle(f'{name}.pkl')

    raise ValueError(f"unsupported method {method!r}; expected 'h5' or 'pickle'")
    
# save DFs into .pkl files so that they can be loaded for the endpoint
def save_dfs(P, Q, add_timestamp=ADD_TIMESTAMP):
    """Pickle the user factors `P` and the item factors `Q`.

    Args:
        P: frame saved under the name 'user_factors'.
        Q: frame saved under the name 'item_factors'.
        add_timestamp: whether a timestamp is appended to both file names.
    """
    for factors, file_stem in ((P, 'user_factors'), (Q, 'item_factors')):
        save_df(factors, file_stem, 'pickle', add_timestamp)

## Training

In [28]:
# if we wanna consider global effects, add variables for user/item bias and global mean later:
# return mean + user_bias + item_bias + np.dot(user_factors, item_factors.T)
def rating_prediction(user_factors, item_factors):
    """Return the dot product of `user_factors` with the transpose of `item_factors`."""
    transposed_items = item_factors.T
    return np.dot(user_factors, transposed_items)

# currently unused: it predicts from one user-factor / item-factor vector pair, whereas RMSE and fit compute row-wise dot products over whole batches with einsum

In [29]:
def RMSE(P, Q, ratings):
    """Root mean squared error between the stored ratings and the factor-based predictions.

    Args:
        P: user factor frame, indexed by username.
        Q: item factor frame, indexed by BGGId.
        ratings: frame with 'Username', 'BGGId' and 'Rating' columns.

    Returns:
        The square root of the mean squared difference between predicted and actual ratings.
    """
    # one factor row per rating row, looked up by label
    user_rows = P.loc[ratings['Username']].values
    item_rows = Q.loc[ratings['BGGId']].values

    # row-wise dot product: prediction i = user_rows[i] . item_rows[i]
    predicted = np.einsum('ij, ij->i', user_rows, item_rows)
    actual = ratings['Rating'].values

    return sqrt(mean_squared_error(predicted, actual))

In [30]:
# the rule for early stopping probably could be adjusted based on number of epochs, e.g.:
# a) if there are 100 epochs, it might make sense to only evaluate after e.g. every 5 epochs,
#    and if the validation error has increased, stop training
# b) if there are only < 10 epochs, we might wanna evaluate validation error after every epoch,
#    and stop training after validation error has increased once/twice/... in a row
def should_early_stop(validation_errors):
    """Decide from the two most recent validation errors whether training should stop.

    Returns False while fewer than two errors are recorded. Otherwise the result is truthy
    when the latest error is higher than the previous one, or when it improved by less
    than `epoch_improvement_threshold`.
    """
    if len(validation_errors) < 2:
        return False

    previous, latest = validation_errors[-2], validation_errors[-1]
    got_worse = latest > previous
    gain = previous - latest
    return got_worse or gain < epoch_improvement_threshold


def plot_errors(train_error, val_error, n_epochs):
    """Plot the recorded training and validation RMSE curves on a new figure.

    Nothing is drawn when fewer than two values were recorded.

    Args:
        train_error: list of training RMSE values, one per evaluation.
        val_error: list of validation RMSE values, same length as `train_error`.
        n_epochs: number of epochs that were run; shown in the figure title.
    """
    if len(train_error) < 2:
        return

    # x is the index of the RMSE evaluation; it equals the epoch index only when
    # RMSE is evaluated after every epoch
    x = list(range(len(train_error)))

    # plot lines on explicit axes so the figure carries its own labels and title
    _, ax = plt.subplots()
    ax.plot(x, train_error, label = "train_error")
    ax.plot(x, val_error, label = "val_error")
    ax.set_xlabel('RMSE evaluation #')
    ax.set_ylabel('RMSE')
    ax.set_title(f'RMSE after {n_epochs} epochs')
    ax.legend()
    plt.show()

In [31]:
def print_training_info():
    """Print the training hyperparameters (read from the module-level constants) as a banner block."""
    banner = 20*"=" + '[ training summary ]' + 20*"="
    summary_lines = [
        f'k latent factors: {k_latent_factors}',
        f'n epochs: {n_epochs}',
        f'chunk size: {chunk_size}',
        f'learning rate: {learning_rate}',
        f'regularization coefficient: {_lambda}',
        f'RMSE frequence: once every {n_epochs_per_RMSE} epochs ... how often is RMSE calculated',
    ]

    # blank line, banner, blank line, one line per setting, blank line
    print()
    print(banner)
    print()
    for line in summary_lines:
        print(line)
    print()

In [32]:
def split_dataframe(df, chunk_size):
    """Cut `df` into consecutive row slices of at most `chunk_size` rows.

    Returns a list of frames in the original row order; the last one may be shorter.
    An empty `df` yields an empty list.
    """
    num_chunks = math.ceil(len(df) / chunk_size)
    first_rows = (chunk_index * chunk_size for chunk_index in range(num_chunks))
    return [df.iloc[first_row:first_row + chunk_size] for first_row in first_rows]

In [33]:
def get_initialized_matrix(unique_elements, col_prefix, k_latent_factors, scale=3, rng=None):
    """Build a randomly initialised factor matrix as a DataFrame.

    Args:
        unique_elements: row labels (user or item ids); one row is created per element.
        col_prefix: column names are '<col_prefix>_1' ... '<col_prefix>_k'.
        k_latent_factors: number of columns.
        scale: the values are drawn uniformly from [0, scale).
        rng: optional numpy random generator used for the draw, which makes the
            initialisation reproducible; by default numpy's global random state is used.

    Returns:
        DataFrame of shape (len(unique_elements), k_latent_factors).
    """
    shape = (len(unique_elements), k_latent_factors)
    uniform = np.random.rand(*shape) if rng is None else rng.random(shape)
    matrix_data = scale * uniform

    columns = [f'{col_prefix}_{i+1}' for i in range(k_latent_factors)]
    return pd.DataFrame(data=matrix_data, index=unique_elements, columns=columns)

In [37]:
# uses minibatch
def fit(X_train, X_val, unique_users, unique_items):
    """Train the user and item factor matrices with minibatch gradient descent.

    Hyperparameters are read from the module-level constants (k_latent_factors, n_epochs,
    chunk_size, learning_rate, _lambda, n_epochs_per_RMSE, EARLY_STOP).

    Args:
        X_train: ratings used for the updates ('Username', 'BGGId', 'Rating' columns).
        X_val: ratings used for the validation RMSE and for early stopping.
        unique_users: user ids; row labels of the user factor matrix P.
        unique_items: item ids; row labels of the item factor matrix Q.

    Returns:
        The list [P, Q] of trained factor DataFrames.
    """
    # users_factors matrix of dimensions U x k
    P = get_initialized_matrix(unique_users, 'user_feature', k_latent_factors)

    # items_factors matrix of dimensions I x k
    Q = get_initialized_matrix(unique_items, 'item_feature', k_latent_factors)

    train_error = []
    val_error = []
    early_stop_flag = False
    # tracked separately from the loop variable so the summary plot also works when n_epochs == 0
    epochs_run = 0

    if SHOW:
        print_training_info()

    # X_train is not reshuffled between epochs, so the batches are identical every time: slice once
    chunks = split_dataframe(X_train, chunk_size)
    # report progress about five times per epoch; max(1, ...) avoids a modulo by zero
    # when there are fewer than five batches
    progress_step = max(1, len(chunks) // 5)

    for epoch in range(n_epochs):
        epochs_run = epoch + 1

        if SHOW:
            print()
            print(10*"=" + f'[ epoch #{epoch+1} ]' + 10*"=")
            print()

        epoch_start = time.time()

        for batch, chunk in enumerate(chunks, start=1):

            if SHOW and (batch - 1) % progress_step == 0:
                print(f'batch #{batch}')

            user_ids = chunk['Username']
            item_ids = chunk['BGGId']

            # factor rows of this batch, looked up by label (one row per rating)
            user_factors = P.loc[user_ids].values
            item_factors = Q.loc[item_ids].values

            actual_rating = chunk['Rating'].values
            # pair-wise multiplication + sum = dot product for user_factors[i], item_factors[i]
            predicted_rating = np.einsum('ij, ij->i', user_factors, item_factors)

            error = actual_rating - predicted_rating

            # error[:, None] broadcasts the per-rating error (shape (batch,)) over the
            # k factor columns (shape (batch, k)): error[i] scales every element of row i
            P_err = item_factors * error[:, None]
            Q_err = user_factors * error[:, None]

            # update steps: learning rate times (error term minus regularization term)
            P_step = learning_rate * (P_err - _lambda * user_factors)
            Q_step = learning_rate * (Q_err - _lambda * item_factors)

            # NOTE(review): when an id occurs more than once in a batch, this label-based
            # assignment presumably keeps only one of its updates instead of accumulating
            # them; verify whether that is acceptable
            P.loc[user_ids] += P_step
            Q.loc[item_ids] += Q_step

        # either every n-th epoch (starting at first) or when it's the last epoch
        should_compute_RMSE = (epoch % n_epochs_per_RMSE == 0) or (epoch == n_epochs - 1)
        if should_compute_RMSE:

            train_rmse = RMSE(P, Q, X_train)
            val_rmse = RMSE(P, Q, X_val)

            train_error.append(train_rmse)
            val_error.append(val_rmse)

            if SHOW:
                print()
                print(f'X_train RMSE: {train_rmse}')
                print(f'X_val RMSE: {val_rmse}')

            # EARLY_STOP only decides whether the errors may end training; the errors are
            # recorded either way so they can still be printed and plotted.
            # if the validation error has increased, it might be a sign of overfitting
            early_stop_flag = EARLY_STOP and should_early_stop(val_error)

        epoch_end = time.time()
        if SHOW:
            print()
            print(f'epoch took {epoch_end - epoch_start} seconds')

        if early_stop_flag:
            if SHOW:
                print()
                print(f'Early stopping after {epoch+1} epochs')
            break

    if SHOW:
        plot_errors(train_error, val_error, epochs_run)

    return [P, Q]

In [39]:
# wall-clock time of the whole training run
start = time.time()

# P: trained user factors, Q: trained item factors
P, Q = fit(X_train, X_val, unique_users, unique_games)

end = time.time()

if SHOW:
    print(f'{end - start} seconds; X_train size: {len(X_train)}, X_val size: {len(X_val)}')
    # the test set is evaluated only here, after training has finished
    print(f'X_test error: {RMSE(P, Q, X_test)}')

In [159]:
# export the trained factor matrices as pickle files, written relative to the current working directory
# NOTE(review): `matrices_export_path` is defined in the constants but not used for this export; confirm the intended target
save_dfs(P, Q)
