Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for saving and loading models #377

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
60 changes: 60 additions & 0 deletions lightfm/lightfm.py
Expand Up @@ -20,6 +20,21 @@

CYTHON_DTYPE = np.float32

# Names of the per-model numpy arrays (weights plus their optimizer state)
# that are persisted by LightFM.save() and restored by LightFM.load_uncached().
model_weights = {
    "{}_{}".format(entity, suffix)
    for entity in ("user", "item")
    for suffix in (
        "embeddings",
        "biases",
        "embedding_momentum",
        "embedding_gradients",
        "bias_momentum",
        "bias_gradients",
    )
}


class LightFM(object):
"""
Expand Down Expand Up @@ -474,6 +489,51 @@ def verbose_range():

return verbose_range()

def save(self, path):
    """
    Persist the model to disk as a compressed numpy archive, keeping all
    model weights and hyperparameters needed for re-initialization.

    Any item/user id mappings kept in external dataloaders are NOT
    included and must be stored separately for a full restoration of the
    model. The file is written with ``np.savez_compressed``, which
    appends a .npz extension to the path-parameter.

    Parameters
    ----------

    path: string
        string-path of location to save the model.
    """
    # Collect the weight arrays first, then overlay the hyperparameters,
    # and write everything as a single archive.
    state = {name: getattr(self, name) for name in model_weights}
    state.update(self.get_params())
    np.savez_compressed(path, **state)

@staticmethod
def load_uncached(path):
    """
    Load a model saved in the format output by LightFM.save().

    Example usage:
        model = LightFM.load_uncached(path_to_saved_model)

    Parameters
    ----------

    path: string
        string-path of location to load_uncached the model from.
    """
    new_model = LightFM()

    # allow_pickle=True is required on NumPy >= 1.16.3 to restore any
    # object-typed hyperparameters (e.g. a persisted random_state) --
    # np.load would otherwise raise ValueError on them.
    # NOTE(review): unpickling can execute arbitrary code; only load
    # model files from trusted sources.
    numpy_model = np.load(path, allow_pickle=True)

    # Weight arrays are set directly on the instance; everything else is
    # treated as a hyperparameter and routed through set_params().
    hyperparams = {}
    for name in numpy_model:
        if name in model_weights:
            setattr(new_model, name, numpy_model[name])
        else:
            hyperparams[name] = numpy_model[name]

    new_model.set_params(**hyperparams)

    return new_model

def fit(
self,
interactions,
Expand Down
81 changes: 81 additions & 0 deletions tests/test_persist.py
@@ -0,0 +1,81 @@
import pytest

import numpy as np
import os

from sklearn.metrics import roc_auc_score

from lightfm.lightfm import LightFM
from lightfm.datasets import fetch_movielens


def _binarize(dataset):

positives = dataset.data >= 4.0
dataset.data[positives] = 1.0
dataset.data[np.logical_not(positives)] = -1.0

return dataset


def _cleanup():
    # Remove the model file written by a test. Tolerate the file being
    # absent (e.g. when the test failed before the save step ran), so
    # calling this from a finally-block can never mask the real failure.
    try:
        os.remove(TEST_FILE_PATH)
    except OSError:
        pass


# Path the persistence tests write their temporary model archive to.
TEST_FILE_PATH = "./tests/test.npz"
# NOTE(review): fetch_movielens() runs at import time, so merely
# collecting these tests triggers the dataset download (network access
# and disk cache required) -- consider moving this into a pytest fixture.
movielens = fetch_movielens()
train, test = _binarize(movielens["train"]), _binarize(movielens["test"])


def test_all_params_persisted():
    # Persist a fitted model and check that every public, non-callable
    # attribute of the model made it into the saved archive.
    model = LightFM(loss="warp")
    model.fit(movielens["train"], epochs=1, num_threads=4)
    model.save(TEST_FILE_PATH)

    try:
        # Load and confirm all model params are present.
        saved_model_params = list(np.load(TEST_FILE_PATH).keys())
        for x in dir(model):
            ob = getattr(model, x)
            # We don't need to persist model functions, or magic variables of the model.
            if not callable(ob) and not x.startswith("__"):
                assert x in saved_model_params
    finally:
        # try/finally so the saved file is removed even when an
        # assertion above fails (previously it leaked on failure).
        _cleanup()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When the test fails, this is never executed, and is never cleaned up. Could you use pytest fixtures for setup/teardown?

Copy link
Author

@NegatioN NegatioN Dec 15, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be done now. Needs another cache-clean. Also, if you do end up merging this, just squash everything. I don't think it makes sense to keep any of the history except the initial commit.



def test_model_populated():
    # Round-trip a fitted model through disk and check the restored
    # instance actually carries non-zero weights.
    model = LightFM(loss="warp")
    model.fit(movielens["train"], epochs=1, num_threads=4)
    model.save(TEST_FILE_PATH)

    try:
        # Load the saved weights onto a freshly constructed model.
        loaded_model = LightFM.load_uncached(TEST_FILE_PATH)

        assert loaded_model.item_embeddings.any()
        assert loaded_model.user_embeddings.any()
    finally:
        # try/finally so the saved file is removed even when an
        # assertion above fails (previously it leaked on failure).
        _cleanup()


def test_model_performance():
    # Train and persist a model.
    model = LightFM(random_state=10)
    model.fit_partial(train, epochs=10, num_threads=4)
    model.save(TEST_FILE_PATH)

    try:
        train_predictions = model.predict(train.row, train.col)
        test_predictions = model.predict(test.row, test.col)

        trn_pred = roc_auc_score(train.data, train_predictions)
        tst_pred = roc_auc_score(test.data, test_predictions)

        # Performance is same as before when loaded from disk.
        loaded_model = LightFM.load_uncached(TEST_FILE_PATH)

        train_predictions = loaded_model.predict(train.row, train.col)
        test_predictions = loaded_model.predict(test.row, test.col)

        assert roc_auc_score(train.data, train_predictions) == trn_pred
        assert roc_auc_score(test.data, test_predictions) == tst_pred
    finally:
        # try/finally so the saved file is removed even when an
        # assertion above fails (previously it leaked on failure).
        _cleanup()