Skip to content

Commit

Permalink
Lighten algorithms (#464)
Browse files Browse the repository at this point in the history
* SVD should not have a redundant big matrix in its pickle, or should it?

* Make ALS SVD WALS zippable

* Fix logger
  • Loading branch information
jilljenn committed Sep 14, 2017
1 parent 734fa73 commit a3b2ec9
Show file tree
Hide file tree
Showing 7 changed files with 43 additions and 28 deletions.
14 changes: 10 additions & 4 deletions mangaki/mangaki/algo/als.py
Expand Up @@ -78,14 +78,20 @@ def fit(self, X, y):
self.factorize(matrix, random_state=42)
if self.verbose_level:
print('Shapes', self.U.shape, self.VT.shape)
self.M = self.U.dot(self.VT)

#self.save('backup.pickle')

self.chrono.save('factor matrix')

def unzip(self):
    """Materialise the dense matrix ``self.M`` from the stored factors.

    ``self.M`` is set to the product of ``self.U`` and ``self.VT``.
    A checkpoint is recorded through ``self.chrono.save`` before and
    after the product is computed.
    """
    timer = self.chrono
    timer.save('begin of fit')
    dense = self.U.dot(self.VT)
    self.M = dense
    timer.save('end of fit')

def predict(self, X):
    """Predict a value for every (row, column) pair listed in X.

    X is a 2-D array: ``X[:, 0]`` indexes the rows of the factor product
    and ``self.means`` (presumably user ids -- confirm against the
    caller), ``X[:, 1]`` indexes the columns.  Both columns are cast to
    int64 before being used as indices.

    When ``unzip()`` has filled ``self.M`` the cached dense matrix is
    used; otherwise the product ``U . VT`` is rebuilt on the fly, so a
    freshly fitted or freshly loaded (zipped) model can predict too.
    """
    # Cast once and reuse for both the matrix lookup and the means lookup.
    row_ids = X[:, 0].astype(np.int64)
    col_ids = X[:, 1].astype(np.int64)
    if self.M is not None:  # Model is unzipped
        M = self.M
    else:
        M = self.U.dot(self.VT)
    return M[row_ids, col_ids] + self.means[row_ids]

def get_shortname(self):
    """Return the short model identifier: ``als-`` followed by nb_components."""
    components = self.nb_components
    return 'als-%d' % components
1 change: 0 additions & 1 deletion mangaki/mangaki/algo/fit_algo.py
Expand Up @@ -2,7 +2,6 @@

from mangaki.algo.recommendation_algorithm import RecommendationAlgorithm


def fit_algo(algo_name, triplets, titles=None, categories=None, output_csv=False):
algo = RecommendationAlgorithm.instantiate_algorithm(algo_name)
dataset = Dataset()
Expand Down
2 changes: 1 addition & 1 deletion mangaki/mangaki/algo/knn.py
Expand Up @@ -32,7 +32,7 @@ def __init__(self, nb_neighbors=20, rated_by_neighbors_at_least=3, missing_is_me

def load(self, filename):
backup = super().load(filename)
self.nb_neighbors = backup.NB_NEIGHBORS
self.nb_neighbors = backup.nb_neighbors
self.closest_neighbors = backup.closest_neighbors
self.rated_works = backup.rated_works
self.mean_score = backup.mean_score
Expand Down
2 changes: 2 additions & 0 deletions mangaki/mangaki/algo/recommendation_algorithm.py
Expand Up @@ -12,6 +12,7 @@ def __init__(self):
self.algorithm_factory = {}
self.logger = logging.getLogger(__name__ + '.' + self.__class__.__name__)
self.initialized = False
self.size = 0

def initialize(self):
# FIXME: make it less complicated and go for a commonly used design pattern.
Expand Down Expand Up @@ -54,6 +55,7 @@ def is_serializable(self):
def save(self, filename):
    """Pickle this object to its backup file and record the file size.

    ``filename`` is resolved through ``self.get_backup_path``.  After the
    dump, ``self.size`` holds the size of the written file in units of
    1e6 bytes.  The pickle is written before ``self.size`` is updated, so
    the stored object carries the previous value of ``size``.
    """
    # Resolve the path once so the file written and the file measured
    # are guaranteed to be the same.
    backup_path = self.get_backup_path(filename)
    with open(backup_path, 'wb') as f:
        pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
    # Measure only after the file has been closed (and therefore flushed).
    self.size = os.path.getsize(backup_path) / 1e6

def load(self, filename):
"""
Expand Down
18 changes: 10 additions & 8 deletions mangaki/mangaki/algo/svd.py
Expand Up @@ -10,9 +10,6 @@ class MangakiSVD(RecommendationAlgorithm):
U = None
sigma = None
VT = None
inv_work = None
inv_user = None
work_titles = None
def __init__(self, nb_components=20, nb_iterations=10):
super().__init__()
self.nb_components = nb_components
Expand All @@ -24,9 +21,6 @@ def load(self, filename):
self.U = backup.U
self.sigma = backup.sigma
self.VT = backup.VT
self.inv_work = backup.inv_work
self.inv_user = backup.inv_user
self.work_titles = backup.work_titles
self.means = backup.means

@property
Expand Down Expand Up @@ -55,12 +49,20 @@ def fit(self, X, y):
self.U, self.sigma, self.VT = randomized_svd(matrix, self.nb_components, n_iter=self.nb_iterations, random_state=42)
if self.verbose_level:
print('Shapes', self.U.shape, self.sigma.shape, self.VT.shape)
self.M = self.U.dot(np.diag(self.sigma)).dot(self.VT)

self.chrono.save('factor matrix')

def unzip(self):
    """Materialise the dense matrix ``self.M`` from the SVD factors.

    ``self.M`` is set to ``U . diag(sigma) . VT``.  A checkpoint is
    recorded through ``self.chrono.save`` before and after the product
    is computed.
    """
    timer = self.chrono
    timer.save('begin of fit')
    scaled_left = self.U.dot(np.diag(self.sigma))
    self.M = scaled_left.dot(self.VT)
    timer.save('end of fit')

def predict(self, X):
    """Predict a value for every (row, column) pair listed in X.

    X is a 2-D array: ``X[:, 0]`` indexes the rows of the reconstructed
    matrix and ``self.means`` (presumably user ids -- confirm against the
    caller), ``X[:, 1]`` indexes the columns.  Both columns are cast to
    int64 before being used as indices.

    When ``unzip()`` has filled ``self.M`` the cached dense matrix is
    used; otherwise ``U . diag(sigma) . VT`` is rebuilt on the fly, so a
    freshly fitted or freshly loaded (zipped) model can predict too.
    """
    # Cast once and reuse for both the matrix lookup and the means lookup.
    row_ids = X[:, 0].astype(np.int64)
    col_ids = X[:, 1].astype(np.int64)
    if self.M is not None:  # Model is unzipped
        M = self.M
    else:
        M = self.U.dot(np.diag(self.sigma)).dot(self.VT)
    return M[row_ids, col_ids] + self.means[row_ids]

def get_shortname(self):
    """Return the short model identifier: ``svd-`` followed by nb_components."""
    components = self.nb_components
    return 'svd-%d' % components
30 changes: 18 additions & 12 deletions mangaki/mangaki/algo/wals.py
Expand Up @@ -25,27 +25,24 @@ def simple_train(model, inp, num_iterations):
class MangakiWALS(RecommendationAlgorithm):
M = None
U = None
VT = None
V = None

def __init__(self, nb_components=20):
    """An implementation of the Weighted Alternating Least Squares.

    nb_components: the number of components in the factorization.

    No TensorFlow session is created or stored here: the instance is
    pickled as a whole when it is saved, so it must only hold picklable
    state.  TensorFlow is imported, and a session opened, inside
    factorize() where it is actually needed.
    """
    super().__init__()
    self.nb_components = nb_components

def load(self, filename):
    """Restore the model state from the backup stored under ``filename``.

    Copies the dense matrix ``M`` (None when the model was saved
    zipped), the factors ``U`` and ``V``, and ``means`` from the object
    returned by the parent class's ``load``.
    """
    backup = super().load(filename)
    self.M = backup.M
    self.U = backup.U
    self.V = backup.V
    self.means = backup.means

@property
def is_serializable(self):
    """Whether this model can be pickled; True since it holds no session."""
    return True

def make_matrix(self, X, y):
matrix = defaultdict(dict)
Expand All @@ -68,6 +65,7 @@ def make_matrix(self, X, y):
return indices, values, means

def factorize(self, indices, values):
import tensorflow as tf
from tensorflow.contrib.factorization.python.ops import factorization_ops
from tensorflow.python.framework import sparse_tensor

Expand All @@ -87,26 +85,34 @@ def factorize(self, indices, values):
row_weights=None, # row_wts,
col_weights=None, # col_wts,
use_factors_weights_cache=use_factors_weights_cache)
tf.InteractiveSession()
simple_train(model, inp, 25)
row_factor = model.row_factors[0].eval()
print('Shape', row_factor.shape)
self.U = row_factor
col_factor = model.col_factors[0].eval()
print('Shape', col_factor.shape)
out = np.dot(row_factor, np.transpose(col_factor))
return out
self.V = col_factor

def fit(self, X, y):
    """Build the rating data from (X, y) and factorize it.

    ``self.means`` is set from ``make_matrix``; the factorization itself
    is delegated to ``self.factorize``, which is run exactly once.
    Checkpoints are recorded through ``self.chrono.save`` after each
    stage.
    """
    print("Computing M: (%i × %i)" % (self.nb_users, self.nb_works))
    indices, values, self.means = self.make_matrix(X, y)

    self.chrono.save('fill and center matrix')

    # Train once only.  Whatever factorize() returns becomes self.M: when
    # it returns nothing, this clears any dense matrix left over from an
    # earlier unzip(), so predict() rebuilds it from the new factors
    # instead of serving stale values.
    self.M = self.factorize(indices, values)

    self.chrono.save('factor matrix')

def unzip(self):
    """Materialise the dense matrix ``self.M`` from the stored factors.

    ``self.M`` is set to the product of ``self.U`` and the transpose of
    ``self.V``.  A checkpoint is recorded through ``self.chrono.save``
    before and after the product is computed.
    """
    timer = self.chrono
    timer.save('begin of fit')
    col_factors_t = self.V.T
    self.M = self.U.dot(col_factors_t)
    timer.save('end of fit')

def predict(self, X):
    """Predict a value for every (row, column) pair listed in X.

    X is a 2-D array: ``X[:, 0]`` indexes the rows of the factor product
    and ``self.means`` (presumably user ids -- confirm against the
    caller), ``X[:, 1]`` indexes the columns.  Both columns are cast to
    int64 before being used as indices.

    When ``unzip()`` has filled ``self.M`` the cached dense matrix is
    used; otherwise the product ``U . V.T`` is rebuilt on the fly, so a
    freshly fitted or freshly loaded (zipped) model can predict too.
    """
    # Cast once and reuse for both the matrix lookup and the means lookup.
    row_ids = X[:, 0].astype(np.int64)
    col_ids = X[:, 1].astype(np.int64)
    if self.M is not None:  # Model is unzipped
        M = self.M
    else:
        M = self.U.dot(self.V.T)
    return M[row_ids, col_ids] + self.means[row_ids]

def get_shortname(self):
    """Return the short model identifier, the constant string ``wals``."""
    shortname = 'wals'
    return shortname
4 changes: 2 additions & 2 deletions mangaki/mangaki/management/commands/fit_algo.py
Expand Up @@ -23,5 +23,5 @@ def handle(self, *args, **options):
titles = {work_id: title for work_id, title, _ in meta_triplets}
categories = {work_id: cat_id for work_id, _, cat_id in meta_triplets}

fit_algo(algo_name, triplets, titles=titles, categories=categories, output_csv=output_csv)
self.stdout.write(self.style.SUCCESS('Successfully fit %s' % algo_name))
_, algo = fit_algo(algo_name, triplets, titles=titles, categories=categories, output_csv=output_csv)
self.stdout.write(self.style.SUCCESS('Successfully fit %s (%.1f MB)' % (algo_name, algo.size)))

0 comments on commit a3b2ec9

Please sign in to comment.