In [1]:
#normal tools:
from scipy import sparse
import numpy as np
import copy
import sys
sys.path.append("..")
import utils

#learning library:
import lightfm

#skopt:
from skopt.space import Real, Integer
from skopt import Optimizer
from sklearn.externals.joblib import Parallel, delayed
from skopt.utils import use_named_args
import skopt



# This notebook compares wide- to long-format input data
The interaction matrix is shape (n_users, n_items). If you have many users and few(er) items, there is a large difference in aspect ratio. Because of this, the WARP algorithm will behave differently. 

For example, with `n_users` down the vertical length, each protein gets updated `n_users` times per epoch. Conversely, each user will be updated `epoch` times in total. If you were to transpose the matrix, each protein would receive `n_items` updates per epoch - far fewer updates!

The below code just demonstrates how, even if you're looking to optimize the ranking of ligands to be recommended to proteins, it's better to run the algorithm so that it optimizes the rankings of proteins to be recommended to ligands. The reason is that there are 100k's of ligands, so the proteins get 100k's of little updates each epoch. 



# First, run HPO
This determines the 'best' hyperparameters for running LightFM with 'wide-format' data (i.e. shape (`n_items`, `n_users`)), which is wider than it is long. 

Later this can be compared to the typical 'long-format' input, parameters for which are already available in file `hpo_lightfm_warp.dat`.

In [23]:
# Load the 243-protein subset of the interaction data.
interaction_matrix = utils.load_subset()

# LightFM's predict() takes explicit index arrays, so build one index range
# per axis of the *transposed* matrix (the orientation bootstrap() trains on).
wide_shape = interaction_matrix.T.shape

# Indices handed to LightFM as 'user ids' (rows of the transposed matrix).
# NOTE(review): originally labelled 'chemical id' -- confirm which entity the
# rows of the transposed matrix actually hold.
cid = np.arange(wide_shape[0])

# Indices handed to LightFM as 'item ids' (columns of the transposed matrix).
# NOTE(review): originally labelled 'target id' -- confirm as above.
tid = np.arange(wide_shape[1])


#this performs multiple repeats of the test/train split, if desired:
def bootstrap(params, matrix, repeats):
    """Return the mean held-out rank over repeated train/test splits.

    For each repeat, `matrix` is split with `utils.train_test_split`, both
    parts are transposed, a WARP LightFM model is fitted on the transposed
    training part, and every (row, column) pair is scored. Scores are ranked
    within each row (0 = highest score) and the ranks of the held-out test
    entries are averaged.

    Parameters
    ----------
    params : dict
        Must provide 'no_components', 'max_sampled', 'learning_rate' and
        'epochs'.
    matrix : sparse matrix
        Interaction matrix to split. It is transposed before fitting.
    repeats : int
        Number of independent train/test splits to average over.

    Returns
    -------
    float
        Mean over repeats of the mean test-entry rank (lower is better).
    """
    results = list()
    for _ in range(repeats):
        # Split the matrix that was passed in (not a module-level global),
        # then flip both parts into the transposed orientation.
        train, test = utils.train_test_split(matrix, 0.05)
        train = train.T
        test = np.array(test.T.todense(), dtype=bool)

        # Fit the model.
        model = lightfm.LightFM(no_components=params['no_components'],
                                loss='warp',
                                max_sampled=params['max_sampled'],
                                learning_rate=params['learning_rate'])
        model.fit(train, epochs=params['epochs'])

        # Derive the index arrays from the training matrix itself so they
        # always match its shape, whatever other cells did to global ids.
        user_ids = np.arange(train.shape[0])
        item_ids = np.arange(train.shape[1])

        # Score every (user, item) pair and fold the flat result into a matrix.
        pred_matrix = model.predict(np.repeat(user_ids, len(item_ids)),
                                    np.tile(item_ids, len(user_ids)))
        pred_matrix = np.reshape(pred_matrix, (len(user_ids), len(item_ids)))

        # Double argsort turns scores into per-row ranks, highest score first.
        order = (-pred_matrix).argsort(axis=1)
        ranks = order.argsort(axis=1)

        # Mean rank of the held-out interactions only.
        results.append(np.mean(ranks[test]))
    return np.mean(results)




In [24]:
#### SKOPT:

# Hyperparameters to tune, with the range searched for each one.
space = [
    Integer(1, 400, name='no_components'),
    Integer(1, 15, name='max_sampled'),
    Real(10**-5, 10**0, "log-uniform", name='learning_rate'),
    Integer(1, 20, name='epochs'),
]


@use_named_args(space)
def score(**params):
    """Objective for skopt: mean test rank from a single bootstrap repeat."""
    return bootstrap(params, interaction_matrix, 1)


optimizer = Optimizer(
    dimensions=space,
    random_state=1,
    base_estimator='ET',
    n_random_starts=12,
)

# Ask/tell loop: 5 rounds, each evaluating 4 candidate points in parallel.
for i in range(5):
    print(i)
    x = optimizer.ask(n_points=4)
    y = Parallel(n_jobs=4)(delayed(score)(v) for v in x)
    optimizer.tell(x, y)

# Bundle everything the optimizer has seen into a single result object.
result = skopt.utils.create_result(
    optimizer.Xi,
    optimizer.yi,
    optimizer.space,
    optimizer.rng,
    models=optimizer.models,
)


0
1
2
3
4


In [25]:
# Display the objective value stored on the result object -- presumably the
# lowest mean rank seen during the search; confirm against skopt's docs.
result.fun

5457.995290423862

In [26]:
# Best hyperparameters for wide-format data: the evaluated point whose
# objective value (mean rank) is the smallest.
best_index = np.argmin(result.func_vals)
result.x_iters[best_index]

[20, 5, 0.013877212573012852, 17]

# Now, compare wide- and long- format using a single hyperparameter set.

This uses the best parameters from `hpo_lightfm_warp.dat`

It shows that mean rank (determined wide-ways) is better using the long-format input. 

'Wide-ways' means the mean rank is a lot larger! That's simply due to there being a lot of ligands: the ranks go all the way up to ~100k's, whereas calculating the long-ways ranks only goes up to about 250 (the number of proteins).

In [13]:
# One train/test split, kept in the matrix's original orientation.
train, test = utils.train_test_split(interaction_matrix, 0.05)
test = np.array(test.todense(), dtype=bool)

# Index ranges matching the (un-transposed) training matrix.
cid = np.arange(train.shape[0])
tid = np.arange(train.shape[1])

model = lightfm.LightFM(
    no_components=127,
    loss='warp',
    max_sampled=9,
    learning_rate=0.0561,
)
model.fit(train, epochs=6)

# Score every (row, column) pair, then fold the flat vector into a matrix.
pred_matrix = model.predict(np.repeat(cid, len(tid)), np.tile(tid, len(cid)))
pred_matrix = pred_matrix.reshape(len(cid), len(tid))

# The model was fitted on the un-transposed (long-format) matrix. Transpose
# the predictions so ranks are computed along the same axis as in the
# wide-format run, which makes the two numbers comparable.
pred_matrix = pred_matrix.T

# Double argsort: per-row rank of each entry, highest score first.
order = np.argsort(-pred_matrix, axis=1)
ranks = np.argsort(order, axis=1)

print('long format mean rank:')
# test.T lines the held-out mask up with the transposed predictions.
print(np.mean(ranks[test.T]))

long format mean rank:
4155.326234867898


In [18]:
# Flip the training matrix into the transposed (wide) orientation.
# NOTE(review): this rebinds `train` in place, so re-running this cell flips
# it back again -- run it exactly once after the long-format cell.
train = train.T

# Index ranges matching the transposed training matrix.
cid = np.arange(train.shape[0])
tid = np.arange(train.shape[1])

model = lightfm.LightFM(
    no_components=127,
    loss='warp',
    max_sampled=9,
    learning_rate=0.0561,
)
model.fit(train, epochs=6)

# Score every (row, column) pair, then fold the flat vector into a matrix.
pred_matrix = model.predict(np.repeat(cid, len(tid)), np.tile(tid, len(cid)))
pred_matrix = pred_matrix.reshape(len(cid), len(tid))

# Double argsort: per-row rank of each entry, highest score first.
order = np.argsort(-pred_matrix, axis=1)
ranks = np.argsort(order, axis=1)

print('wide format mean rank:')
# test was split before the transpose, so flip it to match the predictions.
print(np.mean(ranks[test.T]))

wide format mean rank:
5585.520190863303


# Next, compare wide- and long- format with their respective best hyperparameters

Long input format is _still_ better. 

In [27]:
# Refit on the wide-format training matrix using the hyperparameters found by
# the wide-format search (values rounded from the search output).
model = lightfm.LightFM(no_components=20,
                        loss='warp',
                        max_sampled=5,
                        learning_rate=0.01387)
model.fit(train, epochs=17)

# Rebuild the index ranges from the matrix that was just fitted, rather than
# trusting whatever `cid`/`tid` an earlier cell left behind.
cid = np.arange(train.shape[0])
tid = np.arange(train.shape[1])

# Score every (row, column) pair, then fold the flat vector into a matrix.
pred_matrix = model.predict(np.repeat(cid, len(tid)), np.tile(tid, len(cid)))
pred_matrix = np.reshape(pred_matrix, (len(cid), len(tid)))

# Double argsort: per-row rank of each entry, highest score first.
order = (-pred_matrix).argsort(axis=1)
ranks = order.argsort(axis=1)
print('wide format mean rank:')
print(np.mean(ranks[test.T]))  # use test.T to compare fairly

wide format mean rank:
5441.103030838562
