In [2]:
import numpy as np
import pandas as pd

from tqdm import tqdm

from polara import get_movielens_data
from polara.preprocessing.dataframes import leave_one_out, reindex

from scipy.sparse import csr_matrix

from dataprep import transform_indices
from evaluation import topn_recommendations

import numpy as np
from lightfm import LightFM



In [3]:
data, genres = get_movielens_data(include_time=True, get_genres=True)

In [4]:
data = data.head(10000)

In [5]:
genres['genreid'], _ = pd.factorize(genres['genreid'])
genres = genres.loc[:, ['movieid','genreid']]

In [6]:
training_, holdout_ = leave_one_out(data, target='timestamp', sample_top=True, random_state=0)

training = training_.sort_values('userid')
holdout = holdout_.sort_values('userid')

In [7]:
train_matrix = csr_matrix((training['rating'], (training['userid'], training['movieid'])))
holdout_matrix = csr_matrix((holdout['rating'], (holdout['userid'], holdout['movieid'])))

In [8]:
lfm_config = dict(
    no_components = 60,
    loss = 'warp',
    max_sampled = 1,
    max_epochs = 60,
    learning_schedule = 'adagrad',
    user_alpha = 1e-3,
    item_alpha = 1e-3,
    random_state = 7032023
)

In [14]:
topn = 10
trashholds = [24,25,26,27,28,29,30,31,32,33,34,35]
# trashholds = [36,37,38,39,40,41,42,43,44,45]
user_ratings_count = training.groupby('userid')['rating'].count()
res = []

In [10]:
def build_lfm_model(config, data, data_description, early_stop_config=None, iterator=None):
    # the model
    model = LightFM(
        no_components = config['no_components'],
        loss = config['loss'],
        learning_schedule = config['learning_schedule'],
        learning_rate = config.get('learning_rate', 0.05), # default as in the docs
        user_alpha = config['user_alpha'],
        item_alpha = config['item_alpha'],
        max_sampled = config['max_sampled'],
        random_state = config.get('random_state', None)
    )
    # early stoppping configuration
    es_config = check_early_stop_config(early_stop_config)

    # training
    if iterator is None:
        iterator = lambda x: x
    for epoch in iterator(range(config['max_epochs'])):
        try:
            train_lfm_epoch(epoch, model, data, data_description, es_config)
        except StopIteration:
            break
    return model


def check_early_stop_config(early_stop_config):
    if early_stop_config is None:
        early_stop_config = {}
    try:
        es_dict = {
            'early_stopper': early_stop_config['evaluation_callback'],
            'callback_interval': early_stop_config['callback_interval'],
            'holdout': early_stop_config['holdout'],
            'data_description': early_stop_config['data_description'],
            'stop_early': True
        }
    except KeyError: # config is invalid, doesn't contain required keys
        es_dict = {'stop_early': False} # disable early stopping
    return es_dict


def train_lfm_epoch(
    epoch, model, train, data_description, es_config,
):
 
    model.fit_partial(
        train,
        user_features = data_description['user_features'],
        item_features = data_description['item_features'],
        epochs = 1
    )
    if not es_config['stop_early']:
        return

    metrics_check_interval = es_config['callback_interval']
    if (epoch+1) % metrics_check_interval == 0:
        # evaluate model and raise StopIteration if early stopping condition is met
        early_stopper_call = es_config['early_stopper']
        early_stopper_call(
            epoch,
            model,
            es_config['holdout'],
            es_config['data_description']
        )

In [11]:
def lightfm_scoring_base(model, data_description, start_users):
    dtype = 'i4'
    all_users = np.arange(data_description['n_users'], dtype=dtype)
    test_items = start_users.astype(dtype)
    item_index, user_index = np.meshgrid(test_items, all_users, copy=False)

    print(data_description)
    
    lfm_scores = model.predict(
        user_index.ravel(),
        item_index.ravel(),
        item_features = data_description['item_features'],
        user_features = data_description['user_features']
    )
    scores = lfm_scores.reshape(len(test_items), len(all_users), order='F')

    return scores

In [15]:
for warm_start_threshold in trashholds:
    cold_start_users = user_ratings_count[user_ratings_count <= warm_start_threshold].index
    warm_start_users = user_ratings_count[user_ratings_count > warm_start_threshold].index  

    print(cold_start_users.shape[0])
    print(warm_start_users.shape[0])

    if cold_start_users.shape[0]*warm_start_users.shape[0] > 0:

        cold_training = training[training['userid'].isin(cold_start_users)]

        cold_common_ids = set(cold_training['movieid']) & set(genres['movieid'])
        cold_genres = genres[genres['movieid'].isin(cold_common_ids)]
        cold_training = cold_training[cold_training['movieid'].isin(cold_common_ids)]

        cold_genre_matrix = pd.crosstab(cold_genres.movieid, cold_genres.genreid)
        cold_movie_id = cold_genre_matrix.copy()
        cold_genre_matrix = csr_matrix(cold_genre_matrix)

        cold_movie_id = cold_movie_id.index

        cold_rating_matrix = pd.pivot_table(data = cold_training, values='rating', index='userid', columns='movieid', fill_value=2.5)
        cold_rating_matrix = csr_matrix(cold_rating_matrix)

        cold_data_description = dict(
            users = 'userid',
            items = 'movieid',
            n_users = len(cold_start_users),
            n_items = cold_genre_matrix.shape[0],
            #user_features = cold_rating_matrix,
            user_features = None,
            # item_features = cold_genre_matrix
            item_features = None
        )
        
        cold_model = build_lfm_model(
            lfm_config,
            cold_rating_matrix,
            cold_data_description,
            iterator = tqdm
        )

        cold_scores = lightfm_scoring_base(cold_model, cold_data_description, cold_start_users)
        cold_recs = topn_recommendations(cold_scores, topn=topn)

        warm_training = training[training['userid'].isin(warm_start_users)]
        warm_common_ids = set(warm_training['movieid']) & set(genres['movieid'])
        warm_genres = genres[genres['movieid'].isin(warm_common_ids)]


        warm_training = warm_training[warm_training['movieid'].isin(warm_common_ids)]

        warm_genre_matrix = pd.crosstab(warm_genres.movieid, warm_genres.genreid)
        warm_movie_id = warm_genre_matrix.copy()

        warm_genre_matrix = csr_matrix(warm_genre_matrix)
        warm_movie_id = warm_movie_id.index

        warm_rating_matrix = pd.pivot_table(data = warm_training, values='rating', index='userid', columns='movieid', fill_value=2.5)
        warm_rating_matrix = csr_matrix(warm_rating_matrix)

        warm_data_description = dict(
            users = 'userid',
            items = 'movieid',
            n_users = len(warm_start_users),
            n_items = warm_genre_matrix.shape[0],
            user_features = warm_rating_matrix,
            # user_features = None,
            item_features = warm_genre_matrix # compound features are not present in the dataset
        )
        
        warm_model = build_lfm_model(
            lfm_config,
            warm_rating_matrix,
            warm_data_description,
            iterator = tqdm
        )

        warm_scores = lightfm_scoring_base(warm_model, warm_data_description, warm_start_users)
        warm_recs = topn_recommendations(warm_scores, topn=topn)

        cold_holdout = holdout[holdout.userid.isin(cold_start_users)]
        warm_holdout = holdout[holdout.userid.isin(warm_start_users)]

        cold_coin = 0
        for i in range(0, len(cold_start_users)):
            # print(i)
            # cold_holdout.movieid.iloc[i]
            if cold_holdout.movieid.iloc[i] in cold_movie_id[cold_recs[i,:]]:
                cold_coin += 1

        warm_coin = 0
        for i in range(0, len(warm_start_users)):
            # print(i)|
            # cold_holdout.movieid.iloc[i]
            if warm_holdout.movieid.iloc[i] in warm_movie_id[warm_recs[i,:]]:
                warm_coin += 1

        res.append([warm_start_threshold, cold_coin, warm_coin, cold_start_users.shape[0], warm_start_users.shape[0]])

    print(res)

10
60


100%|██████████| 60/60 [00:00<00:00, 1007.25it/s]

{'users': 'userid', 'items': 'movieid', 'n_users': 10, 'n_items': 172, 'user_features': None, 'item_features': None}



100%|██████████| 60/60 [07:55<00:00,  7.93s/it]


{'users': 'userid', 'items': 'movieid', 'n_users': 60, 'n_items': 2144, 'user_features': <60x2144 sparse matrix of type '<class 'numpy.float64'>'
	with 128640 stored elements in Compressed Sparse Row format>, 'item_features': <2144x18 sparse matrix of type '<class 'numpy.int64'>'
	with 3978 stored elements in Compressed Sparse Row format>}
[[24, 0, 2, 10, 60]]
11
59


100%|██████████| 60/60 [00:00<00:00, 818.96it/s]

{'users': 'userid', 'items': 'movieid', 'n_users': 11, 'n_items': 187, 'user_features': None, 'item_features': None}



100%|██████████| 60/60 [07:37<00:00,  7.62s/it]


{'users': 'userid', 'items': 'movieid', 'n_users': 59, 'n_items': 2142, 'user_features': <59x2142 sparse matrix of type '<class 'numpy.float64'>'
	with 126378 stored elements in Compressed Sparse Row format>, 'item_features': <2142x18 sparse matrix of type '<class 'numpy.int64'>'
	with 3975 stored elements in Compressed Sparse Row format>}
[[24, 0, 2, 10, 60], [25, 0, 0, 11, 59]]
12
58


100%|██████████| 60/60 [00:00<00:00, 705.77it/s]

{'users': 'userid', 'items': 'movieid', 'n_users': 12, 'n_items': 200, 'user_features': None, 'item_features': None}



100%|██████████| 60/60 [07:29<00:00,  7.49s/it]


{'users': 'userid', 'items': 'movieid', 'n_users': 58, 'n_items': 2140, 'user_features': <58x2140 sparse matrix of type '<class 'numpy.float64'>'
	with 124120 stored elements in Compressed Sparse Row format>, 'item_features': <2140x18 sparse matrix of type '<class 'numpy.int64'>'
	with 3972 stored elements in Compressed Sparse Row format>}
[[24, 0, 2, 10, 60], [25, 0, 0, 11, 59], [26, 0, 1, 12, 58]]
12
58


100%|██████████| 60/60 [00:00<00:00, 784.14it/s]


{'users': 'userid', 'items': 'movieid', 'n_users': 12, 'n_items': 200, 'user_features': None, 'item_features': None}


100%|██████████| 60/60 [07:25<00:00,  7.42s/it]


{'users': 'userid', 'items': 'movieid', 'n_users': 58, 'n_items': 2140, 'user_features': <58x2140 sparse matrix of type '<class 'numpy.float64'>'
	with 124120 stored elements in Compressed Sparse Row format>, 'item_features': <2140x18 sparse matrix of type '<class 'numpy.int64'>'
	with 3972 stored elements in Compressed Sparse Row format>}
[[24, 0, 2, 10, 60], [25, 0, 0, 11, 59], [26, 0, 1, 12, 58], [27, 0, 1, 12, 58]]
12
58


100%|██████████| 60/60 [00:00<00:00, 799.70it/s]


{'users': 'userid', 'items': 'movieid', 'n_users': 12, 'n_items': 200, 'user_features': None, 'item_features': None}


100%|██████████| 60/60 [07:27<00:00,  7.46s/it]


{'users': 'userid', 'items': 'movieid', 'n_users': 58, 'n_items': 2140, 'user_features': <58x2140 sparse matrix of type '<class 'numpy.float64'>'
	with 124120 stored elements in Compressed Sparse Row format>, 'item_features': <2140x18 sparse matrix of type '<class 'numpy.int64'>'
	with 3972 stored elements in Compressed Sparse Row format>}
[[24, 0, 2, 10, 60], [25, 0, 0, 11, 59], [26, 0, 1, 12, 58], [27, 0, 1, 12, 58], [28, 0, 1, 12, 58]]
12
58


100%|██████████| 60/60 [00:00<00:00, 736.08it/s]


{'users': 'userid', 'items': 'movieid', 'n_users': 12, 'n_items': 200, 'user_features': None, 'item_features': None}


100%|██████████| 60/60 [07:20<00:00,  7.33s/it]


{'users': 'userid', 'items': 'movieid', 'n_users': 58, 'n_items': 2140, 'user_features': <58x2140 sparse matrix of type '<class 'numpy.float64'>'
	with 124120 stored elements in Compressed Sparse Row format>, 'item_features': <2140x18 sparse matrix of type '<class 'numpy.int64'>'
	with 3972 stored elements in Compressed Sparse Row format>}
[[24, 0, 2, 10, 60], [25, 0, 0, 11, 59], [26, 0, 1, 12, 58], [27, 0, 1, 12, 58], [28, 0, 1, 12, 58], [29, 0, 1, 12, 58]]
13
57


100%|██████████| 60/60 [00:00<00:00, 607.00it/s]


{'users': 'userid', 'items': 'movieid', 'n_users': 13, 'n_items': 217, 'user_features': None, 'item_features': None}


100%|██████████| 60/60 [07:24<00:00,  7.41s/it]


{'users': 'userid', 'items': 'movieid', 'n_users': 57, 'n_items': 2140, 'user_features': <57x2140 sparse matrix of type '<class 'numpy.float64'>'
	with 121980 stored elements in Compressed Sparse Row format>, 'item_features': <2140x18 sparse matrix of type '<class 'numpy.int64'>'
	with 3972 stored elements in Compressed Sparse Row format>}
[[24, 0, 2, 10, 60], [25, 0, 0, 11, 59], [26, 0, 1, 12, 58], [27, 0, 1, 12, 58], [28, 0, 1, 12, 58], [29, 0, 1, 12, 58], [30, 0, 1, 13, 57]]
13
57


100%|██████████| 60/60 [00:00<00:00, 645.13it/s]


{'users': 'userid', 'items': 'movieid', 'n_users': 13, 'n_items': 217, 'user_features': None, 'item_features': None}


100%|██████████| 60/60 [07:20<00:00,  7.35s/it]


{'users': 'userid', 'items': 'movieid', 'n_users': 57, 'n_items': 2140, 'user_features': <57x2140 sparse matrix of type '<class 'numpy.float64'>'
	with 121980 stored elements in Compressed Sparse Row format>, 'item_features': <2140x18 sparse matrix of type '<class 'numpy.int64'>'
	with 3972 stored elements in Compressed Sparse Row format>}
[[24, 0, 2, 10, 60], [25, 0, 0, 11, 59], [26, 0, 1, 12, 58], [27, 0, 1, 12, 58], [28, 0, 1, 12, 58], [29, 0, 1, 12, 58], [30, 0, 1, 13, 57], [31, 0, 1, 13, 57]]
13
57


100%|██████████| 60/60 [00:00<00:00, 658.43it/s]


{'users': 'userid', 'items': 'movieid', 'n_users': 13, 'n_items': 217, 'user_features': None, 'item_features': None}


  3%|▎         | 2/60 [00:22<10:43, 11.09s/it]


KeyboardInterrupt: 

In [13]:
print(res)

[[36, 0, 1], [37, 0, 1], [38, 0, 1], [39, 0, 1], [40, 0, 0], [41, 0, 0], [42, 1, 2], [43, 1, 2], [44, 1, 2], [45, 1, 2]]
