In [2]:
!pip install lightfm
import pandas as pd
import numpy as np
import pickle
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split
import random
from tqdm.notebook import tqdm
import json
from lightfm.data import Dataset
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank
from lightfm.cross_validation import random_train_test_split
from lightfm import LightFM
import dask.dataframe as dd

np.random.seed = 42

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightfm
  Downloading lightfm-1.16.tar.gz (310 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.1/310.1 KB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.16-cp38-cp38-linux_x86_64.whl size=917982 sha256=7b34f2cc177017ce39fb6b793eadf495b2a14a32e8bfa65574f621985fa7bc59
  Stored in directory: /root/.cache/pip/wheels/ec/bb/51/9c487d021c1373b691d13cadca0b65b6852627b1f3f43550fa
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.16


In [3]:
from google.colab import drive

drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [6]:
full_df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/rec sys/ratings_df.csv')
ratings_df = full_df.drop(columns=['count'])

In [7]:
ratings_df_train, ratings_df_test = train_test_split(ratings_df,
                                   stratify=ratings_df['user_id'], 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(ratings_df_train))
print('# interactions on Test set: %d' % len(ratings_df_test))

# interactions on Train set: 499252
# interactions on Test set: 124813


In [8]:
ratings_df = ratings_df.set_index('user_id')
ratings_df_train = ratings_df_train.set_index('user_id')
ratings_df_test = ratings_df_test.set_index('user_id')

In [13]:
pivot_train = ratings_df.pivot_table(index='user_id', columns='track_id', values='rating').fillna(0)
csr_coll_matrix_train = csr_matrix(pivot_train)

In [14]:
NUMBER_OF_FACTORS_MF = 15

U, sigma, Vt = svds(csr_coll_matrix_train, k = NUMBER_OF_FACTORS_MF)

sigma = np.diag(sigma)

In [15]:
predicted_ratings_train = np.dot(np.dot(U, sigma), Vt) 
predicted_ratings_train_norm = (predicted_ratings_train - 
                                   predicted_ratings_train.min()) / (predicted_ratings_train.max()
                                                                        - predicted_ratings_train.min())

In [16]:
cf_preds_df = pd.DataFrame(predicted_ratings_train_norm,
                           columns = pivot_train.columns,
                           index=list(pivot_train.index)).transpose()

In [None]:
def dcg_at_k(r, k):
    """Score is discounted cumulative gain (dcg)
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
    Returns:
        Discounted cumulative gain
    """
    r = np.asfarray(r)[:k]
    return np.sum(r / np.log2(np.arange(2, r.size + 2)))

def ndcg_at_k(r, k):
    """Score is normalized discounted cumulative gain (ndcg)
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
    Returns:
        Normalized discounted cumulative gain
    """
    dcg_max = dcg_at_k(sorted(r, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k) / dcg_max

In [35]:
interactions_df = (ratings_df_train
                   .reset_index()
                   .groupby('user_id')['track_id']
                   .agg(lambda x: list(x)).reset_index()
                    .rename(columns={'track_id': 'true_train'})
                  .set_index('user_id'))

interactions_df['true_test'] = (
    ratings_df_test
    .reset_index()
    .groupby('user_id')['track_id'].agg(lambda x: list(x))
)

# заполнение пропусков пустыми списками
interactions_df.loc[pd.isnull(interactions_df.true_test), 'true_test'] = [
    [''] for x in range(len(interactions_df.loc[pd.isnull(interactions_df.true_test), 'true_test']))]

interactions_df.head(5)

Unnamed: 0_level_0,true_train,true_test
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5,"[14307760, 29732892, 4348456, 8643011, 1463061...","[41314173, 16262974, 7026677, 23537059, 794305..."
9,"[18113974, 22363431, 43590126, 28203136, 75258...","[15904121, 19309434, 18676165, 38762205, 19028..."
65,"[19012147, 37710883, 37172107, 36039983, 27475...","[11196669, 40913219, 21115952, 12623509, 44061..."
76,"[13785944, 4386315, 20889724, 40109528, 234834...","[30368975, 33778999, 11431730, 42824783, 25696..."
110,"[23326061, 35051782, 12418169, 18526104, 44623...","[41411019, 28203136, 16844849, 12451427, 98502..."


In [36]:
top_k = 10

cf_preds_df_t = cf_preds_df.transpose()

predictions = []

for user_id in tqdm(interactions_df.index):
    prediction = (
        cf_preds_df_t
        .loc[user_id]
        .sort_values(ascending=False)
        .index.values
    )
    
    predictions.append(
        list(prediction[~np.in1d(
            prediction,
            interactions_df.loc[user_id, 'true_train'])])[:top_k])

interactions_df['prediction_svd'] = predictions

  0%|          | 0/1939 [00:00<?, ?it/s]

In [37]:
interactions_df

Unnamed: 0_level_0,true_train,true_test,prediction_svd
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,"[14307760, 29732892, 4348456, 8643011, 1463061...","[41314173, 16262974, 7026677, 23537059, 794305...","[32749441, 40660771, 40071241, 26445594, 40477..."
9,"[18113974, 22363431, 43590126, 28203136, 75258...","[15904121, 19309434, 18676165, 38762205, 19028...","[39478726, 12327833, 24361147, 45362359, 43044..."
65,"[19012147, 37710883, 37172107, 36039983, 27475...","[11196669, 40913219, 21115952, 12623509, 44061...","[22461110, 14323989, 43014490, 38045207, 17774..."
76,"[13785944, 4386315, 20889724, 40109528, 234834...","[30368975, 33778999, 11431730, 42824783, 25696...","[30441424, 22268461, 33458906, 14323989, 21320..."
110,"[23326061, 35051782, 12418169, 18526104, 44623...","[41411019, 28203136, 16844849, 12451427, 98502...","[32083560, 10290950, 26445594, 44219294, 23841..."
...,...,...,...
119268,"[44237210, 29321338, 7574307, 20555631, 438631...","[19512430, 5145627, 7085842, 16558683, 2796986...","[38944354, 17774819, 12001256, 29650643, 48139..."
119638,"[6860393, 36410186, 40011180, 13007884, 44777820]",[31448925],"[9793081, 24309553, 18009623, 36039983, 367043..."
119645,"[27534747, 23642726, 4489625, 21238394, 211731...","[28909250, 23664795, 12418169, 34702656, 18162...","[20562662, 28979867, 29675359, 10359387, 30387..."
119897,"[45138264, 8429958, 19606468, 4632002, 1782010...","[45781496, 15944840, 31965040]","[37857253, 15656770, 38183644, 37090419, 44973..."


In [38]:
def calc_precision_10(column):
    return (
        interactions_df
        .apply(
            lambda row:
            len(set(row['true_test']).intersection(
                set(row[column]))) / 10.0,
            axis=1)).mean()

In [39]:
def calc_recall_10(column):
    return (
        interactions_df
        .apply(
            lambda row:
            len(set(row['true_test']).intersection(
                set(row[column]))) / len(row['true_test']) + 0.001,
            axis=1)).mean()

In [40]:
calc_recall_10('prediction_svd')

0.06828695516921039

In [41]:
calc_precision_10('prediction_svd')

0.3382155750386797

## Сравнение с dask

In [52]:
count_df = full_df.drop(columns=['rating'])

In [53]:
dataset = Dataset()
%time
dataset.fit(count_df.user_id, count_df.track_id)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.3 µs


In [54]:
(interactions, weights) = dataset.build_interactions(count_df.itertuples(False, None))

In [55]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, Num items {}.'.format(num_users, num_items))

Num users: 1939, Num items 3000.


In [56]:
(train_interactions, test_interactions) = random_train_test_split(
    interactions,
    test_percentage=0.2,
    random_state=42,
)

(train_weights, test_weights) = random_train_test_split(
    weights,
    test_percentage=0.2,
    random_state=42,
)

In [57]:
model = LightFM(
    loss='warp',
    random_state=42,
)
model.fit(
    train_interactions,
    sample_weight=train_weights,
    epochs=10,
    num_threads=12,
    verbose=True,
)

Epoch: 100%|██████████| 10/10 [00:07<00:00,  1.35it/s]


<lightfm.lightfm.LightFM at 0x7f76ed73b310>

In [58]:
%%time
recall_at_k(
    model,
    test_interactions,
    train_interactions,
    k=10,
    num_threads=12,
).mean()

CPU times: user 1.54 s, sys: 2.64 ms, total: 1.55 s
Wall time: 1.02 s


0.057133911134356695

In [59]:
%%time
precision_at_k(
    model,
    test_interactions,
    train_interactions,
    k=10,
    num_threads=12,
).mean()

CPU times: user 1.54 s, sys: 4.84 ms, total: 1.54 s
Wall time: 867 ms


0.2954238