In [2]:
import pandas as pd
import numpy as np
import pickle
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split
import random
from tqdm.notebook import tqdm
import json

np.random.seed = 42

In [3]:
listening_counts = pd.read_csv('listening-counts.tsv', sep='\t')
users = pd.read_csv('users.tsv', sep='\t')


In [4]:
users_sample = users[(users['age']!=-1) & (users['country'].notna())].sample(2000, random_state=42)
users_sample

Unnamed: 0,user_id,country,age,gender,creation_time
17019,17019,CZ,20,m,2007-12-04 14:23:32
2697,2697,ES,25,f,2005-10-16 18:32:47
52973,52973,RU,17,m,2010-08-25 12:57:40
30310,30310,PL,18,m,2009-02-11 14:08:23
24701,24701,US,35,m,2008-09-13 03:48:50
...,...,...,...,...,...
20608,20608,PL,24,m,2008-04-19 18:45:40
10772,10772,UA,22,m,2007-03-12 20:39:25
21294,21294,US,21,m,2008-05-17 02:40:50
19659,19659,DE,26,m,2008-03-13 03:31:27


In [7]:
counts_sample = listening_counts[listening_counts['user_id'].isin(users_sample['user_id'])]
counts_sample['rating'] = counts_sample['count'] / counts_sample.groupby('user_id')['count'].transform('max')
ratings_df = counts_sample[counts_sample['track_id'].isin(\
            list(counts_sample.groupby('track_id').sum()['rating'].sort_values(ascending=False)[:3000].index))]
ratings_df['tracks_count'] = ratings_df.groupby('user_id')['count'].transform('count')
ratings_df = ratings_df[ratings_df['tracks_count'] >= 5]
#ratings_df.drop(columns=['count', 'tracks_count'], inplace=True)
ratings_df.drop(columns=['tracks_count'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  counts_sample['rating'] = counts_sample['count'] / counts_sample.groupby('user_id')['count'].transform('max')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_df['tracks_count'] = ratings_df.groupby('user_id')['count'].transform('count')


In [8]:
ratings_df

Unnamed: 0,user_id,track_id,count,rating
115169,44126,14685855,3,0.021127
115172,44126,24231732,4,0.028169
115173,44126,28832689,3,0.021127
115235,44126,10844010,1,0.007042
115237,44126,18052107,1,0.007042
...,...,...,...,...
518913850,76791,18551625,1,0.333333
518913853,76791,16929577,1,0.333333
518913854,76791,27546184,1,0.333333
518913857,76791,37945096,1,0.333333


In [9]:
ratings_df.to_csv('ratings_df.csv', index=False)

In [97]:
counts_sample.groupby('track_id').sum()['rating'].sort_values(ascending=False)

track_id
36346257    62.593538
33619193    55.544258
20926153    55.098906
32083560    53.709506
36039983    53.637374
              ...    
32453044     0.000060
17522471     0.000060
41137397     0.000060
24111365     0.000060
44736280     0.000060
Name: rating, Length: 4996467, dtype: float64

In [78]:
ratings_df_train, ratings_df_test = train_test_split(ratings_df,
                                   stratify=ratings_df['user_id'], 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(ratings_df_train))
print('# interactions on Test set: %d' % len(ratings_df_test))

# interactions on Train set: 499252
# interactions on Test set: 124813


In [79]:
ratings_df = ratings_df.set_index('user_id')
ratings_df_train = ratings_df_train.set_index('user_id')
ratings_df_test = ratings_df_test.set_index('user_id')

In [80]:
def get_items_listened(person_id, listened_df):
    # Get the user's data and merge in the movie information.
    listened_items = listened_df.loc[person_id]['track_id']
    return set(listened_items if type(listened_items) == pd.Series else [listened_items])

In [81]:
all_items = set(ratings_df['track_id'])

In [82]:
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100


class ModelEvaluator:

    def get_not_listened_items_sample(self, person_id, sample_size, seed=42):
        listened_items = get_items_listened(person_id, ratings_df)
        non_listened_items = all_items - listened_items

        random.seed(seed)
        non_listened_items_sample = random.sample(non_listened_items, sample_size)
        return set(non_listened_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):
        try:
            index = next(i for i, c in enumerate(recommended_items) if c == item_id)
        except:
            index = -1
        hit = int(index in range(0, topn))
        return hit, index

    def evaluate_model_for_user(self, model, person_id):
        # Getting the items in test set
        listened_values_testset = ratings_df_test.loc[person_id]
        if type(listened_values_testset['track_id']) == pd.Series:
            person_listened_items_testset = set(listened_values_testset['track_id'])
        else:
            person_listened_items_testset = set([int(listened_values_testset['track_id'])])
        listened_items_count_testset = len(person_listened_items_testset)

        # Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id,
                                               items_to_ignore=get_items_listened(person_id,
                                                                                    ratings_df_train),
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        # For each item the user has listened in test set
        for item_id in person_listened_items_testset:
            # Getting a random sample (100) items the user has not listened 
            # (to represent items that are assumed to be no relevant to the user)
            non_listened_items_sample = self.get_not_listened_items_sample(person_id,
                                                                               sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS,
                                                                               seed=item_id % (2 ** 32))

            # Combining the current listened item with the 100 random items
            items_to_filter_recs = non_listened_items_sample.union(set([item_id]))

            # Filtering only recommendations that are either the listened item or from a random sample of 100 non-listened items
            valid_recs_df = person_recs_df[person_recs_df['track_id'].isin(items_to_filter_recs)]
            valid_recs = valid_recs_df['track_id'].values
            # Verifying if the current listened item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        # Recall is the rate of the listened items that are ranked among the Top-N recommended items, 
        # when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(listened_items_count_testset)
        recall_at_10 = hits_at_10_count / float(listened_items_count_testset)

        person_metrics = {'hits@5_count': hits_at_5_count,
                          'hits@10_count': hits_at_10_count,
                          'listened_count': listened_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        # print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(tqdm(list(ratings_df_test.index.unique().values))):
            # if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)
            person_metrics['user_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
            .sort_values('listened_count', ascending=False)

        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(
            detailed_results_df['listened_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(
            detailed_results_df['listened_count'].sum())

        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}
        return global_metrics, detailed_results_df


model_evaluator = ModelEvaluator()    

In [83]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df):
        self.cf_predictions_df = cf_predictions_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recStrength'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['track_id'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)

        return recommendations_df
    

In [84]:
pivot_train = ratings_df.pivot_table(index='user_id', columns='track_id', values='rating').fillna(0)
csr_coll_matrix_train = csr_matrix(pivot_train)

In [85]:
NUMBER_OF_FACTORS_MF = 15

U, sigma, Vt = svds(csr_coll_matrix_train, k = NUMBER_OF_FACTORS_MF)

sigma = np.diag(sigma)

In [86]:
predicted_ratings_train = np.dot(np.dot(U, sigma), Vt) 
predicted_ratings_train_norm = (predicted_ratings_train - 
                                   predicted_ratings_train.min()) / (predicted_ratings_train.max()
                                                                        - predicted_ratings_train.min())

In [87]:
cf_preds_df = pd.DataFrame(predicted_ratings_train_norm,
                           columns = pivot_train.columns,
                           index=list(pivot_train.index)).transpose()

In [88]:
cf_recommender_model = CFRecommender(cf_preds_df)

In [89]:
model_evaluator = ModelEvaluator() 

In [90]:
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)

  0%|          | 0/1939 [00:00<?, ?it/s]

1938 users processed


In [91]:
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)


Global metrics:
{'modelName': 'Collaborative Filtering', 'recall@5': 0.33112736654034436, 'recall@10': 0.4711688686274667}


Unnamed: 0,hits@5_count,hits@10_count,listened_count,recall@5,recall@10,user_id
160,144,180,350,0.411429,0.514286,54443
511,131,178,343,0.381924,0.51895,52439
612,123,149,342,0.359649,0.435673,26625
732,109,145,319,0.341693,0.454545,16026
463,98,132,312,0.314103,0.423077,20234
906,71,110,307,0.23127,0.358306,16767
55,80,120,292,0.273973,0.410959,24549
74,85,132,289,0.294118,0.456747,6651
30,192,224,287,0.66899,0.780488,27263
407,80,117,287,0.278746,0.407666,10455


In [21]:
def dcg_at_k(r, k):
    """Score is discounted cumulative gain (dcg)
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
    Returns:
        Discounted cumulative gain
    """
    r = np.asfarray(r)[:k]
    return np.sum(r / np.log2(np.arange(2, r.size + 2)))

def ndcg_at_k(r, k):
    """Score is normalized discounted cumulative gain (ndcg)
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
    Returns:
        Normalized discounted cumulative gain
    """
    dcg_max = dcg_at_k(sorted(r, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max

In [92]:
interactions_df = (ratings_df_train
                   .reset_index()
                   .groupby('user_id')['track_id']
                   .agg(lambda x: list(x)).reset_index()
                    .rename(columns={'track_id': 'true_train'})
                  .set_index('user_id'))

interactions_df['true_test'] = (
    ratings_df_test
    .reset_index()
    .groupby('user_id')['track_id'].agg(lambda x: list(x))
)

# заполнение пропусков пустыми списками
interactions_df.loc[pd.isnull(interactions_df.true_test), 'true_test'] = [
    [''] for x in range(len(interactions_df.loc[pd.isnull(interactions_df.true_test), 'true_test']))]

interactions_df.head(5)

Unnamed: 0_level_0,true_train,true_test
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5,"[14307760, 29732892, 4348456, 8643011, 1463061...","[41314173, 16262974, 7026677, 23537059, 794305..."
9,"[18113974, 22363431, 43590126, 28203136, 75258...","[15904121, 19309434, 18676165, 38762205, 19028..."
65,"[19012147, 37710883, 37172107, 36039983, 27475...","[11196669, 40913219, 21115952, 12623509, 44061..."
76,"[13785944, 4386315, 20889724, 40109528, 234834...","[30368975, 33778999, 11431730, 42824783, 25696..."
110,"[23326061, 35051782, 12418169, 18526104, 44623...","[41411019, 28203136, 16844849, 12451427, 98502..."


In [93]:
top_k = 10

cf_preds_df_t = cf_preds_df.transpose()

predictions = []

for user_id in tqdm(interactions_df.index):
    prediction = (
        cf_preds_df_t
        .loc[user_id]
        .sort_values(ascending=False)
        .index.values
    )
    
    predictions.append(
        list(prediction[~np.in1d(
            prediction,
            interactions_df.loc[user_id, 'true_train'])])[:top_k])

interactions_df['prediction_svd'] = predictions

  0%|          | 0/1939 [00:00<?, ?it/s]

In [94]:
interactions_df

Unnamed: 0_level_0,true_train,true_test,prediction_svd
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,"[14307760, 29732892, 4348456, 8643011, 1463061...","[41314173, 16262974, 7026677, 23537059, 794305...","[32749441, 40660771, 40071241, 26445594, 40477..."
9,"[18113974, 22363431, 43590126, 28203136, 75258...","[15904121, 19309434, 18676165, 38762205, 19028...","[39478726, 12327833, 24361147, 45362359, 43044..."
65,"[19012147, 37710883, 37172107, 36039983, 27475...","[11196669, 40913219, 21115952, 12623509, 44061...","[22461110, 14323989, 43014490, 38045207, 17774..."
76,"[13785944, 4386315, 20889724, 40109528, 234834...","[30368975, 33778999, 11431730, 42824783, 25696...","[30441424, 22268461, 33458906, 14323989, 21320..."
110,"[23326061, 35051782, 12418169, 18526104, 44623...","[41411019, 28203136, 16844849, 12451427, 98502...","[32083560, 10290950, 26445594, 44219294, 23841..."
...,...,...,...
119268,"[44237210, 29321338, 7574307, 20555631, 438631...","[19512430, 5145627, 7085842, 16558683, 2796986...","[38944354, 17774819, 12001256, 29650643, 48139..."
119638,"[6860393, 36410186, 40011180, 13007884, 44777820]",[31448925],"[9793081, 24309553, 18009623, 36039983, 367043..."
119645,"[27534747, 23642726, 4489625, 21238394, 211731...","[28909250, 23664795, 12418169, 34702656, 18162...","[20562662, 28979867, 29675359, 10359387, 30387..."
119897,"[45138264, 8429958, 19606468, 4632002, 1782010...","[45781496, 15944840, 31965040]","[37857253, 15656770, 38183644, 37090419, 44973..."


In [18]:
def calc_precision_10(column):
    return (
        interactions_df
        .apply(
            lambda row:
            len(set(row['true_test']).intersection(
                set(row[column]))) / 10.0,
            axis=1)).mean()

In [19]:
def calc_recall_10(column):
    return (
        interactions_df
        .apply(
            lambda row:
            len(set(row['true_test']).intersection(
                set(row[column]))) / len(row['true_test']) + 0.001,
            axis=1)).mean()

In [95]:
calc_recall_10('prediction_svd')

0.06828695516921039

In [96]:
calc_precision_10('prediction_svd')

0.3382155750386797

In [65]:
with open('micro_genre.pkl', 'rb') as f:
    micro_genre = pickle.load(f)

In [66]:
len(micro_genre[micro_genre['i'].isin(ratings_df['track_id'])]['micro_genre'].unique())

660

In [68]:
micro_genre[micro_genre['i'].isin(ratings_df['track_id'])]

Unnamed: 0,artist,track,i,micro_genre,weight
0,Gotye,Somebody That I Used to Know,36346257,pop,19
1,Gotye,Somebody That I Used to Know,36346257,indie pop,10
2,Gotye,Somebody That I Used to Know,36346257,rock,5
3,Gotye,Somebody That I Used to Know,36346257,singer-songwriter,4
4,Gotye,Somebody That I Used to Know,36346257,indie rock,4
...,...,...,...,...,...
4983352,Alan Parsons Project,Eye In The Sky,14456757,easy listening,4
4983353,Alan Parsons Project,Eye In The Sky,14456757,pop rock,2
4983354,Alan Parsons Project,Eye In The Sky,14456757,album rock,2
4983355,Alan Parsons Project,Eye In The Sky,14456757,yacht rock,2


## Увеличиваем объем треков для выборки

In [10]:
counts_sample = listening_counts[listening_counts['user_id'].isin(users_sample['user_id'])]
counts_sample['rating'] = counts_sample['count'] / counts_sample.groupby('user_id')['count'].transform('max')
ratings_df = counts_sample[counts_sample['track_id'].isin(\
            list(counts_sample.groupby('track_id').sum()['rating'].sort_values(ascending=False)[:200000].index))]
ratings_df['tracks_count'] = ratings_df.groupby('user_id')['count'].transform('count')
ratings_df = ratings_df[ratings_df['tracks_count'] >= 5]
ratings_df.drop(columns=['count', 'tracks_count'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  counts_sample['rating'] = counts_sample['count'] / counts_sample.groupby('user_id')['count'].transform('max')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_df['tracks_count'] = ratings_df.groupby('user_id')['count'].transform('count')


In [11]:
ratings_df_train, ratings_df_test = train_test_split(ratings_df,
                                   stratify=ratings_df['user_id'], 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(ratings_df_train))
print('# interactions on Test set: %d' % len(ratings_df_test))

# interactions on Train set: 4244624
# interactions on Test set: 1061156


In [12]:
ratings_df = ratings_df.set_index('user_id')
ratings_df_train = ratings_df_train.set_index('user_id')
ratings_df_test = ratings_df_test.set_index('user_id')

In [13]:
interactions_df = (ratings_df_train
                   .reset_index()
                   .groupby('user_id')['track_id']
                   .agg(lambda x: list(x)).reset_index()
                    .rename(columns={'track_id': 'true_train'})
                  .set_index('user_id'))

interactions_df['true_test'] = (
    ratings_df_test
    .reset_index()
    .groupby('user_id')['track_id'].agg(lambda x: list(x))
)

# заполнение пропусков пустыми списками
interactions_df.loc[pd.isnull(interactions_df.true_test), 'true_test'] = [
    [''] for x in range(len(interactions_df.loc[pd.isnull(interactions_df.true_test), 'true_test']))]

interactions_df.head(5)

Unnamed: 0_level_0,true_train,true_test
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5,"[8733304, 44532922, 18433433, 37483146, 873821...","[39457658, 30610309, 17387182, 36840190, 19799..."
9,"[18214242, 18077323, 17673797, 21425276, 14887...","[18687564, 7849323, 40890797, 21088516, 457783..."
65,"[19453436, 42461011, 39797118, 35079236, 35284...","[14966565, 18674105, 20121435, 15836384, 15791..."
76,"[38141401, 40495238, 31841795, 16265596, 13879...","[9151975, 14464842, 31806064, 9633926, 3635144..."
110,"[39560985, 7061374, 36040233, 35935326, 117518...","[7505656, 3287935, 16875689, 5542036, 34316396..."


In [15]:
pivot_train = ratings_df.pivot_table(index='user_id', columns='track_id', values='rating').fillna(0)
csr_coll_matrix_train = csr_matrix(pivot_train)

NUMBER_OF_FACTORS_MF = 15
U, sigma, Vt = svds(csr_coll_matrix_train, k = NUMBER_OF_FACTORS_MF)
sigma = np.diag(sigma)

predicted_ratings_train = np.dot(np.dot(U, sigma), Vt) 
predicted_ratings_train_norm = (predicted_ratings_train - 
                                   predicted_ratings_train.min()) / (predicted_ratings_train.max()
                                                                        - predicted_ratings_train.min())

cf_preds_df = pd.DataFrame(predicted_ratings_train_norm,
                           columns = pivot_train.columns,
                           index=list(pivot_train.index))

In [16]:
top_k = 10

predictions = []

for user_id in tqdm(interactions_df.index):
    prediction = (
        cf_preds_df
        .loc[user_id]
        .sort_values(ascending=False)
        .index.values
    )
    
    predictions.append(
        list(prediction[~np.in1d(
            prediction,
            interactions_df.loc[user_id, 'true_train'])])[:top_k])

interactions_df['prediction_svd'] = predictions

  0%|          | 0/1998 [00:00<?, ?it/s]

In [20]:
calc_recall_10('prediction_svd')

0.006582834464293536

In [21]:
calc_precision_10('prediction_svd')

0.21161161161161163