In [1]:
import glob
import json
import os

import numpy as np
import pandas as pd
import tqdm

from sklearn.metrics.pairwise import cosine_similarity

## User-based Collaborative Filtering

#### Основная идея: 
Рекомендовать пользователю треки, которые понравились похожим на него пользователям

$$\hat r_{ui} = h^{-1} \left( \frac{\sum_{v \in N_i(u)} w_{uv} h(r_{vi})}{\sum_{v \in N_i(u)} w_{uv}} \right)$$

$N_i(u)$ - соседи пользователя $u$, которые оценили айтем $i$,
$w_{uv}, w_{ij}$ - веса соседей, 
$h$ - функция нормализации



**Нормализация**: из времени прослушивания вычитаем среднее время прослушивания пользователя: $h(r_{ui}) = r_{ui} - \bar r_u$

**Веса**: Похожих пользователей будем искать по *cosine similarity*

**Отсутствующие данные**: заполним средним временем прослушивания по пользователю (после нормализации это 0)

**Соседи**: в качестве соседей будем рассматривать всех пользователей. Q: Как это упростит формулу?

$$\hat r_{ui} \propto \sum_{v} w_{uv} h(r_{vi})$$

In [2]:
# Glob pattern for the per-run interaction logs (JSON lines). Machine-specific path.
DATA_GLOB = "/Users/o.saprykin/Desktop/data/*/data.json"
# Fixed seed so the auxiliary "rnd" column is reproducible after Restart & Run All.
RANDOM_SEED = 42

# glob.glob returns paths in arbitrary order; sorting keeps the row order stable between runs.
data_paths = sorted(glob.glob(DATA_GLOB))
if not data_paths:
    # Fail with a message that names the pattern instead of pandas' generic concat error.
    raise FileNotFoundError(f"No interaction logs match {DATA_GLOB!r}")

# ignore_index=True gives the combined frame one unique row index instead of
# repeating each file's own 0..n-1 labels.
data = pd.concat(
    [pd.read_json(data_path, lines=True) for data_path in data_paths],
    ignore_index=True,
)
# Uniform random number per row; not used by the cells visible here
# (presumably for sampling/splitting elsewhere - TODO confirm).
data["rnd"] = np.random.default_rng(RANDOM_SEED).random(len(data))

data.head(5)

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,rnd
0,next,2025-03-05 15:37:08.180,1112,1129,1.0,0.040102,3161.0,{'STICKY_ARTIST': 'T1'},0.527541
1,next,2025-03-05 15:37:08.199,6224,44172,1.0,0.005623,36794.0,{'STICKY_ARTIST': 'T1'},0.667874
2,next,2025-03-05 15:37:08.203,3873,45369,1.0,0.00161,41696.0,{'STICKY_ARTIST': 'T8'},0.192139
3,next,2025-03-05 15:37:08.224,6362,26315,0.63,0.001398,26315.0,{'STICKY_ARTIST': 'T1'},0.433839
4,next,2025-03-05 15:37:08.233,3873,41696,0.01,0.001908,47234.0,{'STICKY_ARTIST': 'T8'},0.67312


In [3]:
# Center every `time` value on the mean `time` of the same user, so a positive
# value means "listened longer than this user usually does".
time_by_user = data.groupby("user")["time"]
data["normalized_time"] = time_by_user.transform(lambda played: played - played.mean())

data.head()

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,rnd,normalized_time
0,next,2025-03-05 15:37:08.180,1112,1129,1.0,0.040102,3161.0,{'STICKY_ARTIST': 'T1'},0.527541,0.601739
1,next,2025-03-05 15:37:08.199,6224,44172,1.0,0.005623,36794.0,{'STICKY_ARTIST': 'T1'},0.667874,0.628088
2,next,2025-03-05 15:37:08.203,3873,45369,1.0,0.00161,41696.0,{'STICKY_ARTIST': 'T8'},0.192139,0.594828
3,next,2025-03-05 15:37:08.224,6362,26315,0.63,0.001398,26315.0,{'STICKY_ARTIST': 'T1'},0.433839,0.268889
4,next,2025-03-05 15:37:08.233,3873,41696,0.01,0.001908,47234.0,{'STICKY_ARTIST': 'T8'},0.67312,-0.395172


In [4]:
# User x track matrix of centered listening times. pivot_table averages repeated
# (user, track) plays (its default aggregation); pairs with no plays become 0,
# which after centering stands for "the user's average".
interactions = data.pivot_table(
    values="normalized_time",
    index="user",
    columns="track",
).fillna(0)

# Share of non-zero cells. Plays exactly equal to the user's mean also count as zero here.
density = (interactions != 0).values.sum() / interactions.size
print(f"Interactions matrix: shape={interactions.shape}, density={density}")

Interactions matrix: shape=(9814, 44472), density=0.0004973579820494989


In [7]:
# Peek at the first rows of the user x track matrix.
interactions.head(5)

track,0,1,2,4,5,6,7,8,9,10,...,49987,49988,49990,49991,49992,49994,49995,49996,49997,49999
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Pairwise cosine similarity between the user rows of the interaction matrix.
similarity_matrix = cosine_similarity(interactions)
# A user must not act as their own neighbour: zero the diagonal (modifies the array in place).
np.fill_diagonal(similarity_matrix, 0)

# For each user, count the other users with strictly positive similarity.
positive_neighbours = (similarity_matrix > 0).sum(axis=1)
print(f"Mean positive neighbours per user: {positive_neighbours.mean()}")

Mean positive neighbours per user: 32.89993886284899


In [9]:
# For each user, count the other users with strictly negative similarity.
negative_neighbours = (similarity_matrix < 0).sum(axis=1)
print(f"Mean negative neighbours per user: {negative_neighbours.mean()}")

Mean negative neighbours per user: 27.762991644589363


In [10]:
# Similarity-weighted sum of the neighbours' centered times for every (user, track) pair.
# Result shape: observed users x observed tracks.
scores_matrix = similarity_matrix @ interactions.to_numpy()

# Re-attach the user / track labels of the interaction matrix.
scores = pd.DataFrame(
    scores_matrix,
    index=interactions.index,
    columns=interactions.columns,
)

# Spot-check a handful of track columns.
track_sample = [0, 1, 2, 5, 6]
scores[track_sample].head(50)

track,0,1,2,5,6
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0


## Глянем на рекомендации

In [11]:
# Directory that holds tracks.json and receives recommendations_ub.json.
# Override it with the BOTIFY_DATA_DIR environment variable; the default is the
# original author's local checkout.
# os.path.join(..., "") guarantees a trailing separator, because file paths are
# built from this value by plain string concatenation.
BOTIFY_DATA_DIR = os.path.join(
    os.environ.get(
        "BOTIFY_DATA_DIR",
        "/Users/o.saprykin/VK/Препод/ITMO/recsys-course-spring-2025/botify/data/",
    ),
    "",
)

In [12]:
# Track catalogue (JSON lines), indexed by track id so it can be joined to score rows.
tracks_path = BOTIFY_DATA_DIR + "tracks.json"
products = pd.read_json(tracks_path, lines=True).set_index("track")

products.head()

Unnamed: 0_level_0,artist,album,title,genre,pop,duration
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
41164,Михаил Бублик,ART-Обстрел I-часть,Сорок тысяч верст,"[1, 47]",-0.500252,282
27544,Xamdam Sobirov,Baxtli Bo'lolmadik,Baxtli Bo'lolmadik,[1],-0.942953,205
34702,Сергей Какенов,Ишимская шпана,Крутые лагеря,[147],-0.801382,252
45907,Loc-Dog,Electrodog 2,Еду убивать,[17],-0.577525,276
14978,Gafur,Февраль,Февраль,[1],-0.738636,160


In [13]:
# How many top-scored tracks to inspect for the sampled user.
k = 30
# Pick one user at random (unseeded, so a different user on every run).
user = np.random.choice(scores.index)

# Raw log rows of that user.
data.loc[data["user"] == user]

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,rnd,normalized_time
5198,next,2025-03-05 15:37:14.161,9207,12859,1.0,0.000346,150.0,{'STICKY_ARTIST': 'T7'},0.43432,0.622609
5202,next,2025-03-05 15:37:14.164,9207,150,0.79,0.000228,150.0,{'STICKY_ARTIST': 'T7'},0.229415,0.412609
5228,next,2025-03-05 15:37:14.184,9207,10441,0.17,0.000383,22315.0,{'STICKY_ARTIST': 'T7'},0.650997,-0.207391
23892,next,2025-03-05 15:37:32.229,9207,22606,0.8,0.000262,22599.0,{'STICKY_ARTIST': 'T7'},0.245738,0.422609
23900,next,2025-03-05 15:37:32.235,9207,22597,0.51,0.000344,22604.0,{'STICKY_ARTIST': 'T7'},0.771179,0.132609
23908,next,2025-03-05 15:37:32.244,9207,22611,0.33,0.00059,22594.0,{'STICKY_ARTIST': 'T7'},0.87644,-0.047391
23912,last,2025-03-05 15:37:32.249,9207,22594,0.26,5.1e-05,,{'STICKY_ARTIST': 'T7'},0.49081,-0.117391
70045,next,2025-03-05 15:38:42.047,9207,17786,1.0,0.000234,28793.0,{'STICKY_ARTIST': 'T7'},0.575297,0.622609
70052,next,2025-03-05 15:38:42.057,9207,30005,0.63,0.000456,6108.0,{'STICKY_ARTIST': 'T7'},0.49018,0.252609
70056,next,2025-03-05 15:38:42.064,9207,6108,0.51,0.000701,30361.0,{'STICKY_ARTIST': 'T7'},0.640908,0.132609


In [14]:
# The k best-scored tracks of the sampled user, highest score first.
top_scores = scores.loc[user].sort_values(ascending=False).head(k).to_frame("score")

# Attach catalogue metadata; the inner join drops tracks that are missing from the catalogue.
user_scores = top_scores.merge(
    products,
    how="inner",
    left_index=True,
    right_index=True,
)

user_scores

Unnamed: 0_level_0,score,artist,album,title,genre,pop,duration
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
22606,0.291832,детские песни,Детские Песни И Песни Для Детей,Спят Усталые Игрушки,[1],-0.517865,124
22600,0.237202,детские песни,Детские Песни И Песни Для Детей,Когда Мои Друзья Со Мной,[1],-0.536678,206
17786,0.200242,Ramil',Katana,Морфий,[1],-0.793441,166
12859,0.157832,Ramil',Увидимся,Увидимся,[1],-0.271994,108
28795,0.113311,Ramil',"Всё, что есть у меня - это голод","Алё, родной",[10],-0.815097,149
23970,0.110813,Janob Rasul,Sog’indingmi,Sog'indingmi,[1],-0.261945,213
28875,0.105753,Zivert,Beverly Hills,Beverly Hills,[1],2.819097,219
22604,0.10493,детские песни,Детские Песни И Песни Для Детей,Чунга / Чанга,[1],-0.584876,90
22611,0.096275,детские песни,Детские Песни И Песни Для Детей,В Траве Сидел Кузнечик,[1],-0.520589,94
22596,0.09583,детские песни,Детские Песни И Песни Для Детей,Колыбельная Медведицы,[1],-0.629627,152


In [15]:
# The sampled user's row of the interaction matrix, highest value first.
# The "time" column holds the centered (normalized) values, not raw listening time.
listened = interactions.loc[user].sort_values(ascending=False).to_frame("time")

# Attach catalogue metadata; the inner join drops tracks that are missing from the catalogue.
user_interactions = listened.merge(
    products,
    how="inner",
    left_index=True,
    right_index=True,
)

# Keep only non-zero cells. Plays exactly equal to the user's mean are hidden too.
user_interactions.loc[user_interactions["time"] != 0]

Unnamed: 0_level_0,time,artist,album,title,genre,pop,duration
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12859,0.622609,Ramil',Увидимся,Увидимся,[1],-0.271994,108
31214,0.622609,Zivert,Life (Remix Collection),Life (Shnaps & Jay Filler Remix),[1],0.3226,162
17786,0.622609,Ramil',Katana,Морфий,[1],-0.793441,166
22606,0.422609,детские песни,Детские Песни И Песни Для Детей,Спят Усталые Игрушки,[1],-0.517865,124
22599,0.262609,детские песни,Детские Песни И Песни Для Детей,Песенка Крокодила Гены,[1],-0.930861,84
30978,0.132609,Ramil',Пускай по венам соль,Пускай по венам соль,[1],-0.772885,184
22597,0.132609,детские песни,Детские Песни И Песни Для Детей,Улыбка,[1],-0.618506,95
22600,0.132609,детские песни,Детские Песни И Песни Для Детей,Когда Мои Друзья Со Мной,[1],-0.536678,206
28793,0.127609,Ramil',"Всё, что есть у меня - это голод",Не был,[10],-0.963272,181
22610,0.122609,детские песни,Детские Песни И Песни Для Детей,Облака / Белогривые Лошадки,[1],-0.588928,124


## Подготавливаем рекомендации для продакшена

In [16]:
def recommend(user_id, scores, k):
    """Return the ids of the ``k`` highest-scored tracks for one user.

    Args:
        user_id: Row label of the user in ``scores``.
        scores: DataFrame with one row per user and one column per track.
        k: Maximum number of track ids to return.

    Returns:
        list: Column labels of ``scores`` ordered by descending score, at most
        ``k`` long. Tracks with equal scores keep the column order of ``scores``.

    Raises:
        KeyError: If ``user_id`` is not present in ``scores.index``.

    Note:
        No filtering is applied: tracks the user has already played and tracks
        with zero or negative scores can be returned.
    """
    # A stable sort makes the order of tied scores reproducible; the default
    # quicksort leaves it unspecified, which matters for users whose scores are all equal.
    ranked = scores.loc[user_id].sort_values(ascending=False, kind="stable")
    # .iloc states the positional "first k" intent explicitly, independent of the index dtype.
    return ranked.iloc[:k].index.tolist()

In [17]:
# Number of tracks exported per user.
TOP_K = 100

users = data["user"].unique()

# Output format: one JSON object per line, {"user": <id>, "tracks": [<track ids>]}.
with open(BOTIFY_DATA_DIR + "recommendations_ub.json", "w", encoding="utf-8") as rf:
    # The loop variable is deliberately not named `user`: that name holds the user
    # sampled in the exploration cell, and overwriting it would silently change
    # what those cells show when re-run.
    for user_id in tqdm.tqdm(users):
        recommendation = {
            # Cast to a built-in int so json can serialize the id.
            "user": int(user_id),
            "tracks": recommend(user_id, scores, TOP_K)
        }
        rf.write(json.dumps(recommendation) + "\n")

100%|██████████████████████████████████████| 9814/9814 [00:12<00:00, 793.48it/s]
