In [1]:
import pickle
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import torch
import tqdm
from matplotlib import pyplot as plt
from optuna.exceptions import ExperimentalWarning
# Silence two warning categories for the whole notebook session: generic
# UserWarning and optuna's ExperimentalWarning. No message or module pattern
# is involved, so the simple (category-only) filter form is sufficient.
warnings.simplefilter("ignore", category=UserWarning)
warnings.simplefilter("ignore", category=ExperimentalWarning)

## Data reading

### clickstream

Log of listening events per user and track (event type, action, play duration, date).

In [25]:
# Directory holding the raw CSV exports, resolved under the current user's
# home directory. `Path` is imported in the notebook's top import cell.
data_path = Path.home() / 'data/sberzvuk/ailab_data/'

# `nrows` caps the read at the first 1_000_000 rows of the file.
clickstream = pd.read_csv(data_path / 'clickstream.csv', nrows=1_000_000)
clickstream.shape

(1000000, 6)

In [26]:
# Show the column names of the clickstream frame.
clickstream.columns

Index(['user_id', 'track_id', 'event_nm', 'action_nm',
       'playevent_play_duration', 'proc_dt'],
      dtype='object')

In [12]:
# Peek at five random rows. A fixed `random_state` makes the same rows come
# back on every run, so the displayed sample is reproducible.
clickstream.sample(n=5, random_state=0)

Unnamed: 0,user_id,track_id,event_nm,action_nm,playevent_play_duration,proc_dt
134240,b2915447835a49d228f59558ef5747f9,62845175,playevent,,209,2022-05-03
73373,04cc3494d5371fba027b9e7866c2e269,116848845,playevent,,3,2022-05-02
29680,c6c35633e66b7b8731adcf751317e11b,60332069,playevent,,17,2022-05-04
925696,d3004b7054c90ff574d8acb80b479a14,108506423,playevent,,8,2022-05-01
728742,112c633b1cd81f4d628f17d8abb7f0b7,88836238,playevent,,126,2022-04-29


In [13]:
# Shorten two verbose source column names. The non-mutating form of `rename`
# is used and the result is bound back to `clickstream`, so later cells keep
# working with the same name while the frame itself is not modified in place.
clickstream = clickstream.rename(
    columns={'playevent_play_duration': 'dur_sec', 'proc_dt': 'event_dt'}
)

In [15]:
# Summary statistics for the `dur_sec` column.
# NOTE(review): the saved output shows a minimum of -1 — presumably a sentinel
# for a missing duration; confirm before aggregating on this column.
clickstream['dur_sec'].describe()

count    1000000.000000
mean         100.188872
std          103.286989
min           -1.000000
25%            3.000000
50%           71.000000
75%          187.000000
max         7365.000000
Name: dur_sec, dtype: float64

In [20]:
# List the distinct `event_nm` values present in the loaded rows.
clickstream['event_nm'].unique()

array(['playevent', 'content_action_event'], dtype=object)

### user-track-score

A numeric score for each listed (user, track) pair.

In [17]:
# Read the first 1_000_000 rows of the score export, then report its dimensions.
user_track_score = pd.read_csv(
    data_path / 'user-track-score.csv',
    nrows=1_000_000,
)
user_track_score.shape

(1000000, 3)

In [27]:
# Show the column names of the user-track score frame.
user_track_score.columns

Index(['user_id', 'track_id', 'score'], dtype='object')

In [18]:
# Peek at five random rows. A fixed `random_state` makes the same rows come
# back on every run, so the displayed sample is reproducible.
user_track_score.sample(n=5, random_state=0)

Unnamed: 0,user_id,track_id,score
649559,d507dfe5e9cff144bf03b50ae4fa86d1,66902640,0.10357
542274,cf701af6bab247f9f5698adc81e9827e,54460671,0.060316
513698,dff8ec266fd40af995b3cb77e0114d10,33953267,0.141988
122364,47fe373732d6b689f3c8922d37496519,78181826,-0.016377
622782,c16739cd73d7aab8315c52a4099cef4e,91200837,0.509125


In [21]:
# Summary statistics for the `score` column.
user_track_score['score'].describe()

count    1000000.000000
mean           0.305944
std            0.241200
min           -0.371067
25%            0.116889
50%            0.254101
75%            0.447083
max            1.524466
Name: score, dtype: float64

### track-content-embedding

Per-track content embeddings: tag columns (e.g. `rock`, `pop`) followed by `vggish_*` columns.

In [30]:
# This export is pipe-delimited. No `nrows` cap is applied here, so the whole
# file is loaded.
track_content_embedding = pd.read_csv(
    data_path / 'track-content-embedding.csv',
    delimiter='|',
)
track_content_embedding.shape

(929571, 179)

In [33]:
# Show the column names of the track embedding frame.
track_content_embedding.columns

Index(['track_id', 'rock', 'pop', 'alternative', 'indie', 'electronic',
       'female vocalists', 'dance', '00s', 'alternative rock',
       ...
       'vggish_119', 'vggish_120', 'vggish_121', 'vggish_122', 'vggish_123',
       'vggish_124', 'vggish_125', 'vggish_126', 'vggish_127', 'vggish_128'],
      dtype='object', length=179)

In [34]:
# Peek at five random rows. A fixed `random_state` makes the same rows come
# back on every run, so the displayed sample is reproducible.
track_content_embedding.sample(n=5, random_state=0)

Unnamed: 0,track_id,rock,pop,alternative,indie,electronic,female vocalists,dance,00s,alternative rock,...,vggish_119,vggish_120,vggish_121,vggish_122,vggish_123,vggish_124,vggish_125,vggish_126,vggish_127,vggish_128
536969,66005393,0.071554,0.306421,0.048542,0.0541,0.062061,0.325613,0.040742,0.021728,0.011157,...,0.543726,-0.20868,-0.185012,-0.134909,-0.438505,0.124369,0.138185,-0.637147,-0.017909,-0.003953
378883,45460100,0.036997,0.086162,0.024377,0.037385,0.42634,0.043818,0.300892,0.014645,0.006188,...,-0.018932,-0.024307,-0.110146,-0.035656,-0.343013,-0.05993,-0.018535,-0.596535,0.199394,-0.041864
167590,40281676,0.047446,0.318348,0.03744,0.046209,0.241585,0.165512,0.303018,0.021383,0.007285,...,0.365033,-0.148051,-0.103554,0.055882,-0.386947,0.043183,-0.092979,-0.699584,0.015895,-0.034829
536080,22485165,0.046817,0.218019,0.019299,0.020978,0.217206,0.088055,0.480829,0.011503,0.005262,...,0.111647,0.004004,-0.062023,0.046582,-0.394918,-0.043548,-0.041702,-0.630187,0.071891,-0.052904
641948,44134329,0.440288,0.061867,0.152635,0.115335,0.02061,0.040031,0.009992,0.025221,0.122237,...,0.58913,-0.188643,-0.302071,-0.186909,-0.365565,0.18842,-0.189164,-0.624328,0.219976,0.321445


### user-genre-embedding

Per-user genre embeddings: one `genre_<k>_cnt` column per genre.

In [23]:
# No `nrows` cap is applied here, so the whole file is loaded.
user_genre_embedding = pd.read_csv(data_path / 'user-genre-embedding.csv')
user_genre_embedding.shape

(2662604, 66)

In [36]:
# Show the column names of the user genre embedding frame.
user_genre_embedding.columns

Index(['user_id', 'genre_1_cnt', 'genre_2_cnt', 'genre_3_cnt', 'genre_4_cnt',
       'genre_5_cnt', 'genre_6_cnt', 'genre_7_cnt', 'genre_8_cnt',
       'genre_9_cnt', 'genre_10_cnt', 'genre_11_cnt', 'genre_12_cnt',
       'genre_13_cnt', 'genre_14_cnt', 'genre_15_cnt', 'genre_16_cnt',
       'genre_17_cnt', 'genre_18_cnt', 'genre_19_cnt', 'genre_20_cnt',
       'genre_21_cnt', 'genre_22_cnt', 'genre_23_cnt', 'genre_24_cnt',
       'genre_25_cnt', 'genre_26_cnt', 'genre_27_cnt', 'genre_28_cnt',
       'genre_29_cnt', 'genre_30_cnt', 'genre_31_cnt', 'genre_32_cnt',
       'genre_33_cnt', 'genre_34_cnt', 'genre_35_cnt', 'genre_36_cnt',
       'genre_37_cnt', 'genre_38_cnt', 'genre_39_cnt', 'genre_40_cnt',
       'genre_41_cnt', 'genre_42_cnt', 'genre_43_cnt', 'genre_44_cnt',
       'genre_45_cnt', 'genre_46_cnt', 'genre_47_cnt', 'genre_48_cnt',
       'genre_49_cnt', 'genre_50_cnt', 'genre_51_cnt', 'genre_52_cnt',
       'genre_53_cnt', 'genre_54_cnt', 'genre_55_cnt', 'genre_56_cnt',
    

In [35]:
# Peek at five random rows. A fixed `random_state` makes the same rows come
# back on every run, so the displayed sample is reproducible.
user_genre_embedding.sample(n=5, random_state=0)

Unnamed: 0,user_id,genre_1_cnt,genre_2_cnt,genre_3_cnt,genre_4_cnt,genre_5_cnt,genre_6_cnt,genre_7_cnt,genre_8_cnt,genre_9_cnt,...,genre_56_cnt,genre_57_cnt,genre_58_cnt,genre_59_cnt,genre_60_cnt,genre_61_cnt,genre_62_cnt,genre_63_cnt,genre_64_cnt,genre_65_cnt
1269919,6ff52026460cb8de54751f5f38e4832c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
910296,cd2990c897e43d268bcb8437a9b6beb0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1780709,56346f101ab0d7d0e14d0f43e7d66a22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
727129,0b91f9b2981d01c5631a6be05788e3cd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
718565,20ce2166300bc09ef9e2646f858a291f,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Read the first 1_000_000 rows of the export, then report its dimensions.
ailab = pd.read_csv(
    data_path / 'ailab.csv',
    nrows=1_000_000,
)
ailab.shape

(1000000, 7)

In [43]:
# Show the column names of the ailab frame.
ailab.columns

Index(['user_id', 'packet_date', 'session_id', 'track_id', 'src_id',
       'event_name', 'playevent_play_duration'],
      dtype='object')

In [44]:
# Peek at five random rows. A fixed `random_state` makes the same rows come
# back on every run, so the displayed sample is reproducible.
ailab.sample(n=5, random_state=0)

Unnamed: 0,user_id,packet_date,session_id,track_id,src_id,event_name,playevent_play_duration
280019,219F86D05C76364E1B212E85F66A4F0C,2022-04-12 14:38:35,2109757323,73253778,73253778,playevent,205
982791,C071C458AD9CB8B4235C031E77281A62,2022-04-17 22:34:56,-253572135,119584455,6498967,playevent,0
598716,AF3A7A766BFDECC636E6545D424421E8,2022-04-20 00:17:42,736824604,73226141,1062105,playevent,191
28432,5B91A90D5C2BFD6029159F2276A524C7,2022-04-26 22:07:23,-1581502475,114283196,1062105,playevent,1
992106,BA21D52CF37A902024E12782C4A87406,2022-04-27 06:03:33,-2058555095,87962394,6936835,playevent,190
