In [1]:
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Directory (relative to the working directory) that every CSV below is read from and written to.
data_path = Path('./outputs')

In [3]:
# Load track_info.csv, using the file's first column as the DataFrame index.
track_info = pd.read_csv(data_path/'track_info.csv',index_col=0)

In [4]:
# Preview the first rows of track_info.
track_info.head()

Unnamed: 0,TRACK_ID,noDownloads,noPurchases,noLikes
0,2828445,92.0,71.0,8.0
1,2828446,92.0,63.0,13.0
2,2828447,92.0,58.0,7.0
3,2828448,92.0,56.0,8.0
4,2828449,92.0,62.0,6.0


In [5]:
# Build the (user, track, score) table for likes: every row gets a score of 3.
# Only the two columns that are actually used are parsed (usecols), so the
# other columns of this large file are never loaded into memory.
track_like_df = (
    pd.read_csv(data_path / 'track_like_expanded.csv', usecols=['USER_ID', 'TRACK_ID'])
    .loc[:, ['USER_ID', 'TRACK_ID']]  # usecols keeps file order; pin the column order explicitly
    .rename(columns={'USER_ID': 'user', 'TRACK_ID': 'track'})
    .assign(score=3)
)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Build the (user, track, score) table for downloads: every row gets a score of 1.
# Only the two columns that are actually used are parsed (usecols), so the
# other columns of this large file are never loaded into memory.
track_download_df = (
    pd.read_csv(data_path / 'track_download_expanded.csv', usecols=['USER_ID', 'TRACK_ID'])
    .loc[:, ['USER_ID', 'TRACK_ID']]  # usecols keeps file order; pin the column order explicitly
    .rename(columns={'USER_ID': 'user', 'TRACK_ID': 'track'})
    .assign(score=1)
)

In [7]:
# Build the (user, track, score) table for purchases: every row gets a score of 2.
# Only the two columns that are actually used are parsed (usecols), so the
# other columns of this large file are never loaded into memory.
track_purchase_df = (
    pd.read_csv(data_path / 'track_purchase_expanded.csv', usecols=['USER_ID', 'TRACK_ID'])
    .loc[:, ['USER_ID', 'TRACK_ID']]  # usecols keeps file order; pin the column order explicitly
    .rename(columns={'USER_ID': 'user', 'TRACK_ID': 'track'})
    .assign(score=2)
)

In [8]:
# Stack the download, like and purchase tables, then collapse them to one row
# per (user, track) pair whose score is the sum of that pair's individual scores.
total = (
    pd.concat([track_download_df, track_like_df, track_purchase_df])
    .groupby(['user', 'track'])['score']
    .sum()
    .reset_index()
)
total.shape

(38402540, 3)

In [9]:
# Load the tag table and the v2 track-info table from data_path.
track_tag, track_info_v2 = (
    pd.read_csv(data_path / csv_name)
    for csv_name in ('track_tag_df.csv', 'track_infov2_df.csv')
)

In [10]:
# Keep only interactions whose track appears in BOTH the tag table and the
# v2 track-info table.
total = total.loc[
    total['track'].isin(track_tag['TRACK_ID'].unique())
    & total['track'].isin(track_info_v2['TRACK_ID'].unique())
]
total.shape

(38288211, 3)

In [11]:
# Summary statistics of the filtered interaction table.
total.describe()

Unnamed: 0,user,track,score
count,38288210.0,38288210.0,38288210.0
mean,162200100.0,160675400.0,1.901292
std,204855400.0,207159800.0,1.09386
min,1369.0,2828445.0,1.0
25%,8019590.0,6069897.0,1.0
50%,40383970.0,30864450.0,1.0
75%,344816700.0,395375400.0,3.0
max,557797400.0,557606800.0,6.0


In [12]:
# Number of most-interacted-with tracks to keep in the sample.
no_tracks = 1000

# Total interactions per track = downloads + purchases + likes.
# Plain column addition is vectorised (no per-row Python lambda) and, like the
# row-wise sum it replaces, yields NaN when any of the three counts is missing.
track_info['total_interactions'] = (
    track_info['noDownloads'] + track_info['noPurchases'] + track_info['noLikes']
)

# Rank tracks by total interactions and take the ids of the top `no_tracks`.
track_info = track_info.sort_values('total_interactions', ascending=False)
target_tracks = track_info[:no_tracks]['TRACK_ID'].values

# Restrict the interaction table to those tracks.
total_sample = total.loc[total['track'].isin(target_tracks)]
print(total_sample.shape)
total_sample.head()

(15398563, 3)


Unnamed: 0,user,track,score
0,1369,555189215.0,2
1,1369,555330991.0,2
2,1369,555361362.0,2
3,1369,555361430.0,2
4,1369,555361484.0,2


In [13]:
# Order rows by user id (descending). Counter keeps first-seen order and
# most_common() is a stable sort, so users with equal counts stay in this order.
total_sample = total_sample.sort_values('user', ascending=False)

# Map each user to the number of rows they have in total_sample,
# ordered from the highest count to the lowest.
counter = Counter(total_sample['user'])
target_users = dict(counter.most_common())

# NOTE: this displays the entire mapping, which is very long for large samples.
target_users

{9310301: 963,
 10124886: 956,
 3605896: 951,
 3568922: 945,
 7033939: 925,
 3602718: 890,
 3613952: 888,
 3582874: 875,
 3581842: 864,
 6617914: 861,
 56382804: 859,
 3607134: 859,
 26681626: 857,
 3568924: 854,
 3608228: 847,
 10380769: 844,
 3605890: 841,
 3568920: 839,
 9955306: 834,
 8430962: 832,
 6448584: 829,
 7942848: 828,
 3605322: 828,
 3582502: 825,
 49583601: 823,
 10076893: 822,
 222759474: 817,
 9104160: 816,
 3593458: 815,
 46923553: 813,
 41270036: 813,
 7326680: 813,
 36181264: 812,
 7540915: 812,
 7480484: 811,
 3606906: 810,
 16825644: 809,
 3603424: 809,
 68849385: 807,
 3603244: 806,
 7177335: 805,
 10022326: 803,
 3614950: 802,
 9157114: 801,
 3614756: 801,
 10646396: 799,
 27731738: 796,
 3613202: 794,
 6876705: 793,
 3594456: 793,
 7650542: 789,
 3594846: 789,
 9683437: 787,
 41241995: 782,
 3602324: 782,
 34711967: 780,
 239010771: 778,
 6463418: 778,
 7163701: 776,
 6192635: 775,
 11854549: 773,
 7359164: 773,
 3568998: 773,
 42417003: 770,
 3604126: 770,
 30

In [16]:
# Minimum number of rows a user needs in total_sample to be counted here
# (using no_tracks as the cut-off instead is left disabled).
threshold = 750
values = np.array([row_count for row_count in target_users.values()])
# Shape of the sub-array of counts that meet the threshold, i.e. how many users qualify.
values[np.greater_equal(values, threshold)].shape

(85,)

In [17]:
# Number of users whose count meets `threshold`. Derived from `values` rather
# than hard-coded, so it stays correct when the threshold or the data changes.
no_users = int((values >= threshold).sum())
# target_users is ordered by count (descending), so its first `no_users` keys
# are exactly the users that meet the threshold.
top_n_users = list(target_users.keys())[:no_users]

In [18]:
gp = total_sample.groupby('user')
# Gather each selected user's rows (in top_n_users order) and concatenate once.
# Concatenating inside a loop re-copies the accumulated frame on every
# iteration; a single concat over the list does one copy.
# An empty selection yields an empty DataFrame, since pd.concat rejects an empty list.
sample = (
    pd.concat([gp.get_group(user) for user in top_n_users])
    if top_n_users
    else pd.DataFrame()
)
sample.shape

(68700, 3)

In [19]:
# Number of distinct tracks present in the sample.
sample['track'].nunique()

1000

In [20]:
# How many sample rows carry each score value.
sample['score'].value_counts()

3    32469
1    31801
6     2385
4     2040
2        5
Name: score, dtype: int64

In [21]:
# Summary statistics of the sample.
sample.describe()

Unnamed: 0,user,track,score
count,68700.0,68700.0,68700.0
mean,23147860.0,134004500.0,2.207977
std,50665800.0,183931800.0,1.251607
min,3568920.0,2832434.0,1.0
25%,3605890.0,6652776.0,1.0
50%,7177335.0,26580130.0,3.0
75%,11854550.0,249704800.0,3.0
max,339744900.0,555361700.0,6.0


In [22]:
# Write the sample to total_sample.csv under data_path. With to_csv's defaults
# the DataFrame index is written as the first (unnamed) column.
sample.to_csv(data_path/'total_sample.csv')