# RecSys Competition: Data screening
## 2017/18
### Author: Moritz Meister
---------------------------

In [1]:
import pandas as pd
import numpy as np

### Import data:

train_final.csv - the training set of interactions

In [2]:
# Read the tab-separated playlist/track interaction pairs, then show the
# first rows followed by the (rows, columns) shape.
train_final = pd.read_csv('../input/train_final.csv', sep='\t')
print(train_final.head(), train_final.shape, sep='\n')

   playlist_id  track_id
0      3271849   2801526
1      5616275    727878
2     11267488   2805283
3     10103900   1515105
4      3836898   2945623
(1040522, 2)


tracks_final.csv - supplementary information about the items

In [3]:
# Read the tab-separated supplementary track information, then show the
# first rows followed by the (rows, columns) shape.
tracks_final = pd.read_csv('../input/tracks_final.csv', sep='\t')
print(tracks_final.head(), tracks_final.shape, sep='\n')

   track_id  artist_id  duration  playcount   album  \
0   2972914        144    224000       49.0     [7]   
1   2750239        246    157000        1.0     [8]   
2   1550729        144    217000      554.0     [9]   
3   2169950        144    207000      200.0     [9]   
4   1903709        144    198000        5.0  [None]   

                                     tags  
0     [54087, 1757, 1718, 116712, 189631]  
1   [189631, 3424, 177424, 46208, 205245]  
2   [54087, 109806, 46869, 183258, 54337]  
3  [54087, 70618, 207003, 109806, 116712]  
4   [54087, 81223, 116712, 215342, 71028]  
(100000, 6)


target_tracks.csv - the set of target items (tracks) to be recommended

In [4]:
# Read the tab-separated list of tracks that may be recommended, then show
# the first rows followed by the (rows, columns) shape.
target_tracks = pd.read_csv('../input/target_tracks.csv', sep='\t')
print(target_tracks.head(), target_tracks.shape, sep='\n')

   track_id
0   1316175
1   3885714
2   3091270
3    226759
4    230596
(32195, 1)


target_playlists.csv - the set of target playlists that will receive recommendations

In [5]:
# Read the tab-separated list of playlists that will receive
# recommendations, then show the first rows followed by the shape.
target_playlists = pd.read_csv('../input/target_playlists.csv', sep='\t')
print(target_playlists.head(), target_playlists.shape, sep='\n')

   playlist_id
0     10024884
1     10624787
2      4891851
3      4267369
4        65078
(10000, 1)


playlists_final.csv - supplementary information about the users

In [6]:
# Read the tab-separated supplementary playlist information, then show the
# first rows followed by the (rows, columns) shape.
playlists_final = pd.read_csv('../input/playlists_final.csv', sep='\t')
print(playlists_final.head(), playlists_final.shape, sep='\n')

   created_at  playlist_id                                             title  \
0  1216545588       644838                                           [12727]   
1  1249326867      7577564                                                []   
2  1257766688      3120683                                             [183]   
3  1248079275      4278112                [12389, 18698, 18925, 11695, 7117]   
4  1175201268      8656823  [12809, 2095, 13257, 12671, 20426, 14448, 18698]   

   numtracks  duration  owner  
0         27      6522  41504  
1          9      2650  41504  
2         16      3645  44542  
3         15      4151  44542  
4         84     18414  44542  
(57561, 6)


### Analysis of train_final:

Size of the DataFrame:

In [7]:
# (rows, columns) of the interaction table.
n_rows, n_cols = train_final.shape
print((n_rows, n_cols))

(1040522, 2)


Check for duplicates:

In [8]:
# Shape after dropping fully identical rows; if it equals the raw shape,
# no (playlist_id, track_id) pair occurs more than once.
deduplicated_shape = train_final.drop_duplicates().shape
print(deduplicated_shape)

(1040522, 2)


-> No duplicates. This also means that no playlist contains the same track twice.

Number of unique playlists:

In [9]:
# Number of distinct playlists, and the mean number of interaction rows
# per playlist (total rows divided by distinct playlists).
n_playlists = train_final['playlist_id'].nunique()
print(n_playlists)
print(len(train_final) / n_playlists)

45649
22.793971390391903


Each playlist contains 22.79 tracks on average.

Number of unique tracks in these playlists:

In [10]:
# Number of distinct tracks that occur in the interaction table.
n_tracks = train_final['track_id'].nunique()
print(n_tracks)

99999


That means a playlist/track frequency matrix would have 45,649 × 99,999 = 4,564,854,351 elements, of which only 1,040,522 (about 0.023%) are non-zero.

Count songs in each playlist:

In [11]:
# One row per playlist with the number of distinct tracks it contains.
pl_agg = (
    train_final
    .groupby('playlist_id')['track_id']
    .nunique()
    .reset_index(name='track_count')
)
# Summary statistics of the playlist sizes.
print(pl_agg['track_count'].describe())

count    45649.000000
mean        22.793971
std         32.141616
min          1.000000
25%          4.000000
50%          9.000000
75%         26.000000
max        199.000000
Name: track_count, dtype: float64


Find the distribution of playlist sizes (number of tracks per playlist):

In [12]:
# For every playlist size, count how many playlists have that size.
# value_counts() orders the result by frequency, most common size first.
plsize_dist = (
    pl_agg['track_count']
    .value_counts()
    .rename_axis('pl_size')
    .reset_index(name='number_pl')
)
print(plsize_dist)

     pl_size  number_pl
0          2       3551
1          1       3185
2          3       2974
3          5       2761
4          6       2472
5          4       2395
6          7       2224
7          8       2044
8          9       1940
9         10       1218
10        11       1089
11        12        988
12        13        885
13        14        801
14        15        767
15        16        669
16        17        607
17        18        529
18        19        499
19        20        479
20        21        456
21        22        400
22        23        386
23        25        362
24        24        339
25        26        321
26        28        313
27        30        286
28        27        285
29        29        280
..       ...        ...
167      171         13
168      166         12
169      177         12
170      172         11
171      164         11
172      170         10
173      167         10
174      162          9
175      180          8
176      183    

Find the most popular tracks in terms of number of playlists they are in:

In [17]:
# One row per track with the number of distinct playlists that contain it,
# ordered from the most to the least frequently used track.
tr_agg = (
    train_final
    .groupby('track_id')['playlist_id']
    .nunique()
    .reset_index(name='number_pl')
    .sort_values('number_pl', ascending=False)
)
# Top 20 tracks, then the overall (rows, columns) shape.
print(tr_agg.head(20))
print(tr_agg.shape)

       track_id  number_pl
38111   1563309        476
33725   1363985        432
94748   3705881        425
38860   1595978        403
82528   3166665        391
96651   3779477        390
5020     204966        387
73719   2863395        370
38517   1580480        349
28189   1156143        346
32541   1321053        339
5167     209196        330
17830    675104        324
97203   3796108        317
6847     276186        315
25754   1074579        307
31886   1286763        306
92927   3628787        306
58901   2339150        306
36483   1495432        304
(99999, 2)


### Analysis of target_tracks

Tracks to be recommended.

Check size of DataFrame and check for duplicates:

In [18]:
# Preview, raw shape, and shape after dropping identical rows; matching
# shapes mean the target track list has no duplicate rows.
print(
    target_tracks.head(),
    target_tracks.shape,
    target_tracks.drop_duplicates().shape,
    sep='\n',
)

   track_id
0   1316175
1   3885714
2   3091270
3    226759
4    230596
(32195, 1)
(32195, 1)


Check if all target tracks are in at least one playlist:

In [56]:
# Attach a constant marker column to every target track.
target_tracks = target_tracks.assign(dummy=1)
print(target_tracks.head())

   track_id  dummy
0   1316175      1
1   3885714      1
2   3091270      1
3    226759      1
4    230596      1


In [55]:
# Inner join keeps only the target tracks that also have a row in tr_agg,
# i.e. tracks that occur in at least one playlist of train_final.
tr_agg_target = target_tracks.merge(tr_agg, how='inner', on='track_id')
print(tr_agg_target.shape)

# The row count alone only hints that something is missing; list the target
# track ids without any training interaction so they can be handled
# explicitly instead of being inferred by comparing shapes by hand.
missing_target_tracks = target_tracks.loc[
    ~target_tracks['track_id'].isin(tr_agg['track_id']), 'track_id'
]
print(missing_target_tracks.tolist())

(32194, 3)


This means that one of the target tracks does not appear in any training playlist; it probably just needs to be removed.

### Analysis of target_playlists:

Check size of DataFrame and check for duplicates:

In [57]:
# Preview, raw shape, and shape after dropping identical rows; matching
# shapes mean the target playlist list has no duplicate rows.
print(
    target_playlists.head(),
    target_playlists.shape,
    target_playlists.drop_duplicates().shape,
    sep='\n',
)

   playlist_id
0     10024884
1     10624787
2      4891851
3      4267369
4        65078
(10000, 1)
(10000, 1)


In [61]:
# Attach a constant marker column to every target playlist.
target_playlists = target_playlists.assign(dummy=1)
print(target_playlists.head())

   playlist_id  dummy
0     10024884      1
1     10624787      1
2      4891851      1
3      4267369      1
4        65078      1


Check if target playlists are contained in the training playlists:

In [62]:
# Inner join keeps only the target playlists that have a row in pl_agg,
# i.e. playlists present in train_final, together with their track counts.
pl_agg_target = pd.merge(target_playlists, pl_agg, how='inner', on='playlist_id')
print(pl_agg_target.shape, pl_agg_target.head(), sep='\n')

(10000, 3)
   playlist_id  dummy  track_count
0     10024884      1           66
1     10624787      1           38
2      4891851      1           24
3      4267369      1           80
4        65078      1           91


All target playlists are contained in the training set.