In [84]:
import os
import re
import json
import collections
import heapq
import random

random.seed(1)

In [2]:
def load_testing_data(file):
    with open(file) as f:
        data = json.load(f)
    return data

In [62]:
K = 3

In [61]:
testing_data = load_testing_data("dataset/spotify_challenge/challenge_set.json")

In [60]:
def normalize_name(name):
    name = name.lower()
    name = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@?\[\]]", " ", name)
    name = re.sub(r"\s+", "", name).strip()
    return name

In [63]:
for x in testing_data["playlists"]:
    if "name" in x:
        x["name"] = normalize_name(x["name"])

In [64]:
task_to_playlists = {}
pid_to_playlist = {}
playlist_to_num_samples = {}
pid_to_set_tracks = {}
name_to_pids = {}
for p in testing_data["playlists"]:
    
    if "name" in p:
        if not name_to_pids.get(p["name"]):
            name_to_pids[p["name"]] = []
        name_to_pids[p["name"]].append(p["pid"])
    
    pid_to_playlist[p["pid"]] = p
    playlist_to_num_samples[p["pid"]] = p["num_samples"]
    pid_to_set_tracks[p["pid"]] = set([t["track_uri"] for t in p["tracks"]])
    for t in p["tracks"]:
        if not task_to_playlists.get(t["track_uri"]):
            task_to_playlists[t["track_uri"]] = []
        task_to_playlists[t["track_uri"]].append(p["pid"])

In [67]:
def process_mpd(path):
    filenames = os.listdir(path)
    count = 0
    for filename in sorted(filenames):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            with open(fullpath) as f:
                js = f.read()
                mpd_slice = json.loads(js)
                for playlist in mpd_slice["playlists"]:
                    count += 1
                    if count % 1000 == 0:
                        print(count)
                    process_playlist(playlist)


In [68]:
def get_ids_as_set(p):
    return set([t["track_uri"] for t in p["tracks"]])

def calc_dist(ids1, ids2):
    count_common = len(ids1.intersection(ids2))
    return count_common / (len(ids1) + len(ids2) - count_common)

In [69]:
def flatten(t):
    return [item for sublist in t for item in sublist]

In [70]:
knn = {p["pid"]: [] for p in testing_data["playlists"]}
pot_recs_count = collections.Counter()

In [71]:
from dataclasses import dataclass, field

@dataclass(order=True)
class Neighbour(dict):
    dist: float
    pid: str=field(compare=False)
    unique: int=field(compare=False)
        
    def __init__(self, dist, pid, unique):
        self.dist = dist
        self.pid = pid
        self.unique = unique
        super().__init__(self, dist=dist, pid=pid, unique=unique)

In [72]:
def dist_nums(x, y, common):
    return common / (x + y - common)

def process_playlist(playlist):
    nn_playlists = flatten(
        [task_to_playlists.get(t["track_uri"], []) for t in playlist["tracks"]]
    ) + name_to_pids.get(playlist["name"], [])
    aggr = {}
    for test_p_id in nn_playlists:
        aggr[test_p_id] = aggr.get(test_p_id, 0) + 1
    
    p_num_tracks = playlist["num_tracks"]
    for pid, count in aggr.items():
        dist = dist_nums(playlist_to_num_samples[pid], p_num_tracks, count)
        ns = knn[pid]
        unique_count = p_num_tracks - count
        if pot_recs_count.get(pid, 0) < K * 500:
            heapq.heappush(ns, Neighbour(dist, playlist["pid"], unique_count))
            pot_recs_count[pid] += unique_count
        elif ns[0].dist < dist:
            pot_recs_count[pid] += unique_count - ns[0].unique
            heapq.heappop(ns)
            heapq.heappush(ns, Neighbour(dist, playlist["pid"], unique_count))

In [73]:
process_mpd("dataset/spotify/data")

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000
15

In [74]:
print(len(pot_recs_count))
for x, count in pot_recs_count.most_common(10000):
    print(count)

9734
5617
4744
4100
3983
3954
3667
3590
3545
3407
3382
3331
3327
3321
3299
3186
3169
3150
3125
3116
3106
3089
3078
3076
3066
3062
3037
3036
3014
2990
2978
2977
2976
2970
2963
2953
2946
2944
2941
2938
2926
2924
2910
2910
2902
2882
2877
2877
2868
2868
2852
2851
2849
2848
2847
2846
2844
2839
2832
2826
2822
2816
2814
2812
2810
2805
2803
2803
2801
2797
2796
2793
2793
2790
2771
2770
2768
2766
2765
2753
2753
2752
2748
2748
2738
2735
2720
2710
2702
2698
2694
2687
2686
2686
2685
2684
2684
2684
2673
2667
2665
2658
2652
2650
2649
2647
2641
2641
2641
2640
2639
2638
2637
2633
2631
2631
2626
2622
2620
2620
2614
2613
2612
2610
2609
2608
2605
2603
2601
2601
2596
2595
2595
2593
2592
2588
2578
2576
2575
2574
2572
2571
2569
2567
2563
2563
2559
2556
2555
2550
2546
2545
2541
2540
2540
2537
2537
2532
2531
2530
2529
2528
2524
2523
2523
2521
2514
2509
2509
2507
2505
2504
2504
2501
2498
2497
2497
2493
2493
2487
2487
2484
2482
2481
2481
2480
2479
2479
2479
2477
2476
2475
2475
2469
2468
2467
2466
2466
2466
2466


1557
1557
1557
1557
1557
1557
1557
1557
1557
1557
1557
1557
1557
1557
1557
1557
1557
1557
1557
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1556
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1555
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1554
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1553
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552


1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1503
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502
1502


In [75]:
with open("knn.json", "w") as f:
    json.dump(knn, f)

In [76]:
neighbour_to_playlist = {}
for pid, neighbours in knn.items():
    for n in neighbours:
        if not neighbour_to_playlist.get(n.pid):
            neighbour_to_playlist[n.pid] = []
        neighbour_to_playlist[n.pid].append(pid)

In [77]:
len(neighbour_to_playlist)

275898

In [78]:
def extract_tracks_from_training_data(path):
    filenames = os.listdir(path)
    count = 0
    for filename in sorted(filenames):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            with open(fullpath) as f:
                js = f.read()
                mpd_slice = json.loads(js)
                for playlist in mpd_slice["playlists"]:
                    count += 1
                    if count % 1000 == 0:
                        print(count)
                    extract_tracks(playlist)

In [79]:
recommendations = {}
for pid in knn:
    recommendations[pid] = []

In [80]:
def extract_tracks(playlist):
    track_uris = set([t["track_uri"] for t in playlist["tracks"]])
    for test_pid in neighbour_to_playlist.get(playlist["pid"], []):
        not_common = track_uris - pid_to_set_tracks[test_pid]
        recommendations[test_pid].extend(not_common)

In [81]:
extract_tracks_from_training_data("dataset/spotify/data")

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000
15

In [82]:
with open("recommendations.json", "w") as f:
    json.dump(recommendations, f)

In [106]:
with open("recommendations.json", "r") as f:
    recommendations = json.load(f)

In [116]:
recs_compressed = {}

enough = 0

for pid, recs in recommendations.items():
    counts = collections.Counter()
    for r in recs:
        counts[r] += 1
#     print(len(counts))
    recs_compressed[pid] = [t for t, _ in counts.most_common(500)]
    if len(recs_compressed[pid]) == 500:
        enough += 1

print('Enough: ', enough)

Enough:  9169


In [110]:
def most_popular_tracks(path):
    counter = collections.Counter()
    
    with open(path, "r") as f:
        pls = json.load(f)
        
    for p in pls["playlists"]:
        for t in p["tracks"]:
            counter[f"spotify:track:{t}"] += 1

    return counter
                        
popular_tracks = most_popular_tracks("dataset/simplified.json")

In [117]:
# fill the holes in the playlists

pops = popular_tracks.most_common(1000)

for pid, recs in recs_compressed.items():
    if len(recs) != 500:
        seeds = pid_to_set_tracks[int(pid)]
        for x, _ in pops:
            if len(recs) == 500:
                break
            if not x in recs and not x in seeds:
                recs.append(x)

In [118]:
with open("knn_submission.csv", "w") as f:
    f.write("team_info,pepilipep,p.angelov99@gmail.com\n")
    for x, y in recs_compressed.items():
        recs_string = ",".join(y)
        f.write(f"{x},{recs_string}\n")