In [1]:
from collections import defaultdict, Counter

import pandas as pd
from pathlib import Path

from utils.json import load_json
from utils.song_id import decode_song_id

In [2]:
# Read the full song list and the chosen-songs table from the dataset folder.
song_df, chosen_df = (
    pd.read_csv(csv_path)
    for csv_path in ('kpop-dataset/song_list.csv', 'kpop-dataset/csv/kpop_chosen.csv')
)

# Row counts of both tables, displayed as the cell result.
(len(song_df), len(chosen_df))

(17254, 14730)

In [3]:
# Keep only the songs that also appear in `chosen_df`, matched on the exact
# (year, title, artist) triple.
#
# The lookup set is built once so each song costs a single hash lookup instead
# of a full boolean scan of `chosen_df` per row. Rows of `chosen_df` with a
# missing key field are left out of the set: NaN never compares equal under
# `==`, so those rows could not match any song in an element-wise comparison.
chosen_keys = set(
    chosen_df[['Year', 'Song', 'Artist']]
    .dropna()
    .itertuples(index=False, name=None)
)

# Index labels of the songs whose key triple has no counterpart in `chosen_df`.
song_keys = zip(song_df['year'], song_df['title'], song_df['track_artist'])
idx_to_remove = [
    idx for idx, key in zip(song_df.index, song_keys) if key not in chosen_keys
]

song_df = song_df.drop(idx_to_remove)

In [4]:
# Tally how many of the remaining songs carry each value of the `label` column.
counter = Counter()
counter.update(song_df['label'])
counter

Counter({'SM Entertainment': 3012,
         'JYP Entertainment': 1488,
         'YG Entertainment': 881,
         'FNC Entertainment': 810,
         'Cube Entertainment': 715,
         'Starship Entertainment': 598,
         'DSP Media': 590,
         'Rainbow Bridge World': 505,
         'Pledis Entertainment': 420,
         'Woollim Entertainment': 417,
         'Big Hit Entertainment': 302,
         'Jellyfish Entertainment': 294,
         'ICONIX': 241,
         'fantagio music': 229,
         'LOEN Entertainment': 223,
         'WAKEONE': 219,
         'Core Contents Media': 181,
         'NH Media': 161,
         'Source Music': 160,
         'Nega Network': 137,
         'Dreamcatcher Company': 124,
         'KQ Entertainment': 114,
         'BPM Entertainment': 113,
         'J. Tune Camp': 87,
         'MNH Entertainment': 82,
         'Seven Seasons': 76,
         'OFF THE RECORD Entertainment': 73,
         'MODHAUS': 70,
         'B2M Entertainment': 69,
         'Around US

In [5]:
def get_year_class(year, min_year=1995, max_year=2024, bucket_size=6):
    """Map a year to a zero-based bucket index.

    The year is converted with ``int()``, clamped to ``[min_year, max_year]``
    and then assigned to consecutive spans of ``bucket_size`` years that start
    at ``min_year``. With the defaults this gives five classes:
    0 (<=2000), 1 (2001-2006), 2 (2007-2012), 3 (2013-2018), 4 (>=2019).

    Args:
        year: Anything accepted by ``int()`` (e.g. ``2013`` or ``'2013'``).
        min_year: Lower clamp; earlier years fall into class 0.
        max_year: Upper clamp; later years fall into the last class.
        bucket_size: Number of years covered by one class.

    Returns:
        int: The class index, ``0 .. (max_year - min_year) // bucket_size``.

    Raises:
        ValueError: If ``bucket_size`` is smaller than 1, if ``min_year`` is
            greater than ``max_year``, or if ``year`` cannot be converted by
            ``int()``.
    """
    if bucket_size < 1:
        raise ValueError(f'bucket_size must be >= 1, got {bucket_size}')
    if min_year > max_year:
        raise ValueError(f'min_year ({min_year}) must not exceed max_year ({max_year})')

    # Clamp first so out-of-range years land in the first / last class.
    clamped_year = min(max(int(year), min_year), max_year)
    return (clamped_year - min_year) // bucket_size

In [6]:
# Summarise every dataset split in the song-usage file: the number of unique
# song ids per label and the number of song ids per year class.
song_usage_json_path = Path('kpop-dataset/song_usage.json')
song_usage_dict = load_json(song_usage_json_path)

for data_type, song_id_dict in song_usage_dict.items():
    # Only the three dataset splits are summarised; other sections are skipped.
    if data_type not in ('train', 'test', 'valid'):
        continue

    # Per-label ids are de-duplicated; per-year ids keep every occurrence.
    ids_per_label = {name: set(ids) for name, ids in song_id_dict.items()}
    ids_per_year = defaultdict(list)
    for ids in song_id_dict.values():
        for song_id in ids:
            release_year, _, _ = decode_song_id(song_id)
            ids_per_year[get_year_class(release_year)].append(song_id)

    for label, song_id_list in ids_per_label.items():
        print(f'{data_type} {label}: {len(song_id_list)}')
    for year, song_id_list in ids_per_year.items():
        print(f'{data_type} {year}: {len(song_id_list)}')

    print('-' * 50)

train SM: 837
train YG: 837
train JYP: 837
train HYBE: 837
train 0: 120
train 1: 357
train 2: 519
train 3: 1081
train 4: 1271
--------------------------------------------------
valid SM: 105
valid YG: 105
valid JYP: 105
valid HYBE: 105
valid 0: 11
valid 1: 48
valid 2: 64
valid 3: 137
valid 4: 160
--------------------------------------------------
test SM: 105
test YG: 105
test JYP: 105
test HYBE: 105
test 0: 13
test 1: 43
test 2: 74
test 3: 154
test 4: 136
--------------------------------------------------


In [8]:
# Run the project's evaluation entry point. Per the recorded output below it
# loads the train, valid and test splits in turn and prints loss, label
# accuracy and year accuracy for each.
# NOTE(review): this import sits mid-notebook rather than in the top import
# cell, and the recorded test-split run reports "Total Examples: 5727" after
# loading 420 items — confirm that is expected.
from tester import test_model
test_model()

Loading train data: 100%|██████████| 3348/3348 [00:02<00:00, 1258.25it/s]
100%|██████████| 53/53 [00:03<00:00, 15.87it/s]


Test Loss: 1.3784
Test Label Accuracy: 0.3889
Test Year Accuracy: 0.3907
Total Examples: 3348


Loading valid data: 100%|██████████| 420/420 [00:01<00:00, 392.81it/s] 
100%|██████████| 7/7 [00:00<00:00, 17.86it/s]


Test Loss: 1.3780
Test Label Accuracy: 0.3619
Test Year Accuracy: 0.3881
Total Examples: 420


Loading test data: 100%|██████████| 420/420 [01:22<00:00,  5.11it/s]
100%|██████████| 90/90 [00:05<00:00, 16.78it/s]


Test Loss: 1.4118
Test Label Accuracy: 0.3599
Test Year Accuracy: 0.3814
Total Examples: 5727


In [None]:
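# NOTE: disabled draft of a per-artist case-study summary; kept for reference only.
# Before re-enabling: `case_name == ['artist']` compares a dict key to a list
# (never equal for string keys, so every case would be skipped), and
# `ids_per_label` / `ids_per_year` are not initialised anywhere in this cell.
# song_usage_json_path = Path('kpop-dataset/song_usage.json')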
# song_usage_dict = load_json(song_usage_json_path)

# for data_type, song_id_dict in song_usage_dict.items():
#     if data_type in ['case_study']:
#         for case_name, song_id_list in song_id_dict.items():
#             if not case_name == ['artist']:
#                 continue
#             predicted_label_per_artist = {}
#             for song_id in song_id_list:
#                 year, title, _ = decode_song_id(song_id)
#                 year_class = get_year_class(year)
                
#                 predicted_label_per_artist[title] = year_class


#                 ids_per_year[year_class].append(song_id)
        
#         for label, song_id_list in ids_per_label.items():
#             print(f'{data_type} {label}: {len(song_id_list)}')
#         for year, song_id_list in ids_per_year.items():
#             print(f'{data_type} {year}: {len(song_id_list)}')

#         print('-' * 50)