In [41]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
import numpy as np
import pandas as pd
import json
from collections import defaultdict
import os
import io
import distutils.dir_util

from collections import Counter
from itertools import chain

In [43]:
def write_json(data, fname):
    def _conv(o):
        if isinstance(o, (np.int64, np.int32)):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath(parent)

    with io.open(fname, "w", encoding="utf-8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)


def load_json(fname):
    with open(fname, encoding='utf-8') as f:
        json_obj = json.load(f)

    return json_obj

In [44]:
PATH = f'/content/drive/MyDrive/Test/'
genre_gn_all_dict = load_json(PATH + 'genre_gn_all.json')
song_meta_json = load_json(PATH + 'song_meta.json')

In [45]:
train = load_json(PATH + 'test.json')
val = load_json(PATH + 'val.json')
test = load_json(PATH + 'test.json')

data = train + val + test
data_df = pd.DataFrame(data)
data_df.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[],70107,,"[398985, 449403, 411543, 528044, 143048, 98020...",6,2012-09-29 01:57:26.000
1,"[나만의Best3, 인디아티스트들의추천음악]",7461,,"[196298, 269984, 267805, 175867, 529244, 63825...",0,2019-12-17 14:06:45.000
2,[드라이브],90348,,"[273433, 331003, 68432, 411659, 117793, 616860...",21,2015-05-23 10:44:48.000
3,[분위기],58617,,"[702227, 48152, 440008, 358488, 701041, 540721...",0,2019-03-14 09:47:34.000
4,[],102395,,"[630683, 481582, 528550, 285114, 506667, 17922...",38,2018-07-11 16:43:32.000


In [46]:
songs_df = pd.DataFrame(song_meta_json)
songs_df.head()

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists],0
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia],1
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel],2
3,"[GN1102, GN1101]",20151016,Feeling Right (Everything Is Nice) (Feat. Popc...,2644882,[838543],Feeling Right (Everything Is Nice) (Feat. Popc...,[GN1100],[Matoma],3
4,"[GN1802, GN1801]",20110824,그남자 그여자,2008470,[560160],그남자 그여자,[GN1800],[Jude Law],4


In [47]:
genres = {key: val for key, val in genre_gn_all_dict.items() if key[-2:] == '00'}
genres['GN0700'] = '트로트'

melon_genres = {"발라드": ['GN0100'], "댄스": ['GN0200'], "랩/힙합": ['GN0300', 'GN1200'], "R&B/Soul": ['GN0400', 'GN1300'], "인디음악": ['GN0500'], "록/메탈": ['GN0600', 'GN1000'], "트로트": ['GN0700'], "포크/블루스": ['GN0800', 'GN1400']}

In [48]:
songs_df['count'] = 0
songs_df['genre'] = songs_df['song_gn_gnr_basket'].map(lambda x: tuple(x)[0] if x else 'GN0000')
songs_df.head()

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id,count,genre
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists],0,0,GN0900
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia],1,0,GN1600
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel],2,0,GN0900
3,"[GN1102, GN1101]",20151016,Feeling Right (Everything Is Nice) (Feat. Popc...,2644882,[838543],Feeling Right (Everything Is Nice) (Feat. Popc...,[GN1100],[Matoma],3,0,GN1100
4,"[GN1802, GN1801]",20110824,그남자 그여자,2008470,[560160],그남자 그여자,[GN1800],[Jude Law],4,0,GN1800


In [50]:
songs_gb = songs_df.groupby('genre')
songs_PATH = PATH + 'Songs_by_Genres/'
if not os.path.exists(songs_PATH):
    os.mkdir(songs_PATH)

melon_genres_values = sum(melon_genres.values(), [])
for key, group in songs_gb:
    if key in melon_genres_values:
        _fix = genres[key]
        _fix = _fix.replace('/',  ' ')
        key += '_' + _fix
        group.to_json(songs_PATH + str(key) + '.json', force_ascii=False)

In [51]:
data_df['songs_counter'] = data_df['songs'].map(lambda x: Counter(x))

data_df.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,songs_counter
0,[],70107,,"[398985, 449403, 411543, 528044, 143048, 98020...",6,2012-09-29 01:57:26.000,"{398985: 1, 449403: 1, 411543: 1, 528044: 1, 1..."
1,"[나만의Best3, 인디아티스트들의추천음악]",7461,,"[196298, 269984, 267805, 175867, 529244, 63825...",0,2019-12-17 14:06:45.000,"{196298: 1, 269984: 1, 267805: 1, 175867: 1, 5..."
2,[드라이브],90348,,"[273433, 331003, 68432, 411659, 117793, 616860...",21,2015-05-23 10:44:48.000,"{273433: 1, 331003: 1, 68432: 1, 411659: 1, 11..."
3,[분위기],58617,,"[702227, 48152, 440008, 358488, 701041, 540721...",0,2019-03-14 09:47:34.000,"{702227: 1, 48152: 1, 440008: 1, 358488: 1, 70..."
4,[],102395,,"[630683, 481582, 528550, 285114, 506667, 17922...",38,2018-07-11 16:43:32.000,"{630683: 1, 481582: 1, 528550: 1, 285114: 1, 5..."


In [52]:
songs_counter = Counter()
for playlist in data:
    songs_counter += Counter(playlist['songs'])

In [67]:
songs_mc_counter = songs_counter.most_common()
songs_sub_df = songs_df[['id', 'genre']]


melon_genres_values = sum(melon_genres.values(), [])
top_songs = {genre: [] for genre in melon_genres_values}
print(top_songs)
for song_id, cnt in songs_mc_counter:
    _genre = songs_sub_df.iloc[song_id].genre
    try:
        top_songs[_genre].append((song_id, cnt))
    except KeyError:
        continue
print(top_songs)

{'GN0100': [], 'GN0200': [], 'GN0300': [], 'GN1200': [], 'GN0400': [], 'GN1300': [], 'GN0500': [], 'GN0600': [], 'GN1000': [], 'GN0700': [], 'GN0800': [], 'GN1400': []}
{'GN0100': [(144663, 317), (349492, 266), (650494, 266), (549178, 264), (627363, 257), (42155, 242), (463173, 242), (520093, 237), (396828, 237), (680366, 234), (427724, 234), (105140, 234), (448116, 232), (199262, 231), (675115, 230), (68348, 229), (187047, 228), (422915, 227), (582252, 226), (75842, 225), (215411, 222), (519391, 217), (509998, 216), (678762, 214), (668128, 213), (341513, 213), (442014, 212), (487911, 211), (643628, 210), (157055, 208), (209993, 206), (235773, 202), (339802, 197), (253755, 194), (648628, 192), (15318, 192), (108004, 189), (117595, 186), (351342, 184), (232874, 184), (407828, 183), (245317, 178), (457124, 178), (367963, 176), (95323, 175), (690604, 173), (521739, 171), (497066, 171), (118827, 171), (424813, 170), (640657, 170), (13142, 168), (166841, 166), (663905, 165), (358241, 165), 

In [68]:
top_20_songs = {genre: songs[:20] for genre, songs in top_songs.items()}

top_20_songs

{'GN0100': [(144663, 317),
  (349492, 266),
  (650494, 266),
  (549178, 264),
  (627363, 257),
  (42155, 242),
  (463173, 242),
  (520093, 237),
  (396828, 237),
  (680366, 234),
  (427724, 234),
  (105140, 234),
  (448116, 232),
  (199262, 231),
  (675115, 230),
  (68348, 229),
  (187047, 228),
  (422915, 227),
  (582252, 226),
  (75842, 225)],
 'GN0200': [(133484, 158),
  (326204, 130),
  (455668, 128),
  (345004, 124),
  (122363, 122),
  (138031, 106),
  (89620, 94),
  (26797, 90),
  (61595, 86),
  (354659, 82),
  (165434, 79),
  (461680, 77),
  (589292, 75),
  (506254, 74),
  (548896, 74),
  (431902, 73),
  (682550, 72),
  (430005, 71),
  (664555, 71),
  (420005, 71)],
 'GN0300': [(26083, 200),
  (377243, 182),
  (528278, 159),
  (237407, 151),
  (98472, 143),
  (454218, 142),
  (579592, 131),
  (156278, 128),
  (695494, 124),
  (590379, 123),
  (374617, 120),
  (451489, 119),
  (688951, 114),
  (679405, 114),
  (63976, 112),
  (347303, 109),
  (232305, 109),
  (275414, 107),
  (11

In [70]:
genres

{'GN0100': '발라드',
 'GN0200': '댄스',
 'GN0300': '랩/힙합',
 'GN0400': 'R&B/Soul',
 'GN0500': '인디음악',
 'GN0600': '록/메탈',
 'GN0700': '트로트',
 'GN0800': '포크/블루스',
 'GN0900': 'POP',
 'GN1000': '록/메탈',
 'GN1100': '일렉트로니카',
 'GN1200': '랩/힙합',
 'GN1300': 'R&B/Soul',
 'GN1400': '포크/블루스/컨트리',
 'GN1500': 'OST',
 'GN1600': '클래식',
 'GN1700': '재즈',
 'GN1800': '뉴에이지',
 'GN1900': 'J-POP',
 'GN2000': '월드뮤직',
 'GN2100': 'CCM',
 'GN2200': '어린이/태교',
 'GN2300': '종교음악',
 'GN2400': '국악',
 'GN2500': '아이돌',
 'GN2600': '일렉트로니카',
 'GN2700': 'EDM',
 'GN2800': '뮤직테라피',
 'GN2900': '뮤지컬',
 'GN3000': '크리스마스'}

In [69]:
melon_genres

{'R&B/Soul': ['GN0400', 'GN1300'],
 '댄스': ['GN0200'],
 '랩/힙합': ['GN0300', 'GN1200'],
 '록/메탈': ['GN0600', 'GN1000'],
 '발라드': ['GN0100'],
 '인디음악': ['GN0500'],
 '트로트': ['GN0700'],
 '포크/블루스': ['GN0800', 'GN1400']}

In [71]:
top_20_songs.keys()

dict_keys(['GN0100', 'GN0200', 'GN0300', 'GN1200', 'GN0400', 'GN1300', 'GN0500', 'GN0600', 'GN1000', 'GN0700', 'GN0800', 'GN1400'])

In [81]:
for key in top_20_songs.keys():
    tmp_PATH = PATH + 'Top_20_Songs/'
    if not os.path.exists(tmp_PATH):
        os.mkdir(tmp_PATH)

    for KOR_genre, genre_list in melon_genres.items():
        tmp_df = pd.DataFrame(columns=songs_df.columns)
        _prefix = ""
        for genre_code in genre_list:
            _prefix += '_' + genre_code
            for id, cnt in top_20_songs[genre_code]:
                song = songs_df[songs_df['id'] == id]
                song['count'] = cnt
                tmp_df = tmp_df.append(song)
        file_name = KOR_genre.replace('/', '-') + _prefix
        tmp_df.to_json(tmp_PATH + file_name + '.json', force_ascii=False)

    
    # tmp_df = pd.DataFrame(columns=song_df.columns)
    # for id, cnt in top_20_songs[key]:
    #     song = song_df[song_df['id'] == id]
    #     song['count'] = cnt
    #     tmp_df = tmp_df.append(song)
    
    # try:
    #     _fix = genre_gn_all_dict[key]
    #     _fix = _fix.replace('/',  ' ')
    # except KeyError:
    #     if key == 'GN0000':
    #         _fix = '장르없음'
    #     elif key == 'GN9000':
    #         _fix = '장르모름'
    
    # tmp_df.to_json(tmp_PATH + str(key) + '_' + _fix + '.json', force_ascii=False)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html

In [82]:
TOP_20_PATH = PATH + 'Top_20_Songs/'
file_list = os.listdir(TOP_20_PATH)
file_list

['발라드_GN0100.json',
 '댄스_GN0200.json',
 '랩-힙합_GN0300_GN1200.json',
 'R&B-Soul_GN0400_GN1300.json',
 '인디음악_GN0500.json',
 '록-메탈_GN0600_GN1000.json',
 '트로트_GN0700.json',
 '포크-블루스_GN0800_GN1400.json']

In [83]:
target_name = '발라드_GN0100.json'
FILE_PATH = TOP_20_PATH + target_name
tmp = load_json(FILE_PATH)

tmp_df = pd.DataFrame(tmp)

tmp_df

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id,count,genre
144663,"[GN0105, GN0101]",20170324,밤편지,10047890,[261143],밤편지,[GN0100],[아이유],144663,317,GN0100
349492,"[GN0105, GN0101]",20170905,어떤이별,10093138,[869753],어떤이별,[GN0100],[임승부],349492,266,GN0100
650494,"[GN0105, GN0101]",20140408,봄 사랑 벚꽃 말고,2248659,"[767374, 261143]",봄 사랑 벚꽃 말고,[GN0100],"[HIGH4 (하이포), 아이유]",650494,266,GN0100
549178,"[GN0105, GN0101]",20131220,Modern Times - Epilogue,2222587,[261143],금요일에 만나요 (Feat. 장이정 Of HISTORY),[GN0100],[아이유],549178,264,GN0100
627363,"[GN0105, GN0101]",20160527,스틸,2687420,[420527],널 사랑하지 않아,[GN0100],[어반자카파],627363,257,GN0100
42155,"[GN0105, GN0101]",20171211,벙어리,10118270,[2018151],벙어리,[GN0100],[홍아],42155,242,GN0100
463173,"[GN0105, GN0101]",20190220,비가 내렸어,10252752,[2632126],비가 내렸어 (Vocal by 스티브언니),[GN0100],[업라이트 (Upright)],463173,242,GN0100
520093,"[GN0105, GN0101]",20190418,고마운 사람,10275236,[2632126],고마운 사람 (Vocal by 이소진),[GN0100],[업라이트 (Upright)],520093,237,GN0100
396828,"[GN0105, GN0101]",20170726,쉬운사랑,10082442,[1817010],쉬운사랑,[GN0100],[중신],396828,237,GN0100
680366,"[GN0105, GN0101]",20191105,끝내지 못한 이야기,10347322,[2753212],끝내지 못한 이야기 (Feat. 호수),[GN0100],[어쿠스틱 멜로디 (Acoustic Melody)],680366,234,GN0100
