# 플레이리스트 기반 노래 데이터 수집

In [862]:
import pandas as pd 
import numpy as np
import requests
from bs4 import BeautifulSoup as bs
import json
import re
from tqdm import tqdm

## 플레이리스트 단어 검색 메인

In [724]:
# Percent-encode a (Korean) keyword so it can be embedded in a URL.
def keyword_to_unicode(keyword: str) -> str:
    """Return *keyword* percent-encoded as UTF-8 (e.g. '행복' -> '%ED%96%89%EB%B3%B5').

    The previous implementation rewrote the ``repr`` of the encoded bytes and
    upper-cased the whole result, which also upper-cased plain ASCII keywords
    and left spaces and reserved characters unescaped. ``urllib.parse.quote``
    produces the same upper-case hex escapes for non-ASCII text while leaving
    unreserved ASCII characters untouched.

    Args:
        keyword: Search keyword in any script.

    Returns:
        The keyword with every byte outside the URL "unreserved" set replaced
        by an upper-case ``%XX`` escape.
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import quote

    # safe='' so that '/', which quote() keeps by default, is escaped too.
    return quote(keyword, safe='')

# Unused experimental code: queries the DJ-finder JSON endpoint directly.
def get_playlst_main(keyword: str = '행복') -> dict:
    """POST a tag search to Melon's DJ-finder JSON endpoint and return the parsed reply.

    Args:
        keyword: Tag keyword to search for.

    Returns:
        The decoded JSON object returned by the endpoint.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    playlist_url = "https://www.melon.com/dj/djfinder/djfinder_informCnt.json"
    # Percent-encoded form of the keyword, used in the Referer and the payload.
    keyword_unicode = keyword_to_unicode(keyword)
    # Browser-like request headers. Content-Length is intentionally not set:
    # requests computes it from the actual body, so a hard-coded value is
    # at best redundant.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'ko,en-US;q=0.9,en;q=0.8,zh;q=0.7',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.melon.com',
        'Origin': 'https://www.melon.com',
        'Referer': f'https://www.melon.com/dj/djfinder/djfinder_inform.htm?djSearchType=T&djSearchKeyword={keyword_unicode}',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    # Form payload.
    # NOTE(review): keyword_unicode is already percent-encoded and requests
    # form-encodes it again; the recorded sample reply still echoes the
    # keyword correctly, so this is left as-is -- confirm before relying on it.
    data = {
        'djSearchType': 'T',
        'djSearchKeyword': keyword_unicode,
        'pagingFlag': 'Y',
        'tagSearchType': 'S'
    }
    # A timeout prevents the call from hanging forever on a stalled connection.
    response = requests.post(playlist_url, data, headers=headers, timeout=10)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    playlist_main = json.loads(response.text)
    return playlist_main

In [725]:
# Smoke test: query the DJ-finder JSON endpoint for the keyword '행복'.
get_playlst_main('행복')

{'ORDERBY': 'POP',
 'djSearchKeyword': '#행복',
 'djSearchType': 'T',
 'pagingFlag': 'Y',
 'djPlylstList': [],
 'relationTagList': [],
 'djPlylstListTotCnt': 1033,
 'menuInfo': {'subMenu': 'DJ_FINDER',
  'mainMenu': 'DJ',
  'subMenuDpFlg': True,
  'menuTitle': '멜론DJ>DJ파인더>멜론',
  'menuLocation': '<a href="/dj/today/djtoday_list.htm" title="멜론DJ - 페이지 이동">멜론DJ</a> &gt; <strong><span class="none">현재 위치</span>DJ파인더</strong>'},
 'menuId': 67190101,
 'tagSearchType': 'S',
 'httpDomain': 'http://www.melon.com',
 'httpsDomain': 'https://www.melon.com',
 'staticDomain': 'https://static.melon.co.kr'}

## 관련 tag 수집

In [726]:
def get_related_tags(keyword: str = '행복') -> dict:
    """Scrape the DJ-finder page for *keyword* and collect its related tags.

    Args:
        keyword: Tag keyword to look up.

    Returns:
        A dict with the keys ``keyword``, ``playlist_numbers`` (first integer
        found in the first ``<p class="text">`` element),
        ``related_tags_numbers`` (number of tag buttons found) and
        ``related_tags`` (the ``id`` attribute of every tag button).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page contains no ``<p class="text">`` element or
            that element contains no digits.
    """
    playlist_tag_url = f'https://www.melon.com/dj/djfinder/djfinder_inform.htm?djSearchType=T&djSearchKeyword={keyword}'
    # Browser-like User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    }
    # A timeout prevents the call from hanging forever on a stalled connection.
    response = requests.get(playlist_tag_url, headers=headers, timeout=10)
    response.raise_for_status()
    parse = bs(response.text, 'html.parser')

    playlist_numbers_txt = parse.find_all("p", "text")
    related_tags_txt = parse.find_all("button", "tag_item")
    # The playlist count is the first run of digits in the first matching <p>.
    playlist_numbers = int(re.findall(r'[0-9]+', playlist_numbers_txt[0].text)[0])
    # Each tag button carries the tag name in its id attribute.
    related_tags = [button['id'] for button in related_tags_txt]

    # Key order is kept because callers turn this dict into DataFrame columns.
    return {
        'keyword': keyword,
        'playlist_numbers': playlist_numbers,
        'related_tags_numbers': len(related_tags_txt),
        'related_tags': related_tags,
    }


In [727]:
# Example: scrape the playlist count and related tags for the keyword '두근'.
a = get_related_tags('두근')

In [728]:
a['keyword'], a['playlist_numbers'], a['related_tags_numbers']

('두근', 369, 30)

In [729]:
print(a['related_tags'])

['설렘', '사랑', '봄', '달달', '기분전환', '설레임', '감성', '카페', '기분좋은', '썸', '인디', '휴식', '연애', '고백', '몽글몽글', '국내', '달달해', '분위기', '매장', '청춘', '커플', '힙합', '설레이는', '힐링', '스윗', '짝사랑', '두근두근', '명곡모음', '국힙', '셀렘']


In [730]:
# "Popular theme" keywords; used as the default keyword list of get_all_related_tags.
인기테마 = [
    '기분전환', '감성', '힐링', '드라이브', '사랑',
    '추억', '이별', '여행', '여름', '휴식',
    '운동', '비오는날', '분위기', '위로', '트렌디',
    '공부', '몽환', 'ASMR', '카페', '클럽',
    '매장', '노래방', '버스', '라운지', '한강',
    '집', '지하철',
]

In [731]:
# "Popular genre" keywords.
인기장르 = [
    '발라드', '힙합', '인디', '댄스', '뉴에이지', '알앤비',
    '재즈', '클래식', '록', '팝', 'OST', 'EDM',
    'CCM', 'JPOP', '트로트', '월드뮤직', '블루스', '컨트리',
]

In [732]:
# "Popular tag" keywords.
인기태그 = [
    '환상', '느긋한', '페스티벌',
    '여름밤', '싱잉랩', '생각',
    '흐림', '잔잔한', '설렘',
]

In [733]:
# Major sentiment keywords, in order: happy, angry, sad, peaceful.
감정태그1 = [
    '행복한',
    '화나는',
    '슬픈',
    '평온한',
]

In [734]:
# Fine-grained emotion keywords, in order: excited, happy, contented, calm,
# fatigue, depressed, sad, disgusted, angry, nervous, fear, surprised.
감정태그2 = [
    '설레는', '행복한', '만족하는', '침착한',
    '피로한', '우울한', '슬픈', '역겨운',
    '화난', '불안한', '두려운', '놀란',
]

In [735]:
# One row per related tag; the scalar fields of the result dict are repeated on every row.
tag_df = pd.DataFrame(get_related_tags('행복'))

In [736]:
tag_df

Unnamed: 0,keyword,playlist_numbers,related_tags_numbers,related_tags
0,행복,1033,30,기분전환
1,행복,1033,30,사랑
2,행복,1033,30,감성
3,행복,1033,30,휴식
4,행복,1033,30,힐링
5,행복,1033,30,드라이브
6,행복,1033,30,설렘
7,행복,1033,30,카페
8,행복,1033,30,주말
9,행복,1033,30,신나는


In [865]:
def get_all_related_tags(keyword_list: list = 인기테마):
    """Collect the related tags of every keyword into one DataFrame.

    Args:
        keyword_list: Keywords to look up; defaults to the module-level
            ``인기테마`` list. The list is only read, never modified.

    Returns:
        The per-keyword frames built from ``get_related_tags`` stacked
        vertically (each frame keeps its own 0-based index), or an empty
        DataFrame when ``keyword_list`` is empty.
    """
    # The helper defined in this notebook is get_related_tags (plural); the
    # singular name used previously is not defined here and raised NameError
    # on a fresh kernel.
    frames = [
        pd.DataFrame(get_related_tags(keyword))
        for keyword in tqdm(keyword_list)
    ]
    # pd.concat rejects an empty sequence, so handle that case explicitly.
    if not frames:
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated frame on every step.
    return pd.concat(frames)

In [866]:
# Collect related tags for every keyword in the default list (인기테마).
tag_df = get_all_related_tags()

100%|██████████| 27/27 [00:09<00:00,  2.76it/s]


In [867]:
tag_df

Unnamed: 0,keyword,playlist_numbers,related_tags_numbers,related_tags
0,기분전환,25078,30,드라이브
1,기분전환,25078,30,신나는
2,기분전환,25078,30,휴식
3,기분전환,25078,30,감성
4,기분전환,25078,30,여행
...,...,...,...,...
25,지하철,276,30,일상
26,지하철,276,30,매장
27,지하철,276,30,공부
28,지하철,276,30,사랑


## 플레이리스트 수집기

In [870]:
 def get_playlist(keyword: str = '행복', page_no: int = 0, total: bool = True):
     """Scrape Melon DJ playlists tagged with *keyword* into a DataFrame.

     Args:
         keyword: Tag keyword to search for.
         page_no: Zero-based result page to fetch; only used when ``total``
             is false.
         total: When true, read the playlist count from the tag page and
             fetch every result page; otherwise fetch only ``page_no``.

     Returns:
         A DataFrame with the columns ``plylstSeq`` (first run of three or
         more digits in each link's href), ``title`` (the link's title
         attribute minus its last five characters) and ``tag`` (the keyword).

     Raises:
         requests.HTTPError: If any request answers with an error status.
         IndexError: If the playlist count cannot be found on the tag page.
     """
     # Percent-encoded keyword for the paging URL.
     keyword_unicode = keyword_to_unicode(keyword)
     # Browser-like User-Agent.
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
     }

     def fetch_page_links(page_index):
         """Return the playlist <a> tags found on one 20-item result page."""
         start_index = page_index * 20 + 1
         playlist_url = f"https://www.melon.com/dj/djfinder/djfinder_inform.htm?startIndex={start_index}&pageSize=20&djSearchType=T&djSearchKeyword=%23{keyword_unicode}&orderBy=POP&pagingFlag=Y&tagSearchType=S"
         response = requests.get(playlist_url, headers=headers, timeout=10)
         response.raise_for_status()
         return bs(response.text, 'lxml').find_all("a", "ellipsis album_name")

     titles = []
     if total:
         # Read the total number of playlists from the tag page.
         playlist_tag_url = f'https://www.melon.com/dj/djfinder/djfinder_inform.htm?djSearchType=T&djSearchKeyword={keyword}'
         response_tag = requests.get(playlist_tag_url, headers=headers, timeout=10)
         response_tag.raise_for_status()
         parse_tag = bs(response_tag.text, 'lxml')
         playlist_numbers_txt = parse_tag.find_all("p", "text")
         playlist_numbers = int(re.findall(r'[0-9]+', playlist_numbers_txt[0].text)[0])

         print(keyword,"키워드로 ",playlist_numbers,"전체 플레이리스트 수집을 시작합니다.")
         # Ceiling division: exactly as many pages as needed, so no extra
         # empty page is requested when the count is a multiple of 20.
         page_count = -(-playlist_numbers // 20)
         for page_index in range(page_count):
             titles.extend(fetch_page_links(page_index))
             # One progress mark per fetched page.
             print("*", end="")
     else:
         print(keyword,"키워드로 ",page_no,"번 페이지 수집을 시작합니다.")
         titles.extend(fetch_page_links(page_no))

     # Build the result frame from the collected links.
     df = pd.DataFrame()
     df['plylstSeq'] = [
         re.findall(r'[0-9][0-9][0-9]+', link['href'])[0] for link in titles
     ]
     # The last five characters of the title attribute are dropped --
     # presumably a fixed suffix added by the site; TODO confirm.
     df['title'] = [link['title'][:-5] for link in titles]
     df['tag'] = keyword
     print("")
     print("complete.")
     return df

In [871]:
# Collect every playlist tagged '행복'; with total=True all result pages are fetched.
playlist_df = get_playlist(keyword='행복', page_no=0, total=True)

행복 키워드로  1033 전체 플레이리스트 수집을 시작합니다.
****************************************************
complete.


In [872]:
playlist_df

Unnamed: 0,plylstSeq,title,tag
0,495119848,나도 모르게 미소가 지어지는 싱그러운 여름날의 JAZZ,행복
1,444702404,언제 들어도 기분 좋아지는 멜로디의 POP,행복
2,504995932,새벽 비행기에서 듣고 싶은 몽글몽글한 감성 POP,행복
3,511123665,"여유로운 아침, 따스한 햇빛이 기분 좋게 나를 깨울 때",행복
4,492052479,몽글몽글 첫사랑의 기억 -`♡´- Korea High Teen ✧,행복
...,...,...,...
1028,512649653,카페에서 들으면 잠시 감상하게 되는 노래,행복
1029,505616635,듣기만 해도 기분 up되는 걸그룹 모음,행복
1030,505287701,"집콕 주말, 여유롭게 듣기 좋은 ᴘᴏᴘ",행복
1031,511038158,행복지수 UP! 피곤한 발걸음이 가벼워지는 기분 좋은 디즈니 OST ◡̎,행복


In [875]:
def get_all_playlist(keyword_list: list = ['행복','사랑','슬픔']):
    """Collect the playlists of every keyword into one DataFrame.

    Args:
        keyword_list: Keywords to collect. The default list is only read,
            never modified, so sharing it between calls is harmless.

    Returns:
        The frames returned by ``get_playlist(keyword, 0, True)`` stacked
        vertically (each keeps its own 0-based index), or an empty DataFrame
        when ``keyword_list`` is empty.
    """
    frames = []
    for idx, keyword in enumerate(tqdm(keyword_list)):
        print(idx, keyword)
        frames.append(get_playlist(keyword, 0, True))
    # pd.concat rejects an empty sequence, so handle that case explicitly.
    if not frames:
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated frame on every step.
    return pd.concat(frames)

In [876]:
# Collect playlists for the default keywords ('행복', '사랑', '슬픔').
all_playlist_df = get_all_playlist()

  0%|          | 0/3 [00:00<?, ?it/s]

0 행복
행복 키워드로  1033 전체 플레이리스트 수집을 시작합니다.
**************************************************

 33%|███▎      | 1/3 [00:08<00:16,  8.44s/it]

**
complete.
1 사랑
사랑 키워드로  10984 전체 플레이리스트 수집을 시작합니다.
*********************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

 67%|██████▋   | 2/3 [02:00<01:09, 69.44s/it]

*
complete.
2 슬픔
슬픔 키워드로  4460 전체 플레이리스트 수집을 시작합니다.
*******************************************************************************************************************************************************************************************************************************

100%|██████████| 3/3 [02:50<00:00, 56.69s/it]

*
complete.





In [877]:
all_playlist_df

Unnamed: 0,plylstSeq,title,tag
0,495119848,나도 모르게 미소가 지어지는 싱그러운 여름날의 JAZZ,행복
1,444702404,언제 들어도 기분 좋아지는 멜로디의 POP,행복
2,504995932,새벽 비행기에서 듣고 싶은 몽글몽글한 감성 POP,행복
3,511123665,"여유로운 아침, 따스한 햇빛이 기분 좋게 나를 깨울 때",행복
4,492052479,몽글몽글 첫사랑의 기억 -`♡´- Korea High Teen ✧,행복
...,...,...,...
4455,419801038,상처받은 내마음을 표류하는중....,슬픔
4456,412508170,<<취향저격!!>> 이별노래만 모았어요,슬픔
4457,401801099,이별 후 공감가는 가사와 멜로디,슬픔
4458,504824242,"내 이야기인 것만 같은 가사, 마음을 후벼파는 짝사랑 감성 노래",슬픔


In [878]:
# Rows whose title already appeared earlier in the frame (the first occurrence is not flagged).
all_playlist_df[all_playlist_df['title'].duplicated()]

Unnamed: 0,plylstSeq,title,tag
50,507061702,"사랑, 설레임, 기쁨, 행복을 느끼고 싶을 때 같이 들어요.",행복
14,492052479,몽글몽글 첫사랑의 기억 -`♡´- Korea High Teen ✧,사랑
56,504697242,시끄럽지 않은 둠칫한 국내음악 TOP100,사랑
77,403667077,결혼 축가로 부르면 좋을 것 같은 곡,사랑
86,470783092,사랑하는 사람과 함께 듣고픈 국내 발라드,사랑
...,...,...,...
4261,423806162,우울할 때 들으면 더 우울한 노래,슬픔
4355,408404721,이별노래,슬픔
4381,100034625,이별 그리고 그리움,슬픔
4421,100006500,〃이별後愛〃,슬픔


## playlist 곡 수집

https://www.melon.com/mymusic/dj/mymusicdjplaylistview_inform.htm?plylstSeq=511975096

In [879]:
def get_playlist_songs(plylstseq: str):
    """Scrape the tracks of one Melon DJ playlist into a DataFrame.

    Args:
        plylstseq: Playlist sequence id, inserted into the playlist URL.

    Returns:
        A DataFrame with the columns ``song_code``, ``song_name``,
        ``singer_code``, ``singer_name`` and ``plylstseq``; one row per song
        element found. When a song's artist element has no link,
        ``singer_code`` is ``''`` and ``singer_name`` is ``'Various Artists'``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If fewer artist elements than song elements are found.
    """
    playlist_url = f'https://www.melon.com/mymusic/dj/mymusicdjplaylistview_inform.htm?plylstSeq={plylstseq}'
    # Browser-like User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    }
    # A timeout prevents the call from hanging forever on a stalled connection.
    response = requests.get(playlist_url, headers=headers, timeout=10)
    response.raise_for_status()
    parse = bs(response.text, 'html.parser')

    # NOTE(review): the positional "wrap_song_info" class filter appears to be
    # overridden by the class_ keyword in BeautifulSoup -- confirm and drop it.
    song_txt = parse.find_all("div", "wrap_song_info", class_="ellipsis rank01")
    singer_txt = parse.find_all("div", "wrap_song_info", class_="ellipsis rank02")
    # Songs and artists are paired by position; fail with a clear message
    # rather than part-way through the loop.
    if len(singer_txt) < len(song_txt):
        raise IndexError(
            f"found {len(song_txt)} song elements but only "
            f"{len(singer_txt)} artist elements for playlist {plylstseq}"
        )

    song_code = []
    song_name = []
    singer_code = []
    singer_name = []

    for song_div, singer_div in zip(song_txt, singer_txt):
        song_link = song_div.find("a")
        # Second comma-separated href argument, minus its last two characters.
        song_code.append(song_link['href'].split(',')[1][:-2])
        # Title attribute minus its last three characters -- presumably a
        # fixed suffix; TODO confirm.
        song_name.append(song_link['title'][:-3])

        singer_link = singer_div.find("a")
        if singer_link is not None:
            # Artist id is the first single-quoted value in the href.
            singer_code.append(singer_link['href'].split('\'')[1])
            # Title attribute minus its last nine characters -- presumably a
            # fixed suffix; TODO confirm.
            singer_name.append(singer_link['title'][:-9])
        else:
            # No artist link: record a placeholder.
            singer_code.append('')
            singer_name.append('Various Artists')

    # Key order defines the DataFrame column order.
    result = {
        'song_code': song_code,
        'song_name': song_name,
        'singer_code': singer_code,
        'singer_name': singer_name,
        'plylstseq': plylstseq,
    }
    return pd.DataFrame(result)


'480931922', '512990548'

In [851]:
# Example: fetch the track list of a single playlist.
songs_df = get_playlist_songs('480931922')

In [852]:
songs_df

Unnamed: 0,song_code,song_name,singer_code,singer_name,plylstseq
0,32333811,I Love Today,713743.0,7pm,480931922
1,31831071,Little Bird (tvN '스페인 하숙' 삽입곡),618273.0,물고기꿈,480931922
2,31622729,Morning Diary,38547.0,해리,480931922
3,8147021,새콤 달콤한 행복,944561.0,달빛바다,480931922
4,32227785,냥이의 숨바꼭질,943648.0,From Paris,480931922
5,1370395,Dance Of The Dragonfly,2078.0,Kevin Kern,480931922
6,32465462,Perfect Day,713920.0,Shizuko Mori,480931922
7,332494,Butterfly Waltz,2029.0,Brian Crain,480931922
8,8209025,Sunny Days,944395.0,서른의 꿈,480931922
9,7998989,둘이서 걷던 길,905398.0,레인시티 (RainCity),480931922


In [880]:
def get_all_playlist_songs(plylstseq_list: list):
    """Collect the tracks of every playlist id into one DataFrame.

    Args:
        plylstseq_list: Playlist sequence ids to fetch.

    Returns:
        The frames returned by ``get_playlist_songs`` stacked vertically
        (each keeps its own 0-based index), or an empty DataFrame when
        ``plylstseq_list`` is empty.
    """
    frames = [get_playlist_songs(plylstseq) for plylstseq in tqdm(plylstseq_list)]
    # pd.concat rejects an empty sequence, so handle that case explicitly.
    if not frames:
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated frame on every step.
    return pd.concat(frames)

In [854]:
print(all_playlist_df['plylstSeq'].tolist())

['495119848', '444702404', '504995932', '511123665', '492052479', '511098435', '499744964', '508626623', '480931922', '512990548', '460627349', '504697242', '512843101', '403667077', '470783092', '491701740', '429597019', '509261613', '432074969', '457526054', '437003745', '496147745', '492762296', '457119905', '495603593', '438695292', '501930129', '493846263', '459865865', '470671906', '499155892', '448130991', '504255659', '497332006', '450981992', '485757846', '457906612', '497982591', '512785688', '490239006', '463212469', '429411613', '492405684', '507552918', '512635285', '495294839', '467347581', '493676823', '509114907', '501362801', '507061702', '507239888', '495286165', '511638505', '509786790', '508399946', '417491293', '478197719', '510121471', '511813927', '457486188', '456726993', '493278047', '450613215', '495644164', '484253015', '442905246', '486373525', '507941928', '477950156', '479084166', '463139663', '462415182', '505794177', '457957059', '512056129', '512679258'

In [886]:
# Fetch songs for only the first 100 collected playlist ids.
all_songs_df = get_all_playlist_songs(all_playlist_df['plylstSeq'].tolist()[:100])

100%|██████████| 100/100 [01:01<00:00,  1.63it/s]


In [887]:
all_songs_df

Unnamed: 0,song_code,song_name,singer_code,singer_name,plylstseq
0,32189652,Do It The Hard Way,7267,Chet Baker,495119848
1,8004369,Summer Samba,674648,Laura Ann,495119848
2,4208989,Desafinado (Original Mix),722669,Brazil Beat,495119848
3,35207596,Only You,16863,최성권,495119848
4,4543944,Cheek To Cheek,765473,JazzMaTazz,495119848
...,...,...,...,...,...
15,4499981,내 주는 살아 계시고,,Various Artists,491491277
16,989159,빈 들에 마른 풀같이,100447,국립합창단 (The National Chorus of Korea),491491277
17,3585841,예수는 나의 힘이요,,Various Artists,491491277
18,5381485,은혜가 풍성한 하나님은,,Various Artists,491491277


In [885]:
# Rows whose singer_name is 'Various Artists' (the placeholder get_playlist_songs records when no artist link is found).
all_songs_df[all_songs_df['singer_name'] == 'Various Artists']

Unnamed: 0,song_code,song_name,singer_code,singer_name,plylstseq
23,7877641,Summer (영화 '기쿠지로의 여름'),,Various Artists,480931922
13,4093116,Debussy : Children's Corner L.113 - VI. Golliw...,,Various Artists,432074969
21,4076178,Tchaikovsky : The Nutcracker Suite Op.71a - VI...,,Various Artists,432074969


## 곡 수집

In [None]:
import re
import requests
from bs4 import BeautifulSoup as bs

# Browser-like User-Agent shared by the song search and song detail requests below.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/61.0.3163.100 Safari/537.36"
    ),
}

def get_song_list(query):
    """Search Melon for *query* and return detail records for the songs found.

    Args:
        query: Search text, inserted into the search URL as the ``q`` value.

    Returns:
        A list with one ``get_song_info`` result per result row that has a
        song link. Empty when the page contains none of the known result
        forms or the form has no table body.

    Raises:
        requests.HTTPError: If the search request answers with an error status.
    """
    url = f"https://www.melon.com/search/total/index.htm?q={query}"
    # A timeout prevents the call from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    page = bs(response.content, "lxml")

    # The result table can live in any of these forms; use the first one present.
    form = None
    for form_id in ("frm_songList", "frm_searchSong", "frm_searchArtist"):
        form = page.find("form", id=form_id)
        if form is not None:
            break
    tbody = form.find("tbody") if form is not None else None
    if tbody is None:
        # No result table on the page: report "nothing found" rather than
        # failing with an AttributeError on None.
        return []

    song_list = []
    for row in tbody.find_all("tr"):
        song = row.find("a", class_="fc_gray")
        if song is None:
            continue
        # The song id is the last comma-separated href argument, with the
        # trailing ")" / ";" characters removed.
        song_id = song["href"].split(",")[-1].rstrip(");")
        song_list.append(get_song_info(song_id))
    return song_list


def get_song_info(song_id):
    """Scrape the Melon detail page of one song.

    Args:
        song_id: Song id inserted into the detail-page URL.

    Returns:
        A dict with the keys ``song_id``, ``song_title``, ``artist``
        (artist link titles joined by ", "), ``album_cover`` (image URL with
        the "/282/" and "/80/" path segments replaced by "/512/" and
        "/100/"), ``album_title``, ``public_date``, ``genre`` (first
        comma-separated entry) and ``lyrics`` (empty string when the page
        has no lyrics element).

    Raises:
        requests.HTTPError: If the request answers with an error status.
    """
    url = f"https://www.melon.com/song/detail.htm?songId={song_id}"
    # A timeout prevents the call from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    page = bs(response.content, "html.parser")

    artists = page.find("div", class_="artist").find_all("a", class_="artist_name")

    cover_link = page.find("a", class_="image_typeAll").find("img")["src"]
    cover_link = cover_link.replace("/282/", "/512/").replace("/80/", "/100/")

    # Remove the "곡명" label as a prefix. str.lstrip would treat it as a set
    # of characters and also eat title characters that happen to be in it.
    title_label = "곡명"
    song_title = page.find("div", class_="song_name").get_text(strip=True)
    if song_title.startswith(title_label):
        song_title = song_title[len(title_label):]

    # Look the meta block up once; album, date and genre are all read from it.
    meta = page.find("div", class_="meta")
    meta_values = meta.find_all("dd")

    # Remove the "펼치기" button text as a suffix. str.rstrip would treat it
    # as a set of characters and also eat lyric characters that are in it.
    expand_label = "펼치기"
    lyric_box = page.find(class_="wrap_lyric")
    if lyric_box is None:
        lyrics = ""
    else:
        lyrics = lyric_box.get_text(separator="\n", strip=True)
        if lyrics.endswith(expand_label):
            lyrics = lyrics[:-len(expand_label)]

    result = {
        "song_id": song_id,
        "song_title": song_title,
        "artist": ", ".join([artist["title"] for artist in artists]),
        "album_cover": cover_link,
        # Non-breaking spaces are normalised to plain spaces.
        "album_title": meta.find("a").get_text(strip=True).replace("\xa0", " "),
        "public_date": meta_values[1].get_text(strip=True),
        "genre": meta_values[2].get_text(strip=True).split(", ")[0],
        "lyrics": lyrics,
    }

    return result