In [1]:
from moviepy.editor import AudioFileClip, CompositeAudioClip, concatenate_audioclips
from glob import glob
from math import floor
import os

# Source material: the songs to degrade and the noise recordings to lay under them.
org_files = list(glob('/workspace/demo/org/*'))
noise_files = list(glob('/workspace/demo/noise/*'))
compose_dir = '/workspace/demo/noised_comp'

print(org_files)
print(noise_files)

org_clips = [AudioFileClip(f) for f in org_files]
noise_clips = [AudioFileClip(f) for f in noise_files]

# Longest song duration; every noise bed must be at least this long.
# `default=0` keeps the cell runnable when the song folder is empty.
max_org_dur = max((c.duration for c in org_clips), default=0)
os.makedirs(compose_dir, exist_ok=True)
for i, ns_clip in enumerate(noise_clips):
    # floor(ratio) + 1 repetitions are always strictly longer than the longest
    # song.  A bare floor(ratio) fell short whenever the ratio was not an
    # integer and produced zero repetitions (an empty concatenation) for a
    # noise file longer than every song.
    loop = floor(max_org_dur / ns_clip.duration) + 1
    noise_clips[i] = concatenate_audioclips([ns_clip] * loop)




['/workspace/demo/org/Scarborough.wav', '/workspace/demo/org/Sky Of More.mp3', '/workspace/demo/org/Darling, Without You.mp3', "/workspace/demo/org/He Loves I Don't Care.mp3", '/workspace/demo/org/晚风心里吹.mp3', '/workspace/demo/org/Apology Of Dreams.mp3', '/workspace/demo/org/Home Rhythm.mp3', "/workspace/demo/org/Babe, We're Crazy.mp3", '/workspace/demo/org/Best Moves.mp3', '/workspace/demo/org/Open Up To Her Tomorrow.mp3', '/workspace/demo/org/千里之外.mp3', '/workspace/demo/org/Dream Of My Home.mp3', '/workspace/demo/org/Season Of My Heart.mp3']
['/workspace/demo/noise/增强版黑胶唱片底噪.aiff', '/workspace/demo/noise/尘封的磁带录音.wav', '/workspace/demo/noise/黑胶唱片播放底噪.wav', '/workspace/demo/noise/空白黑胶唱片播放底噪.wav', '/workspace/demo/noise/黑胶唱片噪音.flac']


In [20]:
# Mix every song with every noise bed and write the result as
# "<song>-<noise>.mp3" into compose_dir.
for org_fp, org_clip in zip(org_files, org_clips):
    # splitext/basename keep dots that belong to the title ("Mr. X.mp3" -> "Mr. X");
    # splitting on the first '.' would truncate such names and make outputs collide.
    org_name = os.path.splitext(os.path.basename(org_fp))[0]
    for ns_fp, ns_clip in zip(noise_files, noise_clips):
        ns_name = os.path.splitext(os.path.basename(ns_fp))[0]
        comp_fp = os.path.join(compose_dir, f"{org_name}-{ns_name}.mp3")
        # Never ask for more noise than the bed actually contains.
        noise_len = min(org_clip.duration, ns_clip.duration)
        # The song enters 1.5 s after the noise starts.
        comp_clip = CompositeAudioClip([org_clip.set_start(1.5), ns_clip.subclip(0, noise_len)])
        comp_clip.write_audiofile(comp_fp, org_clip.fps, logger=None)
            



In [None]:
from glob import glob
import subprocess
compose_dir = '/workspace/demo/noised_comp'

def low_bit(fp, bitrate):
    """Re-encode ``fp`` with ffmpeg at ``bitrate`` kbit/s.

    The result is written next to the input as ``<stem>-br_<bitrate>.mp3``
    and overwrites an existing file of that name (``-y``).

    Args:
        fp: path of the source file; the last four characters are treated as
            the extension and stripped to build the output name.
        bitrate: target audio bitrate in kbit/s.
    """
    lb_fp = fp[:-4] + f'-br_{bitrate}.mp3'
    # An argument list (no shell) passes paths verbatim: a shell command string
    # splits file names on spaces and lets special characters in a name be
    # interpreted by the shell.
    cmd = ["ffmpeg", "-y", "-i", fp, "-b:a", f"{bitrate}k", lb_fp]
    subprocess.run(cmd)

# Produce 32/64/128 kbit/s variants of a composed track.  Files that already
# carry a "-br_" marker are earlier outputs and are skipped.
# NOTE(review): the loop stops after the first source file it converts —
# looks like a trial-run limiter; confirm before relying on a full batch.
for src_fp in glob(compose_dir + '/*.mp3'):
    if '-br_' not in src_fp:
        for kbps in (32, 64, 128):
            low_bit(src_fp, kbps)
        break


In [3]:
# convert tone.js and midi

import json
import mido
import requests
import numpy as np

import ast

# The decoded JSON value is itself a string holding a dict literal (it is
# parsed a second time below), so the file is double-encoded.
with open('/workspace/output.json', 'r') as fh:
    raw_json = json.load(fh)

# literal_eval accepts only Python literals, so a tampered file cannot run
# code the way eval() would.  A file that already decodes to a dict is used as is.
params = raw_json if isinstance(raw_json, dict) else ast.literal_eval(raw_json)
print(params)



{'title': '001: #806805703861057941268031', 'key': 12, 'mode': 6, 'bpm': 78, 'energy': 0.491, 'valence': 0.495, 'chords': [1, 5, 4, 5, 1, 5, 4, 3, 4], 'melodies': [[1, 1, 1, 3, 3, 3, 3, 3], [3, 2, 2, 2, 2, 1, 1, 1], [1, 1, 1, 1, 1, 1, 3, 3], [5, 5, 5, 5, 5, 5, 5, 5], [5, 1, 5, 5, 5, 5, 5, 5], [5, 5, 2, 2, 2, 2, 1, 1], [1, 1, 1, 1, 1, 1, 1, 5], [5, 5, 5, 5, 5, 9, 10, 10], [10, 10, 10, 10, 10, 0, 0, 0]]}


In [19]:
from math import floor, pow
import re
from collections import namedtuple

NoteLiteral = namedtuple('NoteLiteral', ['note_name', 'pitch', 'named'])


# Indexed by step (0..6 = C D E F G A B, see the step formula in `parse`).
# Position of each natural note on the line of fifths, C being 0.
FIFTHS = [0, 2, 4, -1, 1, 3, 5]
# Whole octaves covered by that many stacked fifths: floor(fifths * 7 / 12).
# (The division must happen inside floor(), and map() takes the function first;
# a comprehension avoids both pitfalls and yields a real list of ints.)
STEPS_TO_OCTS = [floor(f * 7 / 12) for f in FIFTHS]
# Semitone distance of each natural note above C.
SEMI = [0, 2, 4, 5, 7, 9, 11]


def encode_pitch(step, alt, oct, dir=1):
    """Encode a pitch as (fifths, octaves) coordinates.

    The coordinates satisfy ``7 * fifths + 12 * octaves == SEMI[step] + alt + 12 * oct``.

    Args:
        step: index 0..6 into FIFTHS / STEPS_TO_OCTS.
        alt: alteration in semitones (negative for flats).
        oct: octave number, or None for a pitch class.
        dir: sign applied to every coordinate (1 or -1).

    Returns:
        A 1-tuple ``(fifths,)`` when ``oct`` is None, otherwise a 2-tuple
        ``(fifths, octaves)``.  Always a tuple, so callers can use ``len()``
        to tell the two cases apart, as `transpose` does.
    """
    f = FIFTHS[step] + 7 * alt
    if oct is None:
        return (dir * f,)

    # Each alteration adds 7 fifths = 49 semitones = 4 octaves + 1 semitone,
    # so 4 octaves per alteration must be taken back out.
    o = oct - STEPS_TO_OCTS[step] - 4 * alt
    return dir * f, dir * o


# Groups: letter, accidentals ('#', 'b' or 'x' runs), octave digits, trailing text.
# No surrounding '/' delimiters: those are JavaScript regex-literal syntax and
# would be matched as literal slashes here.
note_pat = re.compile(r"^([a-gA-G]?)(#{1,}|b{1,}|x{1,}|)(-?\d*)\s*(.*)$")


def tokenize_note(note):
    """Split a note name into ``(letter, accidentals, octave, rest)``.

    The letter is upper-cased and every 'x' (double sharp) is rewritten as
    '##'.  All four items are strings; parts that are absent are ''.  When
    the pattern does not match at all, four empty strings are returned.
    """
    m = note_pat.match(note)
    if m is None:
        return "", "", "", ""
    letter, acc, oct_str, rest = m.groups()
    return letter.upper(), acc.replace('x', "##"), oct_str, rest


# Result of `parse`.  Field order matches the plain tuple returned before, so
# positional access (e.g. index 8 for the name) keeps working, while attribute
# access (`.coord`, `.name`) is available to `transpose`.
_ParsedNote = namedtuple(
    'ParsedNote',
    ['acc', 'alt', 'chroma', 'coord', 'freq', 'height',
     'letter', 'midi', 'name', 'oct', 'pc', 'step'],
)


def parse(note_name):
    """Parse a note name such as 'C#4' or 'Bb' into its properties.

    Returns:
        None when the text has no note letter or has trailing text after the
        note; otherwise a named tuple with the fields
        ``acc, alt, chroma, coord, freq, height, letter, midi, name, oct, pc, step``.
        ``oct`` and ``freq`` are None for a pitch class (no octave given);
        ``midi`` is None when the height falls outside 0..127.
    """
    tokens = tokenize_note(note_name)

    if tokens[0] == '' or tokens[3] != '':
        return None

    letter, acc, oct_str = tokens[0], tokens[1], tokens[2]
    # Character code of the letter maps 'C'..'B' onto 0..6.
    step = (ord(letter) + 3) % 7

    # startswith() is safe on an empty accidental string, unlike acc[0].
    alt = -len(acc) if acc.startswith('b') else len(acc)
    if oct_str:
        try:
            oct = int(oct_str)
        except ValueError:
            # A lone '-' matches the octave group but is not a number.
            return None
    else:
        oct = None

    coord = encode_pitch(step, alt, oct)
    name = letter + acc + oct_str
    pc = letter + acc
    # Python's % already returns a non-negative result for a positive modulus.
    chroma = (SEMI[step] + alt) % 12
    if oct is None:
        # Pitch classes get a height far below any real octave.
        height = (SEMI[step] + alt) % 12 - 12 * 99
    else:
        height = SEMI[step] + alt + 12 * (oct + 1)

    midi = height if 0 <= height <= 127 else None
    freq = None if oct is None else pow(2, (height - 69) / 12) * 440

    return _ParsedNote(acc, alt, chroma, coord, freq, height, letter, midi, name, oct, pc, step)


def get_note(name: str):
    """Resolve a note from a name or from an object carrying a ``name``.

    Args:
        name: a note name string, or any object whose ``name`` attribute is
            a note name string.

    Returns:
        Whatever `parse` returns for the name, or None when the input is of
        neither kind.  None (not '') is the "no note" value because callers
        test the result with ``is None``.
    """
    if isinstance(name, str):
        return parse(name)

    # Named objects are resolved through their string name.
    inner_name = getattr(name, 'name', None)
    if isinstance(inner_name, str):
        return parse(inner_name)

    # TODO: inputs described only by pitch data (no name) are not supported yet.
    return None


def tokeniz_key(name):
    """Split a scale/key name such as 'C chromatic' into ``(tonic, type)``.

    Returns:
        A 2-tuple of strings:
        * ``(tonic_name, type)`` when the first word is a note; the type is
          the lower-cased remainder ('' when there is none);
        * ``(note_name, "")`` when the whole text is a note;
        * ``("", name)`` when no tonic can be found;
        * ``("", "")`` for non-string input.
    """
    if not isinstance(name, str):
        return "", ""

    head = name.split(' ')[0]
    # Work with the note's *name*; anything without one counts as "no note".
    tonic_name = getattr(get_note(head.lower()), 'name', None)
    if not tonic_name:
        whole_name = getattr(get_note(name), 'name', None)
        # Parenthesised so the conditional picks between two complete pairs.
        return (whole_name, "") if whole_name else ("", name)

    # Everything after the first word and its separating space is the type.
    return tonic_name, name[len(head) + 1:].lower()


def get_scale_type(name=None):
    """Look up a scale type by name — placeholder, not implemented yet.

    Accepts the type name that `get_scale` passes in (a zero-argument
    signature made that call fail with TypeError) and returns None, which
    `get_scale` treats as "unknown scale type".
    """
    # TODO: implement the scale-type lookup; the result needs `.name` and `.intervals`.
    return None


def get_interval(name=None):
    """Resolve an interval by name — placeholder, not implemented yet.

    Accepts the interval name that `transpose` passes in (a zero-argument
    signature made that call fail with TypeError) and returns None, which
    `transpose` treats as "no such interval".
    """
    # TODO: implement interval parsing; the result needs a `.coord` attribute.
    return None


def coord2note(coord=None):
    """Turn pitch coordinates back into a note — placeholder, not implemented yet.

    Accepts the coordinate list that `transpose` passes in (a zero-argument
    signature made that call fail with TypeError) and returns None.
    """
    # TODO: implement the inverse of `encode_pitch`; the result needs a `.name` attribute.
    return None


def transpose(note_name, interval_name):
    """Transpose a note by an interval and return the resulting note name.

    Returns '' when the note or the interval cannot be resolved, when either
    lacks coordinates, or when the transposed coordinates cannot be turned
    back into a note.
    """
    note = get_note(note_name)
    interval = get_interval(interval_name)

    # getattr covers both None and any other "no result" value without `.coord`.
    note_coord = getattr(note, 'coord', None)
    interval_coord = getattr(interval, 'coord', None)
    if note_coord is None or interval_coord is None:
        return ""

    if len(note_coord) == 1:
        # Pitch class: only the fifths coordinate exists.
        tr = [note_coord[0] + interval_coord[0]]
    else:
        if len(interval_coord) < 2:
            return ""
        tr = [note_coord[0] + interval_coord[0], note_coord[1] + interval_coord[1]]

    # coord2note may not produce a note; do not dereference a missing result.
    return getattr(coord2note(tr), 'name', None) or ""


    
# Result of `get_scale`; `notes` is what `get_tonic_by_key` reads.
_Scale = namedtuple('Scale', ['name', 'type', 'tonic', 'notes'])


def get_scale(scale_name: str):
    """Build the scale described by ``scale_name`` (e.g. 'C chromatic').

    Returns:
        None when the scale type is unknown; otherwise a named tuple with
        ``name``, ``type``, ``tonic`` ('' when the name has no tonic) and
        ``notes`` (the tonic transposed by each interval of the scale type,
        or [] without a tonic).
    """
    tokens = tokeniz_key(scale_name)
    tonic_note = get_note(tokens[0])
    # Index 8 of a parsed note is its name; a missing note gives no tonic.
    tonic = tonic_note[8] if tonic_note else ""
    st = get_scale_type(tokens[1])

    if st is None:
        return None

    tp = st.name
    notes = [transpose(tonic, interval) for interval in st.intervals] if tonic else []
    name = f"{tonic} {tp}" if tonic else tp
    return _Scale(name=name, type=tp, tonic=tonic, notes=notes)
    


def get_tonic_by_key(src: str):
    """Return the note of the C chromatic scale at 1-based position ``src``.

    Args:
        src: the key number (an int or a numeric string); position 1 is the
            first note of the scale.

    Returns:
        The note name, or '' when the chromatic scale is unavailable.
    """
    scale = get_scale('C chromatic')
    notes = list(scale.notes) if scale is not None else []
    if not notes:
        return ""
    # Use the argument: reading a module-level `key` made the result depend
    # on whichever cell last assigned it.
    return notes[int(src) - 1]
    


# Tempo: snap down to a multiple of 5, then keep it inside [70, 100].
bpm = min(max(params['bpm'] // 5 * 5, 70), 100)
# Tonic name is not derived yet; start from an empty string.
tonic = ""

key = params['key']

In [None]:
import requests
from bs4 import BeautifulSoup
import os
import time
import json
import re
import string

# Site root and the artist-index URL that each letter is appended to.
website = 'https://www.hooktheory.com'
base_url = f'{website}/theorytab/artists/'

# Pause, in seconds, inserted before requests by the crawl loops.
sleep_time = 0.11

# Letters of the artist index to crawl.
# Alternative kept from the original: alphabet_list = string.ascii_lowercase
alphabet_list = 'x'

# Where the artist archive (root_dir) and the per-song XML files (root_xml) go.
root_dir = '../datasets'
root_xml = '../datasets/xml'


def song_retrieval(artist, song, path_song):
    """Download the XML of every section of one song, plus its genre tags.

    Fetches the song page, collects the section anchors and the chord "pk"
    ids linked from it, saves one XML file per pk into ``path_song`` and
    finally writes ``song_info.json`` there.

    Args:
        artist: artist slug as used in the site URL.
        song: song slug as used in the site URL.
        path_song: existing directory that receives the files.
    """
    suffix = '/theorytab/view/' + artist + '/' + song
    song_url = website + suffix
    response_song = requests.get(song_url)

    soup = BeautifulSoup(response_song.text, 'html.parser')

    # Slugs are data, not regex syntax: escape them so characters such as
    # '(', '+' or '.' in a name cannot change or break the pattern.
    section_href = re.compile(re.escape(suffix) + '#')
    section_list = [item['href'].split('#')[-1] for item in soup.find_all('a', {'href': section_href})]
    pk_list = [item['href'].split('/')[-1] for item in soup.find_all('a', {'href': re.compile("/theorytab/chords/pk/")})]

    # save xml
    for idx, pk in enumerate(pk_list):
        req_url = website + '/songs/getXmlByPk?pk=' + str(pk)
        response_info = requests.get(req_url)
        content = response_info.text

        # Fall back to a pk-based file name when the page lists fewer section
        # anchors than pk links, instead of failing with IndexError.
        section = section_list[idx] if idx < len(section_list) else 'pk_' + str(pk)
        with open(os.path.join(path_song, section + ".xml"), "w", encoding="utf-8") as f:
            f.write(content)
        time.sleep(0.08)

    # get genre
    genre_nodes = soup.find_all("multiselect", {"items": "genres"})
    # A page without the genre widget still gets its song_info.json, with no genres.
    wikiid = genre_nodes[0]['wikiid'] if genre_nodes else None
    genres = []
    if wikiid is not None:
        response_genre = requests.get(website + '/wiki/' + str(wikiid) + '/genres')
        genre_act_list = json.loads(response_genre.text)
        genres = [g['name'] for g in genre_act_list if g['active']]

    # saving
    info = {'section': section_list, 'pk': pk_list, 'song_url': song_url,
            'genres': genres, 'wikiid': wikiid}

    with open(os.path.join(path_song, 'song_info.json'), "w") as f:
        json.dump(info, f)


def get_song_list(url_artist, quite=False):
    """Return the song slugs listed on an artist page.

    Args:
        url_artist: site-relative URL of the artist page (appended to ``website``).
        quite: when true, the song names are not printed while collecting.

    Returns:
        List with the last path segment of each song link, in page order.
    """
    page = requests.get(website + url_artist)
    parsed = BeautifulSoup(page.text, 'html.parser')

    collected = []
    for entry in parsed.find_all("li", {"class": re.compile("overlay-trigger")}):
        first_link = entry.find_all("a", {"class": "a-no-decoration"})[0]
        slug = first_link['href'].split('/')[-1]
        collected.append(slug)
        if quite:
            continue
        print('   > %s' % slug)
    return collected


def traverse_website():
    '''
    Retrieve all urls of artists and songs from the website

    Walks the paginated artist index of every letter in ``alphabet_list`` and
    then the song list of every artist found.

    Returns a dict mapping each letter to ``{artist_name: [song_name, ...]}``,
    plus the totals under the keys ``'num_song'`` and ``'num_artist'``.
    '''

    list_pages = []
    archive_artist = dict()
    artist_count = 0
    song_count = 0

    for ch in alphabet_list:
        # No up-front request for the bare letter URL: its response was never
        # used, the paginated loop below re-fetches everything it needs.
        page_count = 0

        print('==[%c]=================================================' % ch)

        # get artists list by pages
        url_artist_list = []
        for page in range(1, 9999):
            url = base_url + ch + '?page=' + str(page)
            print(url)
            time.sleep(sleep_time)
            response_tmp = requests.get(url)
            soup = BeautifulSoup(response_tmp.text, 'html.parser')
            item_list = soup.find_all("li", {"class": re.compile("overlay-trigger")})

            # The first page without entries marks the end of the index.
            if not item_list:
                break
            page_count += 1

            for item in item_list:
                url_artist_list.append(item.find_all("a", {"class": "a-no-decoration"})[0]['href'])

        print('Total:', len(url_artist_list))

        print('----')

        # A letter with no artists is still reported as one page.
        if not page_count:
            page_count = 1

        # get song of artists
        artist_song_dict = dict()

        for url_artist in url_artist_list:
            artist_count += 1
            time.sleep(sleep_time)
            artist_name = url_artist.split('/')[-1]
            print(artist_name)
            song_name_list = get_song_list(url_artist)
            song_count += len(song_name_list)
            artist_song_dict[artist_name] = song_name_list

        archive_artist[ch] = artist_song_dict
        list_pages.append(page_count)

    print('=======================================================')
    print(list_pages)
    print('Artists:', artist_count)
    print('Songs:', song_count)

    archive_artist['num_song'] = song_count
    archive_artist['num_artist'] = artist_count

    return archive_artist


if __name__ == '__main__':

    # Step 1: collect the artist -> songs archive.
    archive_artist = traverse_website()

    # exist_ok=True creates the directory only when missing, without the
    # window between a separate existence check and the creation.
    os.makedirs(root_dir, exist_ok=True)
    os.makedirs(root_xml, exist_ok=True)

    path_artists = os.path.join(root_dir, 'archive_artist.json')
    with open(path_artists, "w") as f:
        json.dump(archive_artist, f)

    # Continue from the copy that was just written to disk.
    with open(path_artists, "r") as f:
        archive_artist = json.load(f)

    # Step 2: download every song of every artist.
    count_ok = 0
    song_count = archive_artist['num_song']

    for ch in alphabet_list:
        path_ch = os.path.join(root_xml, ch)
        print('==[%c]=================================================' % ch)

        os.makedirs(path_ch, exist_ok=True)

        for a_name, song_names in archive_artist[ch].items():
            for s_name in song_names:

                try:
                    print('(%3d/%3d) %s   %s' % (count_ok, song_count, a_name, s_name))
                    path_song = os.path.join(path_ch, a_name, s_name)

                    os.makedirs(path_song, exist_ok=True)

                    time.sleep(sleep_time)
                    song_retrieval(a_name, s_name, path_song)

                    count_ok += 1

                except Exception as e:
                    # Best effort: report the failure and go on with the next song.
                    print(e)

    print('total:', count_ok)

In [8]:
import requests
import time
import re

GENRES_BASE_URL = 'https://www.hooktheory.com/theorytab/genres'
genres = "Alt-Country, Alternative, Blues, Childrens, Classical, Country, Dance, Disney, Electronic, Experimental, Folk, Hip-Hop-Rap, Holiday, House, Indie, J-Pop, Jazz, K-pop, Latin, Metal, Pop, Punk, R-and-B, Reggae, Rock, Singer-Songwriter, Soul, Soundtrack, Techno, Video-Game, Vocal, World, Worship".lower().strip().replace(' ','').split(',')
picked_set = {'blues', 'jazz', 'r-and-b', 'rock', 'hip-hop-rap', 'punk', 'metal'}
# The per-iteration `assert p in genres` could never fail because `p` is drawn
# from `genres`.  Presumably the intent was to validate the picks — TODO confirm.
assert picked_set <= set(genres), picked_set - set(genres)

# Markup being matched:
# <p class="song">Ghost Of Days Gone By</p><p class="artist">by ...</p>
# Each match is a (song, artist) pair.
pat = re.compile(r'<p class="song">(?P<songs>.*?)</p><p class="artist">by (?P<artist>.*?)</p>')

genre2song_and_artist = {}

for p in genres:
    url = f"{GENRES_BASE_URL}/{p}"
    resp = requests.get(url)
    assert resp.status_code == 200, (p, url)
    html = resp.text

    # The first page is already in hand: harvest it now instead of
    # downloading it a second time later.
    song_and_artist = pat.findall(html)

    # Further pages exist as long as the first page links to "?page=N".
    extra_urls = []
    page_cnt = 1
    while True:
        page_cnt += 1
        page_url = f"{url}?page={page_cnt}"
        if page_url.split('/', 3)[-1] in html:
            extra_urls.append(page_url)
        else:
            print(f"{page_url} not found")
            break

    print(f"{1 + len(extra_urls)} pages found for {p}")

    for extra_url in extra_urls:
        resp = requests.get(extra_url)
        # Report the page that actually failed, not the genre's base URL.
        assert resp.status_code == 200, (p, extra_url)
        song_and_artist.extend(pat.findall(resp.text))
    genre2song_and_artist[p] = song_and_artist
    print(f' getting {len(song_and_artist)} for {p}')
    time.sleep(0.5)

https://www.hooktheory.com/theorytab/genres/alt-country?page=2 not found
1 pages found for alt-country
 getting 78 for alt-country
https://www.hooktheory.com/theorytab/genres/alternative?page=7 not found
6 pages found for alternative
 getting 600 for alternative
https://www.hooktheory.com/theorytab/genres/blues?page=3 not found
2 pages found for blues
 getting 163 for blues
https://www.hooktheory.com/theorytab/genres/childrens?page=3 not found
2 pages found for childrens
 getting 126 for childrens
https://www.hooktheory.com/theorytab/genres/classical?page=5 not found
4 pages found for classical
 getting 337 for classical
https://www.hooktheory.com/theorytab/genres/country?page=4 not found
3 pages found for country
 getting 213 for country
https://www.hooktheory.com/theorytab/genres/dance?page=7 not found
6 pages found for dance
 getting 600 for dance
https://www.hooktheory.com/theorytab/genres/disney?page=3 not found
2 pages found for disney
 getting 119 for disney
https://www.hooktheo

In [10]:
import json
# Total number of (song, artist) entries over all genres.
cnt = sum(len(v) for v in genre2song_and_artist.values())
print(cnt)

# The with-block closes (and thereby flushes) the file; a bare open() passed
# to json.dump leaves the handle open.
with open('/workspace/model/dataset/genre2song.json', 'w') as fh:
    json.dump(genre2song_and_artist, fh, indent=4)

10836


In [33]:
from glob import glob
# Processed dataset files; the song key is the part of the file stem after
# the first '-', lower-cased and stripped of spaces.
dfiles = list(glob("/workspace/model/dataset/processed/*.json"))
print(len(dfiles))

dsongs = {
    path.split('/')[-1].split('.')[0].split('-')[1].lower().replace(' ', '')
    for path in dfiles
}
print(len(dsongs))

# Per genre: how many crawled songs (first item of each entry, normalised the
# same way) also occur in the processed dataset.
for gnr, art_and_song in genre2song_and_artist.items():
    cnt = sum(1 for s in art_and_song if s[0].lower().replace(' ', '') in dsongs)
    print(f"{gnr}: {cnt}/{len(art_and_song)}")


17551
9924
alt-country: 58/78
alternative: 264/600
blues: 105/163
childrens: 58/126
classical: 92/337
country: 111/213
dance: 345/600
disney: 56/119
electronic: 340/600
experimental: 143/391
folk: 152/315
hip-hop-rap: 300/569
holiday: 37/58
house: 266/409
indie: 231/600
j-pop: 176/475
jazz: 176/405
k-pop: 120/191
latin: 55/292
metal: 82/141
pop: 300/600
punk: 73/145
r-and-b: 289/544
reggae: 49/80
rock: 334/600
singer-songwriter: 163/333
soul: 152/280
soundtrack: 202/600
techno: 52/107
video-game: 145/600
vocal: 47/97
world: 31/101
worship: 25/67
