In [4]:
from bs4 import BeautifulSoup
import urllib
import re
import json
from pyRealParser import Tune
import pandas as pd
from collections import Counter
from pychord import Chord


def get_song_urls(web_links):
    songs_urls = []
    for link in web_links:
        fp = urllib.request.urlopen(link)
        mystr = fp.read().decode("latin-1")
        soup = BeautifulSoup(mystr)
        for link in soup.findAll("a", attrs={"href": re.compile("^irealb://")}):
            songs_urls.append(link.get("href"))
    return songs_urls


def string_to_chords(string):
    chords = [
        {
            "Root": chord.split(",")[0][2:-1],
            "Ext": chord.split(",")[1][2:-1],
            "Bass": chord.split(",")[2][2:-2],
        }
        for chord in string.split(";")
    ]
    return chords


def chords_to_string(chords):
    string = ";".join(json.dumps(list(d.values())) for d in chords)
    return string


def encode_chords(chords):
    chords = [[note % 12 for note in chord] for chord in chords]
    encoded_chords = []
    for chord in chords:
        encoded_chord = [0] * 12
        for i in chord:
            encoded_chord[i] = 1
        encoded_chords.append(encoded_chord)
    return encoded_chords


def get_components(chords):
    notes_numbers = []
    for c in chords:
        notes_numbers.append(Chord(c).components(visible=False))
    return notes_numbers


def extract_chords_from_tune(my_tune):
    chords = []
    for measure in my_tune.measures_as_strings:
        elements = re.findall(
            r"([A-G][#b]?)(11|7b13sus|13sus|9sus|7susadd3|7b9sus|13#9|13b9|13#11|13|7alt|7b9b13|7b9#9|7b9#5|7b9b5|7b9#11|7#9#11|7#9b5|7#9#5|7b13|9#5|9b5|9#11|7#5|7b5|7#11|7#9|7b9|9|-#5|-b6|h9|-7b5|-11|-9|-\^9|-\^7|-69|-6|\^7#5|\^9#11|\^7#11|69|6|\^13|\^9|o7|h7|7sus|7|-7|\^7|-|\^|sus|h|o|\+|add9|2|5?)?(/[A-G][#b]?)?",
            measure,
        )
        for (root, extension, bass) in elements:
            #         chords.append({'Root': root, 'Ext': extension, 'Bass': bass[1:]})
            chords.append(root + extension + bass)
    return chords


def extract_meta_data(songs_urls):
    songs_meta = []
    for song_url in songs_urls:
        if len(Tune.parse_ireal_url(song_url)) > 0:
            my_tune = Tune.parse_ireal_url(song_url)[0]
            chords = extract_chords_from_tune(my_tune)
            notes_numbers = get_components(chords)
            encoded_chords = encode_chords(notes_numbers)
            # We are assuming that each note is interval from C
            song_meta = {
                "title": my_tune.title,
                "composer": my_tune.composer,
                "style": my_tune.style,
                "key": my_tune.key,
                "transpose": my_tune.transpose,
                "comp_style": my_tune.comp_style,
                "bpm": my_tune.bpm,
                "repeats": my_tune.repeats,
                "time_signature": my_tune.time_signature,
                "encoded_chords": encoded_chords,
            }
            songs_meta.append(song_meta)
    return pd.DataFrame(songs_meta)

In [5]:
web_links = [
    "https://www.irealb.com/forums/showthread.php?22620-Suggestions-for-additions-or-changes-to-the-Main-Jazz-Playlist",
    "https://www.irealb.com/forums/showthread.php?4522-Jazz-1350-Standards-Individual-Songs",
    "https://www.irealb.com/forums/showthread.php?10591-Dixieland-Trad-Playlists",
    "https://www.irealb.com/forums/showthread.php?210-Fusion-and-Smooth-Jazz",
    "https://www.irealb.com/forums/showthread.php?204-Contemporary-Jazz",
    "https://www.irealb.com/forums/showthread.php?209-Pat-Metheny-songs",
    "https://www.irealb.com/forums/showthread.php?215-Gypsy-Jazz",
]
songs_urls = get_song_urls(web_links)

In [25]:
df = extract_meta_data(songs_urls)

Parsed Trane's Blues (Vierd Blues)
Parsed Trane's Blues (Vierd Blues)
Parsed Fall
Parsed Fall
Parsed Blue Train
Parsed Blue Train
Parsed Cousin Mary
Parsed Cousin Mary
Parsed Beauty And The Beast (Wayne Shorter)
Parsed Beauty And The Beast (Wayne Shorter)
Parsed Footprints
Parsed Footprints
Parsed I Love You For Sentimental Reasons
Parsed I Love You For Sentimental Reasons
Parsed Georgia On My Mind (Ray Charles version)
Parsed Georgia On My Mind (Ray Charles version)
Parsed Goodbye Pork Pie Hat
Parsed Goodbye Pork Pie Hat
Parsed 26-2
Parsed 26-2
Parsed 500 Miles High
Parsed 500 Miles High
Parsed 502 Blues
Parsed 502 Blues
Parsed 52nd Street Theme
Parsed 52nd Street Theme
Parsed 9.20 Special
Parsed 9.20 Special
Parsed A Ballad
Parsed A Ballad
Parsed A Beautiful Friendship
Parsed A Beautiful Friendship
Parsed A Blossom Fell
Parsed A Blossom Fell
Parsed A Certain Smile
Parsed A Certain Smile
Parsed A Child Is Born
Parsed A Child Is Born
Parsed A Felicidade
Parsed A Felicidade
Parsed A Fin

In [26]:
print("repetitions: ", len([i for i in Counter(df["title"]).most_common() if i[1]>1]))
print("total: ", df.shape[0])
print("unique: ", len(Counter(df["title"]).most_common()))
df.drop_duplicates(subset="title", inplace=True)
print("repetitions: ", len([i for i in Counter(df["title"]).most_common() if i[1]>1]))
print("total: ", df.shape[0])
print("unique: ", len(Counter(df["title"]).most_common()))

repetitions:  187
total:  2332
unique:  2137
repetitions:  0
total:  2137
unique:  2137


In [28]:
df.to_csv("songs_and_chords.csv", index=False, header=True)

In [29]:
df.head()

Unnamed: 0,title,composer,style,key,transpose,comp_style,bpm,repeats,time_signature,encoded_chords
0,Trane's Blues (Vierd Blues),Davis Miles,Medium Swing,Bb,,150,0,,"(4, 4)","[[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0], [0, 1, ..."
1,Fall,Shorter Wayne,Ballad,E-,,Jazz-Ballad Swing,82,3.0,"(4, 4)","[[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1], [1, 0, ..."
2,Blue Train,Coltrane John,Medium Up Swing,Eb,,Jazz-Medium Up Swing,154,8.0,"(4, 4)","[[0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0], [0, 1, ..."
3,Cousin Mary,Coltrane John,Up Tempo Swing,Ab,,Jazz-Up Tempo Swing,218,3.0,"(4, 4)","[[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0], [0, 1, ..."
4,Beauty And The Beast (Wayne Shorter),Shorter Wayne,Funk Jazz,F,,Jazz-Even 8ths,140,1.0,"(4, 4)","[[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0], [0, 0, ..."
