In [11]:
import ast
import json
import re
import urllib
import urllib.request
from collections import Counter

import pandas as pd
from bs4 import BeautifulSoup
from pychord import Chord
from pyRealParser import Tune

def get_song_urls(web_links):
    """Collect every ``irealb://`` link found on the given web pages.

    Args:
        web_links: Iterable of page URLs to download and scan.

    Returns:
        list[str]: The ``href`` value of each anchor whose target starts
        with ``irealb://``, in page order and then document order.
        Duplicates are kept.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for an unreachable page;
        network errors are not swallowed.
    """
    ireal_href = re.compile("^irealb://")
    songs_urls = []
    for page_url in web_links:
        # Close the HTTP response deterministically instead of leaking it.
        with urllib.request.urlopen(page_url) as response:
            html = response.read().decode("latin-1")
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html, "html.parser")
        # A distinct loop name avoids shadowing the page URL being processed.
        for anchor in soup.find_all('a', attrs={'href': ireal_href}):
            songs_urls.append(anchor.get('href'))
    return songs_urls

def extract_chords_from_measure(measure):
    """Return the chord symbols found in one measure string.

    A chord is a root (``A``-``G`` with an optional ``#``/``b``), an optional
    extension taken from the alternatives in the pattern below, and an
    optional ``/``-prefixed bass note. Each match is returned as the three
    parts concatenated back into a single string, in order of appearance.
    """
    chord_pattern = r"([A-G][#b]?)(11|7b13sus|13sus|9sus|7susadd3|7b9sus|13#9|13b9|13#11|13|7alt|7b9b13|7b9#9|7b9#5|7b9b5|7b9#11|7#9#11|7#9b5|7#9#5|7b13|9#5|9b5|9#11|7#5|7b5|7#11|7#9|7b9|9|-#5|-b6|h9|-7b5|-11|-9|-\^9|-\^7|-69|-6|\^7#5|\^9#11|\^7#11|69|6|\^13|\^9|o7|h7|7sus|7|-7|\^7|-|\^|sus|h|o|\+|add9|2|5?)?(/[A-G][#b]?)?"
    return [
        root + extension + bass
        for root, extension, bass in re.findall(chord_pattern, measure)
    ]


def string_to_chords(string):
    """Decode a ``;``-separated chord string into a list of chord dicts.

    This is the inverse of ``chords_to_string``: every ``;``-separated record
    is expected to hold three values (root, extension, bass).

    Args:
        string: Encoded chords, e.g. ``'["C", "7", ""];["D", "-7", "G"]'``.
            An empty string decodes to an empty list.

    Returns:
        list[dict]: One ``{'Root': ..., 'Ext': ..., 'Bass': ...}`` dict per
        record, in input order.
    """
    if not string:
        # ';'.join of zero chords is '', so '' must round-trip to [].
        return []

    chords = []
    for record in string.split(";"):
        try:
            # Real JSON decoding copes with compact separators and escapes,
            # which fixed-offset slicing does not.
            root, ext, bass = json.loads(record)
        except (ValueError, TypeError):
            # Not a JSON array of three items: fall back to the original
            # fixed-offset slicing so previously accepted inputs still work.
            fields = record.split(',')
            root = fields[0][2:-1]
            ext = fields[1][2:-1]
            bass = fields[2][2:-2]
        chords.append({'Root': root, 'Ext': ext, 'Bass': bass})
    return chords


def chords_to_string(chords):
    """Encode chord dicts as a single ``;``-separated string.

    The values of each dict, in the dict's own order, are dumped as a JSON
    array; the arrays are then joined with ``;``.
    """
    encoded = []
    for chord in chords:
        encoded.append(json.dumps(list(chord.values())))
    return ';'.join(encoded)


def extract_meta_data(songs_urls):
    """Build a table of song metadata and chord components from iReal URLs.

    Each URL is parsed with ``Tune.parse_ireal_url``. URLs that yield no tune
    are skipped, and only the first tune of each URL is used.

    Args:
        songs_urls: Iterable of ``irealb://`` URL strings.

    Returns:
        pandas.DataFrame: One row per parsed URL with the columns ``title``,
        ``composer``, ``style``, ``key``, ``transpose``, ``comp_style``,
        ``bpm``, ``repeats``, ``time_signature`` and ``notes_numbers``.
        ``notes_numbers`` holds, for every chord extracted from the tune's
        measures, the result of ``Chord(chord).components(visible=False)``.
    """
    songs_meta = []
    for song_url in songs_urls:
        # Parse once and reuse the result: parsing the same URL a second time
        # doubles the work and repeats anything the parser prints.
        tunes = Tune.parse_ireal_url(song_url)
        if not tunes:
            continue
        my_tune = tunes[0]

        chords = []
        for measure in my_tune.measures_as_strings:
            chords.extend(extract_chords_from_measure(measure))

        # NOTE(review): the numeric components are presumably semitone
        # offsets counted from C -- confirm against pychord.
        notes_numbers = [Chord(c).components(visible=False) for c in chords]

        # Key order defines the column order of the resulting DataFrame.
        songs_meta.append({
            "title": my_tune.title,
            "composer": my_tune.composer,
            "style": my_tune.style,
            "key": my_tune.key,
            "transpose": my_tune.transpose,
            "comp_style": my_tune.comp_style,
            "bpm": my_tune.bpm,
            "repeats": my_tune.repeats,
            "time_signature": my_tune.time_signature,
            "notes_numbers": notes_numbers,
        })
    return pd.DataFrame(songs_meta)

In [3]:
# Forum thread pages that are scanned for irealb:// song links.
web_links = [
    "https://www.irealb.com/forums/showthread.php?22620-Suggestions-for-additions-or-changes-to-the-Main-Jazz-Playlist",
    "https://www.irealb.com/forums/showthread.php?4522-Jazz-1350-Standards-Individual-Songs",
    "https://www.irealb.com/forums/showthread.php?10591-Dixieland-Trad-Playlists",
    "https://www.irealb.com/forums/showthread.php?210-Fusion-and-Smooth-Jazz",
    "https://www.irealb.com/forums/showthread.php?204-Contemporary-Jazz",
    "https://www.irealb.com/forums/showthread.php?209-Pat-Metheny-songs",
    "https://www.irealb.com/forums/showthread.php?215-Gypsy-Jazz"
]
# Download each page and gather the irealb:// hrefs it contains (network access required).
songs_urls = get_song_urls(web_links)

In [10]:
# Parse the collected URLs into one table of song metadata and chord components.
df = extract_meta_data(songs_urls)

Parsed Trane's Blues (Vierd Blues)
Parsed Trane's Blues (Vierd Blues)


AttributeError: 'dict' object has no attribute 'at'

In [12]:
def _print_title_stats(frame):
    """Print the number of repeated titles, the row count and the distinct-title count."""
    title_counts = Counter(frame["title"]).most_common()
    repeated = [entry for entry in title_counts if entry[1] > 1]
    print("repetitions: ", len(repeated))
    print("total: ", frame.shape[0])
    print("unique: ", len(title_counts))


# Report the same statistics before and after dropping rows with a repeated title.
_print_title_stats(df)
df.drop_duplicates(subset="title", inplace=True)
_print_title_stats(df)

repetitions:  0
total:  2137
unique:  2137
repetitions:  0
total:  2137
unique:  2137


In [13]:
# Preview the first rows of the song table.
df.head()

Unnamed: 0,title,composer,style,key,transpose,comp_style,bpm,repeats,time_signature,notes_numbers
0,Trane's Blues (Vierd Blues),Davis Miles,Medium Swing,Bb,,150,0,,"(4, 4)","[[10, 14, 17, 20], [3, 7, 10, 13], [10, 14, 17..."
1,Fall,Shorter Wayne,Ballad,E-,,Jazz-Ballad Swing,82,3.0,"(4, 4)","[[1, 6, 11, 13, 16, 22], [11, 15, 18, 21, 24, ..."
2,Blue Train,Coltrane John,Medium Up Swing,Eb,,Jazz-Medium Up Swing,154,8.0,"(4, 4)","[[3, 7, 10, 13, 18], [3, 7, 10, 13, 18], [3, 7..."
3,Cousin Mary,Coltrane John,Up Tempo Swing,Ab,,Jazz-Up Tempo Swing,218,3.0,"(4, 4)","[[8, 12, 15, 18], [3, 7, 10, 13, 18], [8, 12, ..."
4,Beauty And The Beast (Wayne Shorter),Shorter Wayne,Funk Jazz,F,,Jazz-Even 8ths,140,1.0,"(4, 4)","[[5, 10, 14, 17], [5, 10, 14, 17], [5, 10, 14,..."


In [14]:
# Save the table as CSV; index=False leaves the row index out of the file.
df.to_csv("songs_and_chords.csv", index=False)

In [25]:
# Round-trip check: reload the saved CSV and turn the first row's
# notes_numbers cell back into nested Python lists with literal_eval.
ast.literal_eval(pd.read_csv("songs_and_chords.csv")["notes_numbers"][0])

[[10, 14, 17, 20],
 [3, 7, 10, 13],
 [10, 14, 17, 20],
 [10, 14, 17, 20],
 [3, 7, 10, 13],
 [3, 7, 10, 13],
 [10, 14, 17, 20],
 [2, 5, 9, 12],
 [7, 11, 14, 17],
 [0, 3, 7, 10],
 [5, 9, 12, 15],
 [10, 14, 17, 20],
 [7, 11, 14, 17],
 [0, 3, 7, 10],
 [5, 9, 12, 15]]