# Parse Results

In this notebook, we will parse the raw data we saved in the last notebook. The output will be a dictionary (saved as a `.json` file) that contains info about song structure and chord progression.

In [11]:
import requests
from bs4 import BeautifulSoup
import re
from itertools import chain
import json
from collections import Counter
import os
import random
import music_functions as mf

In [12]:
input_dir = '/Volumes/SECONDDRIVE/prog/ug/raw_data/'
output_dir = '/Volumes/SECONDDRIVE/prog/ug/chord_dicts/'

In [13]:
decade_dirs = [f.path for f in os.scandir(input_dir) if f.is_dir() and f.path.split("/")[-1].isnumeric()]

In [14]:
if not os.path.exists(output_dir ):
    os.mkdir(output_dir )

# iterate over each decade
for decade_dir in decade_dirs:
    
    decade = decade_dir.split("/")[-1]
    if not os.path.exists(output_dir + decade):
        os.mkdir(output_dir + decade)
        
    filelist = [input_dir+ decade + "/" + x for x in os.listdir(input_dir+ decade + "/")]

    for fname in filelist:    

        # check to see if we've already parsed this data before
        outname = output_dir + decade + "/" + fname.split(".")[0].split("/")[-1] + ".json"
        if os.path.isfile(outname):
            continue

        with open(fname, 'r') as file:
            print(fname)
            data = file.read().replace('\n', '')

        # convert to beautiful soup object
        soup = BeautifulSoup(data, "html.parser")

        # _3naMH is the class for the "header" data - key, tuning, capo placement
        results = soup.find_all("div", class_="_3naMH")    

        # Identify the key signature and capo
        key_search = [x.get_text().split()[1] for x in results if "Key" in x.get_text()]

        key_signature = 0
#         if (len(key_search) > 0):
#             key_signature = key_search[0]

        # Identify the guitar tuning (which can impact effect of the capo)
        tuning_search = [ ' '.join(x.get_text().split()[1:]) for x in results if "Tuning" in x.get_text()]    
        tuning_offset = 0

        # there shouldn't be a scenario where tuning search has more than one element
        # either there's 1 (the tuning) or 0 (no tuning specified - assume standard)
        if (len(tuning_search) == 1):
            tuning = tuning_search[0]

            # ignore standard tuning EADGBE and drop D DADGBE because the offset will still be zero
            if tuning == 'E A D G B E' or tuning == 'D A D G B E':
                tuning_offset = 0
            # half-step down tuning
            elif tuning == 'Eb Ab Db Gb Bb Eb' or tuning == 'D# G# C# F# A# D#':
                tuning_offset = 1
            elif tuning == 'C G C F A D' or tuning == 'D G C F A D': # d or drop c tuning
                tuning_offset = 2
            else: # other tunings not currently supported... could return later
                print("This was a weird tuning")
                print(tuning)
                continue

        capo_search = [x.get_text().split()[1] for x in results if "Capo" in x.get_text()]
        capo = 0
        if (len(capo_search) > 0):
            # regex is used to remove any letters (so changing "3rd" to just "3")
            x = re.sub("[^0-9]", "", capo_search[0])
            if len(x) > 0:
                capo = int(x)


        # a capo transposes up, and the tuning generally transposes down
        transpose_offset = capo - tuning_offset

        # identify the locations of different sections
        try:
            song_spans = soup.find_all("pre", class_="_3F2CP _3hukP")[0].find_all("span")

                # identify where the section headers are
            section_header_match = [bool(re.match("<span class=\"_3rlxz\">\[.*\]</span>", str(song_spans[x]))) for x in range(len(song_spans))]
            section_header_idx = [i for i, val in enumerate(section_header_match) if val]
            section_headers = [song_spans[x].get_text() for x in section_header_idx]
            section_header_idx.append(len(song_spans))

            # get chord progression for each section
            chord_progression = {}

            for init_index in range(len(section_header_idx)-1):
                section = song_spans[section_header_idx[init_index]: section_header_idx[init_index+1]]
                section_chords = [x.get_text().strip().split() for x in section if str(x).startswith('<span class="_3rlxz">') and '_3PpPJ OrSDI' in str(x)]

                # removing the equivalencies that some people insert re: capo 
                section_chords = [x for x in section_chords if '=' not in x]

                # un-nest list
                section_chords = [mf.clean_chord(x) for x in list(chain.from_iterable(section_chords))]

                # remove any special characters (this might remove some info but that's ok)
                special_characters = ['*', '~']
                section_chords = [''.join([e for e in x if e not in special_characters]) for x in section_chords]

                # remove anything else that doesn't match
                section_chords = [x for x in section_chords if re.match(r"^[ABCDEFG][#b]*[ABCDEFG#bm]*$", x)]

                chord_progression[init_index] = { 'type' : section_headers[init_index], 'chords' : section_chords}

            # if the chord_progression dictionary is empty, then skip this step
            # this might happen because the song doesn't have labels for different parts of the song (intro, verse, chorus, etc.)
            if not chord_progression:
                print("Chord progression dictionary is empty")
                continue

            #TODO so in all cases it's not technically the most occurring (ex: fluorescent adolescent)
            # this could be a good enough approximation tho i guess lol
            if key_signature == 0:
                key_signature = mf.apply_capo(mf.infer_tonic(mf.get_all_chords(chord_progression)), transpose_offset)
            
            chord_progression["Tonic"] = key_signature
            
            # convert to numerics for easier comparison
            for section in chord_progression:
                if section is not "Tonic":
                    chord_progression[section]['chords'] = [mf.apply_capo(x,transpose_offset) for x in chord_progression[section]['chords']]
                    chord_progression[section]['chords_numeric'] = [mf.get_relation(key_signature, x) for x in chord_progression[section]['chords']]
            
            with open(outname, 'w') as outfile:
                json.dump(chord_progression, outfile)
        except:
            print("Error")
            continue

            

  if section is not "Tonic":


/Volumes/SECONDDRIVE/prog/ug/raw_data/1970/john-lennon_happy-xmas-war-is-over.txt
/Volumes/SECONDDRIVE/prog/ug/raw_data/1970/10cc_im-not-in-love.txt
Error
/Volumes/SECONDDRIVE/prog/ug/raw_data/1970/acdc_highway-to-hell.txt
/Volumes/SECONDDRIVE/prog/ug/raw_data/1970/acdc_tnt.txt
/Volumes/SECONDDRIVE/prog/ug/raw_data/1970/aerosmith_dream-on.txt
/Volumes/SECONDDRIVE/prog/ug/raw_data/1970/alice-cooper_poison.txt
/Volumes/SECONDDRIVE/prog/ug/raw_data/1970/america_a-horse-with-no-name.txt
/Volumes/SECONDDRIVE/prog/ug/raw_data/1970/america_sister-golden-hair.txt
/Volumes/SECONDDRIVE/prog/ug/raw_data/1970/america_ventura-highway.txt
/Volumes/SECONDDRIVE/prog/ug/raw_data/1970/bee-gees_how-deep-is-your-love.txt
/Volumes/SECONDDRIVE/prog/ug/raw_data/1970/bee-gees_more-than-a-woman.txt
/Volumes/SECONDDRIVE/prog/ug/raw_data/1970/bette-midler_the-rose.txt
/Volumes/SECONDDRIVE/prog/ug/raw_data/1970/billy-joel_just-the-way-you-are.txt
/Volumes/SECONDDRIVE/prog/ug/raw_data/1970/billy-joel_new-york-stat