# Reading and parsing results

In [50]:
import requests
from bs4 import BeautifulSoup
import re
from itertools import chain
import json
from collections import Counter
import os
import random

In [51]:
# Directory holding the saved tab pages to analyse, and the directory the
# per-song JSON results are written to.
input_dir = "/Volumes/SECONDDRIVE/prog/ug/1970/4/"
output_dir = "/Volumes/SECONDDRIVE/prog/ug/chord_dicts/1970/"

# os.listdir returns every directory entry in arbitrary order; keep only the
# saved tab pages (*.txt) so stray non-tab files are never picked up, and sort
# so the listing is the same from run to run.
filelist = sorted(name for name in os.listdir(input_dir) if name.endswith(".txt"))

In [52]:
# Pick up to ten distinct tab files for this run.
# random.sample draws without replacement (random.choices could return the
# same file several times), and capping k at the list length keeps it from
# raising when fewer than ten files are available.
files = [input_dir + x for x in random.sample(filelist, k=min(10, len(filelist)))]
print(files)

['/Volumes/SECONDDRIVE/prog/ug/1970/4/acdc_tnt.txt', '/Volumes/SECONDDRIVE/prog/ug/1970/4/elton-john_rocket-man.txt', '/Volumes/SECONDDRIVE/prog/ug/1970/4/aerosmith_dream-on.txt', '/Volumes/SECONDDRIVE/prog/ug/1970/4/elton-john_goodbye-yellow-brick-road.txt', '/Volumes/SECONDDRIVE/prog/ug/1970/4/queen_we-are-the-champions.txt', '/Volumes/SECONDDRIVE/prog/ug/1970/4/neil-diamond_sweet-caroline.txt', '/Volumes/SECONDDRIVE/prog/ug/1970/4/crosby-stills-nash-young_our-house.txt', '/Volumes/SECONDDRIVE/prog/ug/1970/4/the-rolling-stones_angie.txt', '/Volumes/SECONDDRIVE/prog/ug/1970/4/david-bowie_rock-n-roll-suicide.txt', '/Volumes/SECONDDRIVE/prog/ug/1970/4/carole-king_will-you-love-me-tomorrow.txt']


## Functions

In [53]:
# get only the letter and if it's major/minor
def clean_chord(chord):
    """Reduce a chord symbol to its root plus whatever quality text remains.

    Everything after a slash (the bass note) is dropped, everything from the
    first "add" or "sus" marker onwards is dropped, and all digits are
    removed. The markers are matched case-insensitively, so "CAdd9" and
    "DSUS4" are treated like "Cadd9" and "Dsus4".

    Args:
        chord: chord symbol as written in the tab, e.g. "F#m7/C#".

    Returns:
        The simplified chord string, e.g. "F#m".
    """
    # remove bass note notation
    chord = chord.split("/")[0]

    # remove any adds / sus, whatever their capitalisation
    chord = re.split(r"add|sus", chord, maxsplit=1, flags=re.IGNORECASE)[0]

    # remove numbers
    return ''.join(ch for ch in chord if not ch.isdigit())
    
def _leading_segment(text):
    """Return *text* up to (not including) the first "/", "add" or "sus"."""
    for marker in ("/", "add", "sus"):
        text = text.split(marker)[0]
    return text


# later on i might want this info
def clean_chord_complex(chord):
    """Split a chord symbol into its components.

    Args:
        chord: chord symbol as written in the tab, e.g. "Cadd9/G".

    Returns:
        A dict that always has 'base chord' (the text before any "/", "add"
        or "sus", with digits removed) and, when the matching marker occurs
        in *chord*, 'bass note', 'add' and 'sus'. Each of those values is
        the text that follows its marker, cut off at the next marker so the
        fields do not bleed into each other ("Cadd9/G" gives add "9" and
        bass note "G", not add "9/G").
    """
    chord_dict = {}

    chord_dict['base chord'] = ''.join(
        ch for ch in _leading_segment(chord) if not ch.isdigit()
    )

    if "/" in chord:
        chord_dict['bass note'] = _leading_segment(chord.split("/")[1])

    # keep what was added
    if "add" in chord:
        chord_dict['add'] = _leading_segment(chord.split("add")[1])

    # keep which suspension it is
    if "sus" in chord:
        chord_dict['sus'] = _leading_segment(chord.split("sus")[1])

    #TODO get like 6/7 they sometimes don't have add before them
    return chord_dict

# get a dict of all the chords in the song and their occurrences
def get_all_chords(chord_progression):
    """Count how often each chord occurs across all sections of a song.

    Args:
        chord_progression: dict mapping a section id to a dict whose
            'chords' entry is that section's list of chord strings.

    Returns:
        A Counter mapping each chord to its total number of occurrences.
        Every chord of every section is counted, including the first and
        last chord of a section.
    """
    per_section = (section['chords'] for section in chord_progression.values())
    return Counter(chain.from_iterable(per_section))

#TODO: eventually need to include logic for harmonic/melodic minor
# find the best-fit tonic
def infer_tonic(chord_dict):
    """Pick the chord of the song whose key contains the most of the song's chords.

    Each chord of the song (other than diminished/augmented ones) is tried as
    a tonic and scored by how many *unique* song chords appear in
    get_chords(candidate); occurrence counts in *chord_dict* are not used.

    Args:
        chord_dict: mapping whose keys are the song's chords (for example the
            result of get_all_chords).

    Returns:
        The best-scoring candidate. On a tie, the first candidate without an
        "m" in its name is preferred; otherwise the first tied candidate.

    Raises:
        ValueError: if there is no usable candidate (no chords at all, or
            only diminished/augmented chords).
    """
    # all chords in the song
    song_chords = list(chord_dict)

    # map each candidate tonic to the number of unique song chords in its key
    fit_counts = {}
    for candidate in song_chords:
        # diminished and augmented chords are not tonics
        if 'dim' in candidate or 'aug' in candidate:
            continue
        # compute the candidate key's chords once, not once per song chord
        key_chords = get_chords(candidate)
        fit_counts[candidate] = sum(1 for x in song_chords if x in key_chords)

    if not fit_counts:
        raise ValueError("cannot infer a tonic: the song has no candidate chords")

    # how many chords are in the best fit key(s)?
    max_value = max(fit_counts.values())

    # candidates that reach this value, in the order they appear in the song
    best_fit_chords = [key for key, count in fit_counts.items() if count == max_value]

    # right now implementing logic to return the major over the minor since *most* popular songs are in a major key
    # TODO: in the future it might make more sense to use the most commonly occuring chord or something similar
    non_minor_fits = [key for key in best_fit_chords if "m" not in key]

    # with a single best fit both lists lead to that same chord
    return non_minor_fits[0] if non_minor_fits else best_fit_chords[0]

def apply_capo(orig_key, capo):
    """Shift a key name by *capo* steps along the twelve-note list.

    Args:
        orig_key: key name such as "A", "Am" or "Bb"; the text before the
            first "m" is taken as the note.
        capo: number of steps to move (the result wraps around modulo 12).

    Returns:
        The shifted note name, with "m" appended when *orig_key* ends in "m"
        or contains "inor".
    """
    # strip major/minor marking at end
    root = orig_key.split("m")[0]

    is_minor = orig_key.endswith("m") or "inor" in orig_key
    suffix = "m" if is_minor else ""

    # flat spelling only for a root written with 'b' and no '#';
    # sharps and naturals both use the sharp list
    #todo eventually we should choose flat vs. sharp based on what's in the tabs
    use_flats = '#' not in root and 'b' in root
    scale = notes_flat if use_flats else notes_sharp

    shifted = (scale.index(root) + capo) % 12
    return scale[shifted] + suffix
    
def get_notes(key):
    """Return the seven scale notes of *key*, starting from its tonic.

    Args:
        key: key name such as "C", "Am" or "Bb"; a trailing "m" selects the
            minor scale intervals, otherwise the major ones are used.

    Returns:
        A new list of seven note names. Flat spelling is used when the key
        name contains 'b', and also for F major and D/G/C/F minor, whose
        tonics are natural letters but whose key signatures contain flats
        (so F major yields "Bb", not "A#"). All other keys use sharps.

    Raises:
        ValueError: if the tonic is not in the chosen note list.
    """
    idx = minor_scale if key.endswith("m") else major_scale

    # natural-letter tonics whose key signature is nevertheless written in flats
    flat_spelled = 'b' in key or key in ("F", "Dm", "Gm", "Cm", "Fm")
    notes = notes_flat if flat_spelled else notes_sharp

    loc = notes.index(key.split("m")[0])
    return [notes[(i + loc) % 12] for i in idx]


def get_chords(key):
    """Return the seven chords built on the scale notes of *key*.

    Args:
        key: key name; a trailing "m" means minor.

    Returns:
        A list of seven chord names in scale order: each scale note from
        get_notes(key) with "", "m" or "dim" appended according to its
        position in the scale.
    """
    #TODO: currently just works with natural minor, edit to include chords that are in harmonic & melodic minors
    if key.endswith("m"):
        qualities = ("m", "dim", "", "m", "m", "", "")
    else:
        qualities = ("", "m", "m", "", "", "m", "dim")

    return [note + quality for note, quality in zip(get_notes(key), qualities)]

def _chord_identity(chord):
    """Return (index in the twelve-note lists, remaining suffix) for *chord*.

    The root is the first letter plus an immediately following '#' or 'b'.
    Returns None when that root is in neither note list.
    """
    root = chord[:2] if chord[1:2] in ("#", "b") else chord[:1]
    for table in (notes_sharp, notes_flat):
        if root in table:
            return table.index(root), chord[len(root):]
    return None


def get_relation(key, chord):
    """Return the roman numeral of *chord* within *key*.

    Args:
        key: key name; a trailing "m" means minor and selects numerals_minor,
            otherwise numerals_major is used.
        chord: simplified chord name, e.g. "Am" or "Bb".

    Returns:
        The numeral at the chord's position in get_chords(key). A chord that
        is not spelled exactly like one of the key's chords still matches
        when it has the same pitch and suffix under the other accidental
        spelling (e.g. "A#" for "Bb"). Otherwise "Not In Key Signature".
    """
    chords = get_chords(key)
    numerals = numerals_minor if key.endswith("m") else numerals_major

    if chord in chords:
        return numerals[chords.index(chord)]

    # same chord written with the other accidental spelling
    wanted = _chord_identity(chord)
    if wanted is not None:
        for position, diatonic in enumerate(chords):
            if _chord_identity(diatonic) == wanted:
                return numerals[position]

    #TODO implement chords outside the key
    # if the root note is in the key then this won't be too hard
    # i.e. if we're in C then E would be III
    # but like...... what if it's Eb???? idk.
    return "Not In Key Signature"

## Analysis

In [54]:
# The twelve notes starting from A, spelled with sharps and with flats.
# Both lists are index-aligned: the same position is the same pitch.
notes_sharp = ["A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"]
notes_flat = ["A", "Bb", "B", "C", "Db", "D", "Eb", "E", "F", "Gb", "G", "Ab"]

# Offsets (in steps through the lists above) of the seven scale notes
# relative to the tonic.
major_scale = [0,2,4,5,7,9,11]
minor_scale = [0,2,3,5,7,8,10]

# Roman numerals for the seven chords of a key, in scale order.
# The third chord of a natural minor key is the major chord III (it was
# previously listed as "II", which mislabelled every chord on that degree).
numerals_major = ["I", "ii", "iii", "IV", "V", "vi", "vii°"]
numerals_minor = ["i", "ii°", "III", "iv", "v", "VI", "VII"]

In [69]:
# fname = "songs/pixies_where-is-my-mind.txt"
# fname = "songs/the-police_every-breath-you-take.txt"

# For every sampled tab page: read the saved HTML, pull out the key / tuning /
# capo metadata, split the tab into its labelled sections, reduce each section
# to simplified chords, express them relative to the key, and dump the result
# to a JSON file in output_dir. Songs that were already analysed are skipped.

# Patterns are compiled once, outside the loop. The header pattern is a raw
# string so that "\[" reaches the regex engine as written instead of being an
# invalid string escape.
section_header_pattern = re.compile(r'<span class="_3rlxz">\[.*\]</span>')
chord_pattern = re.compile(r"^[ABCDEFG][#b]*[ABCDEFG#bm]*$")

for fname in files:
    # check to see if we've already worked with this before
    print(fname)
    # file name without directories and without its final extension; cutting
    # at the last "." of the file name (not the first "." of the whole path)
    # keeps dotted names and dotted directories intact
    song_name = fname.split("/")[-1].rsplit(".", 1)[0]
    outname = output_dir + song_name + ".json"
    if os.path.isfile(outname):
        print("file has already been analyzed")
        continue

    with open(fname, 'r') as file:
        data = file.read().replace('\n', '')

    soup = BeautifulSoup(data, "html.parser")

    results = soup.find_all("div", class_="_3naMH")

    # Identify the key signature and capo
    key_search = [x.get_text().split()[1] for x in results if "Key" in x.get_text()]

    # 0 means "no key given on the page"; it is inferred from the chords below
    key_signature = 0
    if key_search:
        key_signature = key_search[0]

    # Identify the guitar tuning (which can impact effect of the capo)
    tuning_search = [' '.join(x.get_text().split()[1:]) for x in results if "Tuning" in x.get_text()]
    print(tuning_search)

    tuning_offset = 0

    # there shouldn't be a scenario where tuning search has more than one element
    # either there's 1 (the tuning) or 0 (no tuning specified - assume standard)
    if len(tuning_search) == 1:
        tuning = tuning_search[0]

        # ignore standard tuning EADGBE and drop D DADGBE because the offset will still be zero
        if tuning in ('E A D G B E', 'D A D G B E'):
            tuning_offset = 0
        # half-step down tuning
        elif tuning in ('Eb Ab Db Gb Bb Eb', 'D# G# C# F# A# D#'):
            tuning_offset = 1
        # d or drop c tuning
        elif tuning in ('C G C F A D', 'D G C F A D'):
            tuning_offset = 2
        else:  # other tunings not currently supported... could return later
            print("This was a weird tuning")
            print(tuning)
            continue

    capo_search = [x.get_text().split()[1] for x in results if "Capo" in x.get_text()]
    capo = 0
    if capo_search:
        # regex is used to remove any letters (so changing "3rd" to just "3")
        capo_digits = re.sub("[^0-9]", "", capo_search[0])
        # a capo label without any digit is treated as "no capo" instead of
        # crashing on int("")
        if capo_digits:
            capo = int(capo_digits)

    # a capo transposes up, and the tuning generally transposes down
    transpose_offset = capo - tuning_offset

    # identify the locations of different sections
    tab_blocks = soup.find_all("pre", class_="_3F2CP _3hukP")
    if not tab_blocks:
        # nothing to analyse on this page; skip it like the other unsupported cases
        print("No tab body found")
        continue
    song_spans = tab_blocks[0].find_all("span")

    # identify where the section headers are
    section_header_match = [bool(section_header_pattern.match(str(span))) for span in song_spans]
    section_header_idx = [i for i, val in enumerate(section_header_match) if val]
    section_headers = [song_spans[x].get_text() for x in section_header_idx]
    # sentinel so the last section runs to the end of the tab
    section_header_idx.append(len(song_spans))

    # get chord progression for each section
    chord_progression = {}

    section_bounds = zip(section_header_idx, section_header_idx[1:])
    for init_index, (start, stop) in enumerate(section_bounds):
        section = song_spans[start:stop]
        section_chords = [x.get_text().strip().split() for x in section if str(x).startswith('<span class="_3rlxz">') and '_3PpPJ OrSDI' in str(x)]

        # removing the equivalencies that some people insert re: capo
        section_chords = [x for x in section_chords if '=' not in x]

        # un-nest list
        section_chords = [clean_chord(x) for x in chain.from_iterable(section_chords)]

        # remove any special characters (this might remove some info but that's ok)
        special_characters = ['*', '~']
        section_chords = [''.join([e for e in x if e not in special_characters]) for x in section_chords]

        # remove anything else that doesn't match
        section_chords = [x for x in section_chords if chord_pattern.match(x)]

        chord_progression[init_index] = {'type': section_headers[init_index], 'chords': section_chords}

    # if the chord_progression dictionary is empty, then skip this step
    # this might happen because the song doesn't have labels for different parts of the song (intro, verse, chorus, etc.)
    if not chord_progression:
        print("Chord progression dictionary is empty")
        continue

    print(chord_progression)

    #TODO so in all cases it's not technically the most occurring (ex: fluorescent adolescent)
    # this could be a good enough approximation tho i guess lol
    if key_signature == 0:
        key_signature = infer_tonic(get_all_chords(chord_progression))
    else:
        key_signature = apply_capo(key_signature, transpose_offset)

    # convert to numerics for easier comparison
    for section in chord_progression:
        chords = chord_progression[section]['chords']
        chord_progression[section]['chords_numeric'] = [get_relation(key_signature, x) for x in chords]

    with open(outname, 'w') as outfile:
        json.dump(chord_progression, outfile)

/Volumes/SECONDDRIVE/prog/ug/1970/4/acdc_tnt.txt
['E A D G B E']
{0: {'type': '[Intro]', 'chords': ['E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A']}, 1: {'type': '[Verse]', 'chords': ['G', 'E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A']}, 2: {'type': '[Chorus]', 'chords': ['A', 'G', 'E', 'A', 'G', 'E', 'A', 'G', 'E', 'A', 'G', 'E', 'G', 'A']}, 3: {'type': '[Instrumental]', 'chords': ['E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A']}, 4: {'type': '[Verse]', 'chords': ['G', 'E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A', 'G', 'E', 'G', 'A']}, 5: {'type': '[Chorus]', 'chords': ['A', 'G', 'E', 'A', 'G', 'E', 'A', 'G', 'E', 'A', 'G', 'E', 'G', 'A']}, 6: {'type': '[Outro]', 'chords': ['A', 'G', 'E', 'A', 'G', 'E', 'A', 'G', 'E', 'A', 'G', 'E', 'A', 