# Looking at overall song structure

In this notebook, I'm going to look at song structure, i.e. the order in which sections follow one another (intro -> verse -> chorus, and so on).

In [1]:
import re
from itertools import chain
import json
import os
from itertools import chain
import pandas as pd
import music_functions as mf

## Collecting data

First, we'll read in the structure dictionaries created after parsing the results. 

Next, `get_structure_dict` will create a dictionary identifying "next steps" in the structure progression; i.e., given the current section, what type of sections do songs tend to go to next? 

Then, `clean_structure_dict` will remove any section titles that occur fewer than 10 times (meaning they are probably not "standard" sections and are specific to one song).

In [5]:
# Directory of parsed per-song structure files to analyse.
input_dir = "/Volumes/SECONDDRIVE/prog/ug/chord_dicts/2000/4/"
song_paths = [input_dir + file_name for file_name in os.listdir(input_dir)]

# Build the "current section -> next section" counts, then clean them with a
# cutoff of 10 (see the description of clean_structure_dict above).
transition_counts = mf.get_structure_dict(song_paths)
structure_dict = mf.clean_structure_dict(transition_counts, 10)

In [6]:
# Inspect the cleaned mapping: {current section: {next section: count}}.
print(structure_dict)

{'StartOfSong': {'intro': 130, 'verse': 52, 'pre-chorus': 1, 'chorus': 1}, 'intro': {'verse': 127, 'chorus': 5, 'pre-chorus': 1}, 'verse': {'chorus': 330, 'outro': 5, 'interlude': 7, 'verse': 41, 'pre-chorus': 53, 'EndOfSong': 12, 'instrumental': 11, 'bridge': 20, 'solo': 2}, 'chorus': {'interlude': 28, 'bridge': 77, 'verse': 148, 'EndOfSong': 59, 'outro': 70, 'pre-chorus': 3, 'instrumental': 54, 'solo': 24, 'chorus': 9}, 'interlude': {'verse': 21, 'bridge': 5, 'chorus': 9, 'solo': 3, 'outro': 2, 'EndOfSong': 2, 'instrumental': 1}, 'bridge': {'outro': 11, 'verse': 29, 'chorus': 52, 'EndOfSong': 5, 'interlude': 4, 'pre-chorus': 5, 'instrumental': 8, 'solo': 4, 'bridge': 1}, 'outro': {'EndOfSong': 94, 'outro': 2}, 'solo': {'verse': 8, 'chorus': 12, 'outro': 3, 'bridge': 5, 'EndOfSong': 2, 'pre-chorus': 1, 'instrumental': 1, 'interlude': 1}, 'pre-chorus': {'chorus': 56, 'verse': 5, 'instrumental': 1}, 'instrumental': {'chorus': 15, 'verse': 38, 'outro': 4, 'bridge': 12, 'solo': 1, 'interl

## Getting transition probability matrix

In [7]:
# Outer keys (current section) become columns and inner keys (next section)
# become rows; transitions that never occurred are filled with a count of 0.
structure_df = pd.DataFrame(structure_dict).fillna(value=0)

In [8]:
# Raw transition counts: one column per current section, one row per next section.
structure_df

Unnamed: 0,StartOfSong,intro,verse,chorus,interlude,bridge,outro,solo,pre-chorus,instrumental
intro,130.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
verse,52.0,127.0,41.0,148.0,21.0,29.0,0.0,8.0,5.0,38.0
pre-chorus,1.0,1.0,53.0,3.0,0.0,5.0,0.0,1.0,0.0,0.0
chorus,1.0,5.0,330.0,9.0,9.0,52.0,0.0,12.0,56.0,15.0
outro,0.0,0.0,5.0,70.0,2.0,11.0,2.0,3.0,0.0,4.0
interlude,0.0,0.0,7.0,28.0,0.0,4.0,0.0,1.0,0.0,2.0
EndOfSong,0.0,0.0,12.0,59.0,2.0,5.0,94.0,2.0,0.0,0.0
instrumental,0.0,0.0,11.0,54.0,1.0,8.0,0.0,1.0,1.0,1.0
bridge,0.0,0.0,20.0,77.0,5.0,1.0,0.0,5.0,0.0,12.0
solo,0.0,0.0,2.0,24.0,3.0,4.0,0.0,0.0,0.0,1.0


In [9]:
# Turn counts into proportions: divide every column (current section) by its
# own total so the entries describe the share of transitions to each next
# section. Done as one vectorised operation instead of a Python loop with the
# builtin sum; a column whose total is 0 yields NaN, as before.
structure_df = structure_df.div(structure_df.sum(axis=0), axis="columns")

In [10]:
# Same table after each column has been divided by its column total.
structure_df

Unnamed: 0,StartOfSong,intro,verse,chorus,interlude,bridge,outro,solo,pre-chorus,instrumental
intro,0.706522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
verse,0.282609,0.954887,0.085239,0.313559,0.488372,0.243697,0.0,0.242424,0.080645,0.520548
pre-chorus,0.005435,0.007519,0.110187,0.006356,0.0,0.042017,0.0,0.030303,0.0,0.0
chorus,0.005435,0.037594,0.686071,0.019068,0.209302,0.436975,0.0,0.363636,0.903226,0.205479
outro,0.0,0.0,0.010395,0.148305,0.046512,0.092437,0.020833,0.090909,0.0,0.054795
interlude,0.0,0.0,0.014553,0.059322,0.0,0.033613,0.0,0.030303,0.0,0.027397
EndOfSong,0.0,0.0,0.024948,0.125,0.046512,0.042017,0.979167,0.060606,0.0,0.0
instrumental,0.0,0.0,0.022869,0.114407,0.023256,0.067227,0.0,0.030303,0.016129,0.013699
bridge,0.0,0.0,0.04158,0.163136,0.116279,0.008403,0.0,0.151515,0.0,0.164384
solo,0.0,0.0,0.004158,0.050847,0.069767,0.033613,0.0,0.0,0.0,0.013699


In [11]:
# Flip the table so rows are the current section and columns the next section.
# Note: this rebinds structure_df, so running the cell a second time flips it back.
structure_df = structure_df.T

In [12]:
# Transposed view: rows are the current section, columns are the next section.
structure_df

Unnamed: 0,intro,verse,pre-chorus,chorus,outro,interlude,EndOfSong,instrumental,bridge,solo
StartOfSong,0.706522,0.282609,0.005435,0.005435,0.0,0.0,0.0,0.0,0.0,0.0
intro,0.0,0.954887,0.007519,0.037594,0.0,0.0,0.0,0.0,0.0,0.0
verse,0.0,0.085239,0.110187,0.686071,0.010395,0.014553,0.024948,0.022869,0.04158,0.004158
chorus,0.0,0.313559,0.006356,0.019068,0.148305,0.059322,0.125,0.114407,0.163136,0.050847
interlude,0.0,0.488372,0.0,0.209302,0.046512,0.0,0.046512,0.023256,0.116279,0.069767
bridge,0.0,0.243697,0.042017,0.436975,0.092437,0.033613,0.042017,0.067227,0.008403,0.033613
outro,0.0,0.0,0.0,0.0,0.020833,0.0,0.979167,0.0,0.0,0.0
solo,0.0,0.242424,0.030303,0.363636,0.090909,0.030303,0.060606,0.030303,0.151515,0.0
pre-chorus,0.0,0.080645,0.0,0.903226,0.0,0.0,0.0,0.016129,0.0,0.0
instrumental,0.0,0.520548,0.0,0.205479,0.054795,0.027397,0.0,0.013699,0.164384,0.013699


In [13]:
# Persist the transition table (row labels included) in the working directory.
structure_df.to_csv(path_or_buf="song_structure.csv")

Now, let's take a look at the distribution of the number of sections per song.

In [14]:
#TODO: get distribution of the number of sections
# Collect the section-count statistics for every file in input_dir.
stats_paths = [input_dir + file_name for file_name in os.listdir(input_dir)]
dist_stats = mf.get_count_stats(stats_paths)

In [15]:
# Tabulate the collected statistics, replacing any missing entries with 0.
dist_stats_df = pd.DataFrame(dist_stats).fillna(value=0)

In [29]:
# Label every row with its song name, i.e. the file name without its extension.
# os.path.splitext strips only the final extension, so a name that itself
# contains a dot is no longer cut off at the first dot.
# NOTE(review): assumes the rows of dist_stats_df are in the same order as this
# os.listdir call returns the files -- confirm against mf.get_count_stats.
dist_stats_df['Song'] = [os.path.splitext(x)[0] for x in os.listdir(input_dir)]

In [30]:
# Section-count statistics with the song label attached to each row.
dist_stats_df

Unnamed: 0,total_num_sections,num_unique_sections,Song
0,9,6,coldplay_yellow
1,12,7,jason-mraz_im-yours
2,8,4,plain-white-ts_hey-there-delilah
3,17,6,green-day_boulevard-of-broken-dreams
4,8,5,taylor-swift_love-story
...,...,...,...
190,7,4,green-day_last-night-on-earth
191,5,4,radiohead_nude
192,10,6,muse_hysteria
193,6,3,sunrise-avenue_fairytale-gone-bad


In [26]:
# Persist the per-song statistics table in the working directory.
dist_stats_df.to_csv(path_or_buf="distribution_stats.csv")

## Generating this data for many directories

In [31]:
# Repeat the single-directory analysis above for every decade/genre folder and
# write one transition table and one statistics table per folder.
base_dir = "/Volumes/SECONDDRIVE/prog/ug/chord_dicts/"
output_dir = "SongStructureOutput/"

decades = ['1970', '1980', '1990', '2000', '2010']
genres = ['4']

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs(output_dir, exist_ok=True)

for decade in decades:
    for genre in genres:
        specific_dir = base_dir + decade + "/" + genre + "/"
        output_prefix = output_dir + decade + "_" + genre + "_"

        # List the folder once so the paths handed to the mf helpers and the
        # song labels below come from the same listing, in the same order.
        file_names = os.listdir(specific_dir)
        file_paths = [specific_dir + x for x in file_names]

        # Transition counts, cleaned with the same cutoff (10) used earlier.
        structure_dict = mf.clean_structure_dict(mf.get_structure_dict(file_paths), 10)

        structure_df = pd.DataFrame.from_dict(structure_dict).fillna(0)
        # Divide each column (current section) by its total, then transpose so
        # rows are the current section and columns the next section.
        structure_df = structure_df.div(structure_df.sum(axis=0), axis="columns")
        structure_df = structure_df.transpose()

        structure_df.to_csv(output_prefix + "song_structure.csv")

        dist_stats = mf.get_count_stats(file_paths)
        dist_stats_df = pd.DataFrame.from_dict(dist_stats).fillna(0)
        # splitext removes only the final extension, so song names containing
        # a dot are kept intact.
        # NOTE(review): assumes mf.get_count_stats returns its rows in the
        # order of file_paths -- confirm.
        dist_stats_df['Song'] = [os.path.splitext(x)[0] for x in file_names]

        dist_stats_df.to_csv(output_prefix + "distribution_stats.csv")