# Looking at overall song structure

In this notebook, I'm going to look at overall song structure, i.e. the order in which sections follow one another (intro -> verse -> chorus, that kind of thing).

In [1]:
import re
from itertools import chain
import json
import os
from itertools import chain
import pandas as pd
import music_functions as mf

## Collecting data

First, we'll read in the structure dictionaries created after parsing the results. 

Next, `get_structure_dict` will create a dictionary identifying "next steps" in the structure progression; i.e., given the current section, what type of sections do songs tend to go to next? 

Then, `clean_structure_dict` will remove any section titles that occur fewer than 10 times (meaning they're probably not "standard" sections and are specific to one song).

In [2]:
# Directory holding the parsed per-song structure dictionaries for the 2000s.
input_dir = "/Volumes/SECONDDRIVE/prog/ug/chord_dicts/2000/"

# Full path of every entry in that directory (input_dir already ends with "/").
song_paths = [input_dir + fname for fname in os.listdir(input_dir)]

# Tally the section -> next-section transitions across all songs, then drop
# rarely used section titles (threshold of 10).
raw_structure_dict = mf.get_structure_dict(song_paths)
structure_dict = mf.clean_structure_dict(raw_structure_dict, 10)

In [3]:
# Dump the cleaned transition counts: {current section: {next section: count}}.
print(structure_dict)

{'StartOfSong': {'chorus': 6, 'intro': 281, 'verse': 120, 'pre-chorus': 1}, 'chorus': {'verse': 349, 'bridge': 151, 'interlude': 56, 'solo': 35, 'outro': 134, 'chorus': 23, 'EndOfSong': 147, 'instrumental': 102, 'post-chorus': 11, 'pre-chorus': 4, 'hook': 1, 'refrain': 1}, 'verse': {'chorus': 679, 'interlude': 18, 'instrumental': 29, 'verse': 110, 'bridge': 37, 'outro': 15, 'EndOfSong': 28, 'pre-chorus': 117, 'solo': 7, 'hook': 9, 'refrain': 7}, 'intro': {'verse': 264, 'chorus': 19, 'pre-chorus': 2, 'EndOfSong': 1, 'refrain': 1, 'instrumental': 1}, 'bridge': {'interlude': 7, 'verse': 43, 'outro': 21, 'EndOfSong': 12, 'chorus': 108, 'solo': 10, 'instrumental': 10, 'pre-chorus': 5, 'hook': 1, 'bridge': 2, 'intro': 1}, 'interlude': {'EndOfSong': 3, 'verse': 43, 'chorus': 23, 'bridge': 8, 'solo': 3, 'instrumental': 2, 'outro': 6}, 'solo': {'pre-chorus': 1, 'outro': 5, 'verse': 14, 'chorus': 21, 'bridge': 7, 'EndOfSong': 5, 'instrumental': 1, 'interlude': 1}, 'pre-chorus': {'chorus': 119, '

## Getting transition probability matrix

In [4]:
# Outer keys (the current section) become columns and inner keys (the section
# that follows) become rows. Transitions that were never observed come out as
# NaN, so they are filled in as zero counts.
transition_counts = pd.DataFrame(structure_dict)
structure_df = transition_counts.fillna(0)

In [5]:
# Show the raw transition counts: columns are the current section, rows are the section that follows.
structure_df

Unnamed: 0,StartOfSong,chorus,verse,intro,bridge,interlude,solo,pre-chorus,outro,instrumental,post-chorus,hook,refrain
chorus,6.0,23.0,679.0,19.0,108.0,23.0,21.0,119.0,0.0,33.0,2.0,0.0,7.0
intro,281.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
verse,120.0,349.0,110.0,264.0,43.0,43.0,14.0,6.0,0.0,75.0,3.0,8.0,2.0
pre-chorus,1.0,4.0,117.0,2.0,5.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
bridge,0.0,151.0,37.0,0.0,2.0,8.0,7.0,0.0,0.0,20.0,1.0,2.0,0.0
interlude,0.0,56.0,18.0,0.0,7.0,0.0,1.0,1.0,0.0,2.0,2.0,0.0,0.0
solo,0.0,35.0,7.0,0.0,10.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
outro,0.0,134.0,15.0,0.0,21.0,6.0,5.0,0.0,2.0,10.0,1.0,1.0,0.0
EndOfSong,0.0,147.0,28.0,1.0,12.0,3.0,5.0,1.0,193.0,0.0,1.0,2.0,0.0
instrumental,0.0,102.0,29.0,1.0,10.0,2.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0


In [6]:
# Turn counts into probabilities: divide every column (one current section)
# by its own total so that each column sums to 1.
column_totals = structure_df.sum(axis=0)
structure_df = structure_df.div(column_totals, axis="columns")

In [7]:
# Show the normalized table: each column (current section) now sums to 1.
structure_df

Unnamed: 0,StartOfSong,chorus,verse,intro,bridge,interlude,solo,pre-chorus,outro,instrumental,post-chorus,hook,refrain
chorus,0.014706,0.022682,0.642992,0.065972,0.490909,0.261364,0.381818,0.908397,0.0,0.229167,0.2,0.0,0.636364
intro,0.688725,0.0,0.0,0.0,0.004545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
verse,0.294118,0.344181,0.104167,0.916667,0.195455,0.488636,0.254545,0.045802,0.0,0.520833,0.3,0.615385,0.181818
pre-chorus,0.002451,0.003945,0.110795,0.006944,0.022727,0.0,0.018182,0.0,0.0,0.006944,0.0,0.0,0.090909
bridge,0.0,0.148915,0.035038,0.0,0.009091,0.090909,0.127273,0.0,0.0,0.138889,0.1,0.153846,0.0
interlude,0.0,0.055227,0.017045,0.0,0.031818,0.0,0.018182,0.007634,0.0,0.013889,0.2,0.0,0.0
solo,0.0,0.034517,0.006629,0.0,0.045455,0.034091,0.0,0.0,0.0,0.006944,0.0,0.0,0.090909
outro,0.0,0.13215,0.014205,0.0,0.095455,0.068182,0.090909,0.0,0.010256,0.069444,0.1,0.076923,0.0
EndOfSong,0.0,0.14497,0.026515,0.003472,0.054545,0.034091,0.090909,0.007634,0.989744,0.0,0.1,0.153846,0.0
instrumental,0.0,0.100592,0.027462,0.003472,0.045455,0.022727,0.018182,0.007634,0.0,0.006944,0.0,0.0,0.0


In [8]:
# Flip the table so that each row is a current section and each column is the
# section that follows it.
# NOTE: this rebinds structure_df to its own transpose, so running the cell a
# second time flips the table back.
structure_df = structure_df.T

In [9]:
# Show the transposed table: one row per current section, one column per following section.
structure_df

Unnamed: 0,chorus,intro,verse,pre-chorus,bridge,interlude,solo,outro,EndOfSong,instrumental,post-chorus,hook,refrain
StartOfSong,0.014706,0.688725,0.294118,0.002451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chorus,0.022682,0.0,0.344181,0.003945,0.148915,0.055227,0.034517,0.13215,0.14497,0.100592,0.010848,0.000986,0.000986
verse,0.642992,0.0,0.104167,0.110795,0.035038,0.017045,0.006629,0.014205,0.026515,0.027462,0.0,0.008523,0.006629
intro,0.065972,0.0,0.916667,0.006944,0.0,0.0,0.0,0.0,0.003472,0.003472,0.0,0.0,0.003472
bridge,0.490909,0.004545,0.195455,0.022727,0.009091,0.031818,0.045455,0.095455,0.054545,0.045455,0.0,0.004545,0.0
interlude,0.261364,0.0,0.488636,0.0,0.090909,0.0,0.034091,0.068182,0.034091,0.022727,0.0,0.0,0.0
solo,0.381818,0.0,0.254545,0.018182,0.127273,0.018182,0.0,0.090909,0.090909,0.018182,0.0,0.0,0.0
pre-chorus,0.908397,0.0,0.045802,0.0,0.0,0.007634,0.0,0.0,0.007634,0.007634,0.0,0.007634,0.015267
outro,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010256,0.989744,0.0,0.0,0.0,0.0
instrumental,0.229167,0.0,0.520833,0.006944,0.138889,0.013889,0.006944,0.069444,0.0,0.006944,0.0,0.0,0.006944


In [10]:
# Write the transition table to song_structure.csv (relative path, so it lands in the working directory).
structure_df.to_csv("song_structure.csv")

Now, let's take a look at the distribution of the number of sections per song (both the total number of sections and the number of unique sections).

In [11]:
# TODO: get distribution of the number of sections
# Collect the section-count statistics for every file in input_dir.
song_paths = [input_dir + fname for fname in os.listdir(input_dir)]
dist_stats = mf.get_count_stats(song_paths)

In [12]:
# Tabulate the count statistics, replacing any missing entries with 0.
count_stats_table = pd.DataFrame(dist_stats)
dist_stats_df = count_stats_table.fillna(0)

In [13]:
# Label each row with its song name: the file name minus its extension.
# os.path.splitext strips only the final extension, so a file name that itself
# contains a dot is no longer cut short at the first dot.
# NOTE(review): assumes this listing comes back in the same order as the one
# that was passed to mf.get_count_stats — confirm.
dist_stats_df['Song'] = [os.path.splitext(x)[0] for x in os.listdir(input_dir)]

In [14]:
# Show the per-song count statistics together with the Song label column.
dist_stats_df

Unnamed: 0,total_num_sections,num_unique_sections,Song
0,9,3,eminem_stan
1,7,5,3-doors-down_here-without-you
2,13,8,3-doors-down_kryptonite
3,8,5,a-day-to-remember_if-it-means-a-lot-to-you
4,6,3,all-time-low_dear-maria-count-me-in
...,...,...,...
425,7,3,randy-travis_just-a-closer-walk-with-thee
426,7,6,beirut_elephant-gun
427,9,4,b-o-b_nothin-on-you
428,11,6,my-chemical-romance_mama


In [15]:
# Write the per-song statistics to distribution_stats.csv (relative path, so it lands in the working directory).
dist_stats_df.to_csv("distribution_stats.csv")

## Generating this data for many directories

In [17]:
# Repeat the single-directory analysis above for every decade and write one
# pair of CSV files (transition table + per-song count statistics) per decade.
base_dir = "/Volumes/SECONDDRIVE/prog/ug/chord_dicts/"
output_dir = "Output/SongStructure/"

decades = ['1970', '1980', '1990', '2000', '2010']

# to_csv does not create missing parent directories, so make sure the output
# directory exists before the first write.
os.makedirs(output_dir, exist_ok=True)

for decade in decades:
    specific_dir = base_dir + decade + "/"

    # List the directory once and reuse the result: os.listdir makes no
    # ordering promise, and the Song labels below must line up with the paths
    # handed to get_count_stats.
    song_files = os.listdir(specific_dir)
    song_paths = [specific_dir + x for x in song_files]

    # Transition counts, with rarely used section titles dropped (threshold of 10).
    structure_dict = mf.clean_structure_dict(mf.get_structure_dict(song_paths), 10)

    # Columns are the current section; divide each by its total to get
    # probabilities, then transpose so each row is a current section.
    structure_df = pd.DataFrame.from_dict(structure_dict).fillna(0)
    structure_df = structure_df.div(structure_df.sum(axis=0), axis="columns")
    structure_df = structure_df.transpose()

    structure_df.to_csv(output_dir + decade + "_" + "song_structure.csv")

    # Per-song section-count statistics.
    # NOTE(review): assumes get_count_stats yields one entry per input path, in
    # input order — confirm against music_functions.
    dist_stats = mf.get_count_stats(song_paths)
    dist_stats_df = pd.DataFrame.from_dict(dist_stats).fillna(0)
    # splitext strips only the extension, so names containing a dot stay intact.
    dist_stats_df['Song'] = [os.path.splitext(x)[0] for x in song_files]

    dist_stats_df.to_csv(output_dir + decade + "_" + "distribution_stats.csv")