In [1]:
import pandas as pd
import scipy.stats
import numpy as np
data = pd.read_csv('full_data.csv')


Has variance in review scores been decreasing over time? By breaking the data into two groups for older music and more recent music, we can run a t-test on the standard deviations of each year:

In [11]:
annual_stds = data.groupby('publication_year').std().score
older_music = annual_stds[annual_stds.index <= 2007]
recent_music = annual_stds[annual_stds.index > 2007]
T, p = scipy.stats.ttest_ind(older_music, recent_music, equal_var = False)
print("T score: {:.2f}, P value: {:.5f}".format(T, p))


T score: 4.81, P value: 0.00031


These results indicate that yes, variance in reviews has been shrinking over time.

Visually, there did not appear to be relationships between the audio feature variables and review score. The variable used as an example was liveliness. Taking a mathematical approach instead of a visual one, we can calculate the Pearson's correlation coefficient between liveliness and review score.

In [3]:
data[['score', 'liveness_mean']].corr(method='pearson')

Unnamed: 0,score,liveness_mean
score,1.0,-0.002919
liveness_mean,-0.002919,1.0


The value of -0.0029 confirms our suspicion that there is no relationship between these two. 

Have albums been getting louder and more energetic over time?

In [4]:
older_music = data[data.release_year <= 2006].dropna()
recent_music = data[data.release_year > 2006].dropna()

energy_mean_T, energy_mean_P = scipy.stats.ttest_ind(older_music.energy_mean, recent_music.energy_mean, equal_var=False)
print("T score: {:.2f}, P value: {:.4f}".format(energy_mean_T, energy_mean_P))
energy_std_T, energy_std_P = scipy.stats.ttest_ind(older_music.energy_std, recent_music.energy_std, equal_var=False)
print("T score: {:.2f}, P value: {:.4f}".format(energy_std_T, energy_std_P))

loudness_mean_T, loudness_mean_P = scipy.stats.ttest_ind(older_music.loudness_mean, recent_music.loudness_mean, equal_var=False)
print("T score: {:.2f}, P value: {:.4f}".format(loudness_mean_T, loudness_mean_P))

loudness_std_T, loudness_std_P = scipy.stats.ttest_ind(older_music.loudness_std, recent_music.loudness_std, equal_var=False)
print("T score: {:.2f}, P value: {:.4f}".format(loudness_std_T, loudness_std_P))

T score: -7.45, P value: 0.0000
T score: 8.65, P value: 0.0000
T score: -7.21, P value: 0.0000
T score: 10.81, P value: 0.0000


The very large T-scores and correspondingly small P-values indicate that the trends we observed are undeniable. 

In [5]:
keys = pd.unique(data.primary_key)
key_data = {key:data['score'][data.primary_key == key] for key in keys}
F, p = scipy.stats.f_oneway(key_data['F#/Gb'], key_data['G'], key_data['C'], key_data['D'], key_data['A'], key_data['C#/Db'], key_data['A#/Bb'], key_data['B'], key_data['E'], key_data['F'], key_data['G#/A'], key_data['D#/Eb'])
print("F score: {:.2f}, P value: {:.4f}".format(F, p))

F score: 2.58, P value: 0.0028


These scores are testing the null hypothesis that the mean of every single group (in this case, key) is identical. From our visualization, it appeared as though the less common keys behaved differently from the more common keys. If we remove the less common keys (those with under 1000 albums) from the F test, our results change:

In [6]:
F, p = scipy.stats.f_oneway(key_data['G'], key_data['C'], key_data['D'], key_data['A'], key_data['C#/Db'], key_data['B'])
print("F score: {:.2f}, P value: {:.4f}".format(F, p))

F, p = scipy.stats.f_oneway(key_data['F#/Gb'], key_data['A#/Bb'],  key_data['E'], key_data['F'], key_data['G#/A'], key_data['D#/Eb'])
print("F score: {:.2f}, P value: {:.4f}".format(F, p))

F score: 1.08, P value: 0.3666
F score: 2.81, P value: 0.0156


This indicates that among the popular keys, there is no difference between keys - however, in the less common keys, differences do exist. Are the two groups (common and uncommon keys) different?

In [7]:
key_groups = data.groupby('primary_key').count().score
common_keys = key_groups[key_groups >= 1000]
uncommon_keys = key_groups[key_groups < 1000]

#data.loc[data.primary_key in common_keys]
common_key_data = data.loc[data.primary_key.isin(common_keys.index)]
uncommon_key_data = data.loc[data.primary_key.isin(uncommon_keys.index)]

T, p = scipy.stats.ttest_ind(common_key_data.score, uncommon_key_data.score)
print("Common key average score: {:.2f}".format(common_key_data.score.mean()))
print("Uncommon key average score: {:.2f}".format(uncommon_key_data.score.mean()))
print("T score: {:.2f}, P value: {:.4f}".format(T, p))

Common key average score: 6.92
Uncommon key average score: 7.01
T score: -3.15, P value: 0.0016


With a p-value of .001, we can reject the null hypothesis that albums written primarily in common keys and albums written primarily in uncommon keys receive the same scores.

From our visualization of album quality by genre, we would like to assign a numeric value to how much each individual genre impacts album score. Once again, F testing allows for comparisons of each genre of music, seeing if certain genres have an impact on quality category. The results of these tests can be interpreted as testing the null hypothesis that the genre does not impact if an album is top 10%, bottom 10%, or in the middle, versus the alternative hypothesis that genre does have an impact. 

In [8]:
masterpieces = data.loc[data.score==10]
print("Number of pefect 10s:", masterpieces.shape[0])

lower_quantile = data.score.quantile(.1)
flops = data.loc[data.score <= lower_quantile]
print("Number of albums at or below", lower_quantile, "-", flops.shape[0])

upper_quantile = data.score.quantile(.9)
classics = data.loc[data.score >= upper_quantile]
print("Number of albums at or above", upper_quantile, '-', classics.shape[0])

regular_albums = data.loc[(data.score > lower_quantile) & (data.score <= upper_quantile)]
print("Number of albums between the two cutoffs -", regular_albums.shape[0])

Number of pefect 10s: 15
Number of albums at or below 5.3 - 1292
Number of albums at or above 8.2 - 1488
Number of albums between the two cutoffs - 9830


In [9]:
F, p = scipy.stats.f_oneway(flops['experimental'], classics['experimental'], regular_albums['experimental'])
print("Experimental: F score: {:.2f}, P value: {:.4f}".format(F, p))

F, p = scipy.stats.f_oneway(flops['rock'], classics['rock'], regular_albums['rock'])
print("Rock: F score: {:.2f}, P value: {:.4f}".format(F, p))

F, p = scipy.stats.f_oneway(flops['pop/r&b'], classics['pop/r&b'], regular_albums['pop/r&b'])
print("Pop/R&B: F score: {:.2f}, P value: {:.4f}".format(F, p))

F, p = scipy.stats.f_oneway(flops['folk/country'], classics['folk/country'], regular_albums['folk/country'])
print("Folk/Country: F score: {:.2f}, P value: {:.4f}".format(F, p))

F, p = scipy.stats.f_oneway(flops['electronic'], classics['electronic'], regular_albums['electronic'])
print("Electronic: F score: {:.2f}, P value: {:.4f}".format(F, p))

F, p = scipy.stats.f_oneway(flops['rap'], classics['rap'], regular_albums['rap'])
print("Rap: F score: {:.2f}, P value: {:.4f}".format(F, p))

F, p = scipy.stats.f_oneway(flops['jazz'], classics['jazz'], regular_albums['jazz'])
print("Jazz: F score: {:.2f}, P value: {:.4f}".format(F, p))

F, p = scipy.stats.f_oneway(flops['metal'], classics['metal'], regular_albums['metal'])
print("Metal: F score: {:.2f}, P value: {:.4f}".format(F, p))




Experimental: F score: 39.04, P value: 0.0000
Rock: F score: 20.31, P value: 0.0000
Pop/R&B: F score: 6.63, P value: 0.0013
Folk/Country: F score: 5.83, P value: 0.0029
Electronic: F score: 5.01, P value: 0.0067
Rap: F score: 3.87, P value: 0.0209
Jazz: F score: 3.57, P value: 0.0281
Metal: F score: 0.96, P value: 0.3833


Our results indicate that some genres have a very considerable impact on quality, while others have a smaller but still significant impact, and that for one genre (metal), there is no impact. This confirms what we saw in the visualization - not all genres are rated equally.