In [1]:
import json
import requests
from bs4 import BeautifulSoup
import string
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error 
from wordcloud import WordCloud

In [6]:
TRACKS_FILE_NAME = 'top_hits.json'

In [7]:
with open('data/TRACKS_FILE_NAME') as json_file:
    tracks = json.load(json_file)

FileNotFoundError: [Errno 2] No such file or directory: 'data/TRACKS_FILE_NAME'

In [None]:
df = pd.DataFrame(tracks)

In [None]:
df.shape

In [None]:
df['year']=df['album'].apply(lambda x: x.get('release_date')[:4])
df['decade'] = df['year'].apply(lambda x: x[:3]+'0')

In [None]:
df['primary_artist'] = df['artists'].apply(lambda x: x[0]['name'])

In [None]:
df['artists'] = df['artists'].apply(lambda x: [artist['name'] for artist in x])

In [None]:
df.columns

### Distributions

In [None]:
ax = df[['acousticness', 'danceability', 'duration_ms', 'energy',
           'explicit', 'instrumentalness', 'liveness', 'loudness', 'mode',
           'popularity', 'speechiness', 'tempo', 'time_signature', 'valence']].\
            hist(figsize=(16,12), bins=20, alpha=0.75);
ax = ax.ravel()
plt.tight_layout()

titles = ['Acousticness', 'Danceability', 'Duration (miliseconds)', 'Energy',
          'Explicit', 'Instrumentalness', 'Liveness', 'Loudness (dB)', 'Mode',
          'Popularity', 'Speechiness', 'Tempo', 'Time Signature', 'Valence']

for i in range(len(ax)-2):
    ax[i].set_title(titles[i])
    if i % 4 == 0:
        ax[i].set_ylabel('Number of Songs')
plt.suptitle('Distributions of Spotify Audio Features', fontsize=20, y=1.05);
plt.show()

### Correlation heatmap 

In [None]:
num_df_attributes = df[['acousticness', 'danceability', 'duration_ms', 'energy',
           'instrumentalness', 'liveness', 'loudness', 'mode',
           'popularity', 'speechiness', 'tempo', 'time_signature', 'valence']]

num_df_attributes.corr('spearman')['popularity']

In [None]:
fig, ax = plt.subplots(figsize=(12,12))   
sns.heatmap(
    num_df_attributes.corr(), 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    annot=True,
    ax = ax
)

In [None]:
ax = sns.regplot(x=df['energy'], y=df['loudness'], line_kws= {'linewidth': 1, 'color': 'red'}, scatter_kws={'s': 1})
ax.set(title='Correlation: 0.68')
ax.set(xlabel='Energy', ylabel='Loudness (dB)')
plt.show()

### Boxplots over decades

In [None]:
audio_features=['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness',
                'liveness', 'loudness', 'speechiness', 'tempo', 'valence']

In [None]:
for feature in audio_features:
    fig, ax = plt.subplots(figsize=(8,8))
    plt.suptitle('')
    ax.grid(False)
    df.boxplot(column=[feature], by='decade', ax=ax, showfliers=False, grid=False)
    plt.show()

### Most populart artists

In [None]:
df['primary_artist'].value_counts().head(10)

In [None]:
ax = df['primary_artist'].value_counts().sort_values().tail(20).plot.barh(figsize=(12,7))
ax.set(xlabel='Number of Hits')
ax.set_title('Most Popular Artists', fontsize=15)

In [None]:
top_artists = df.groupby('primary_artist')['name'].count()
top_artists = top_artists[top_artists >= 5]
audio_features=['acousticness', 'danceability', 'energy','instrumentalness', 
                'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
X_artists = df.groupby('primary_artist')[audio_features].mean()
X_artists = X_artists.reindex(top_artists.index)

In [None]:
fig, ax = plt.subplots(3, 3, figsize=(16,15))
ax = ax.ravel()

for i in range(len(audio_features)):
    # word-weight dictionary
    feature = X_artists[audio_features[i]] + 1e-3
    dictionary = dict(zip(X_artists.index, feature))
    
    wordcloud = WordCloud(background_color='white', height=1400, width=1400, min_font_size=5, max_font_size=300)
    wordcloud = wordcloud.generate_from_frequencies(dictionary)
    
    ax[i].imshow(wordcloud, interpolation='bilinear')
    ax[i].set_title(audio_features[i])
    ax[i].axis('off')