In [47]:
# libraries
import warnings
from pathlib import Path

import altair as alt
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

warnings.simplefilter(action='ignore', category=FutureWarning)

# Data Processing and Cleaning
Data Source: https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset

In [48]:
# Load the raw Kaggle export; `full_df` stays untouched for reference.
full_df = pd.read_csv('kaggle.csv')

# Build the working frame from an independent copy:
#   1. skip the first column (the number column)
#   2. discard every row that contains a null value
df = (
    full_df
    .copy()
    .iloc[:, 1:]
    .dropna()
)

In [49]:
# thresholds that define "popular" songs and genres
MIN_POPULARITY = 70          # inclusive lower bound on the popularity score
MIN_TRACKS_PER_GENRE = 150   # a genre needs at least this many popular tracks

# filter to only songs with a popularity score of at least MIN_POPULARITY
df_filtered = df[df['popularity'] >= MIN_POPULARITY]

# keep only genres that appear at least MIN_TRACKS_PER_GENRE times
# among the popular songs
genre_counts = df_filtered['track_genre'].value_counts()
to_keep = genre_counts[genre_counts >= MIN_TRACKS_PER_GENRE].index

df_filtered = df_filtered[df_filtered['track_genre'].isin(to_keep)]

# list of popular genres (order of first appearance in df_filtered)
popular_genres = list(df_filtered['track_genre'].unique())

In [50]:
# custom palette from colorgorical: 14 CSS rgb() strings, one per line
custom_colors = [
    "rgb(161,222,240)",
    "rgb(36,90,98)",
    "rgb(52,241,153)",
    "rgb(106,39,134)",
    "rgb(190,158,226)",
    "rgb(90,75,114)",
    "rgb(87,155,161)",
    "rgb(166,229,144)",
    "rgb(116,53,2)",
    "rgb(145,236,23)",
    "rgb(215,55,113)",
    "rgb(64,164,59)",
    "rgb(239,102,240)",
    "rgb(9,96,19)",
]

# map genres to colors
# zip() stops at the shorter input, so any genre beyond the end of the palette
# would silently be left out of the map; fail loudly instead.
if len(popular_genres) > len(custom_colors):
    raise ValueError(
        f"{len(popular_genres)} genres but only {len(custom_colors)} colors in custom_colors; "
        "add more colors or tighten the genre filter"
    )
genre_color_map = dict(zip(popular_genres, custom_colors))

In [51]:
# compact feature set (mainly the ones that share a 0-1 scale)
short_features = [
    'popularity',
    'danceability',
    'energy',
    'acousticness',
    'instrumentalness',
    'valence',
]

# extended feature set
long_features = [
    'popularity',
    'duration_ms',
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

# Figure: Popularity Score Distribution by Genre

In [52]:
# one row per popular genre; per-genre information gets attached to this frame
by_genre = pd.DataFrame({'track_genre': popular_genres})

# calculate top 3 songs for each genre - return as a formatted string
def top_3_songs(genre, df):
    """Return the three most popular distinct track names of ``genre``.

    Parameters
    ----------
    genre : value compared against ``df['track_genre']``
    df : DataFrame with ``track_genre``, ``popularity`` and ``track_name`` columns

    Returns
    -------
    str
        Up to three track names joined by ``'; '``, most popular first.
        An empty string if no row matches ``genre``.
    """
    genre_tracks = df[df['track_genre'] == genre]
    # mergesort is stable, so rows with equal popularity keep their original order
    ranked = genre_tracks.sort_values(by='popularity', ascending=False, kind='mergesort')
    # the same track name can occur on several rows; list each name only once
    top_names = ranked['track_name'].drop_duplicates().head(3)
    return '; '.join(top_names.astype(str))

# attach each genre's formatted top-3 string as a new column
by_genre['top_3_songs'] = by_genre['track_genre'].map(
    lambda genre: top_3_songs(genre, df_filtered)
)

In [53]:
# box plot layer: one box per genre, colored with the genre palette
box_encodings = dict(
    x=alt.X('track_genre:N', sort='-y', title='Genre'),
    y=alt.Y(
        'popularity:Q',
        title='Popularity Score',
        scale=alt.Scale(domain=[70, 100]),
    ),
    color=alt.Color(
        'track_genre:N',
        scale=alt.Scale(
            domain=list(genre_color_map),
            range=list(genre_color_map.values()),
        ),
        legend=None,
    ),
)
box = alt.Chart(df_filtered).mark_boxplot().encode(**box_encodings)
# NOTE: the box plot still shows its own first tooltip; no way to disable it
# has been found yet.

# invisible overlay for tooltip (shows top 3 songs)
top_songs_lookup = alt.LookupData(by_genre, key='track_genre', fields=['top_3_songs'])
overlay_tooltip = [
    alt.Tooltip('track_genre:N', title='Genre'),
    alt.Tooltip('top_3_songs:N', title='Top 3 Songs'),
]
tooltip_overlay = (
    alt.Chart(df_filtered)
    .transform_lookup(lookup='track_genre', from_=top_songs_lookup)
    .mark_circle(opacity=0)
    .encode(x='track_genre:N', y='popularity:Q', tooltip=overlay_tooltip)
)

# combine layers
chart = (box + tooltip_overlay).properties(
    width=700,
    height=400,
    title='Popularity Score Distribution by Genre'
)

# download as png and html
chart.save("popularity_distribution.html")
chart.save("popularity_distribution.png")

# download separate html for visualization
# (create the output folder first so the save does not fail when it is missing)
Path("plots").mkdir(parents=True, exist_ok=True)
chart.save("plots/popularity_distribution.html")

# display the chart: only the last expression of a cell is rendered,
# so this has to come after the save calls
chart

# Figure: Feature Scatterplot by Genre

In [54]:
# dropdowns for the x and y axes
def _feature_dropdown(param_name, label, default):
    """Return a parameter bound to a select box listing ``long_features``."""
    selector = alt.binding_select(options=long_features, name=label)
    return alt.param(name=param_name, bind=selector, value=default)


x_axis = _feature_dropdown('x_axis', 'Feature 1: ', 'loudness')
y_axis = _feature_dropdown('y_axis', 'Feature 2: ', 'energy')

# select from legend instead of points
# Vega-Lite documents that param names should be valid JavaScript identifiers
# (alphanumeric characters, "$" or "_"), so the name must not contain a space.
genre_selection = alt.selection_point(
    fields=['track_genre'],
    bind='legend',
    name='genre_selection',
    empty=True
)

# build scatterplot
# transform_calculate copies whichever columns the two dropdowns currently name
# into x_val / y_val, so the encodings stay fixed while the plotted features change
scatter = alt.Chart(df_filtered).transform_calculate(
    x_val='datum[x_axis]',
    y_val='datum[y_axis]'
).mark_circle(size=50).encode(
    x=alt.X('x_val:Q', title='Feature 1'),
    y=alt.Y('y_val:Q', title='Feature 2'),
    # genres selected in the legend keep their palette color; the rest turn light grey
    color=alt.condition(
        genre_selection,
        alt.Color('track_genre:N',
                  scale=alt.Scale(domain=list(genre_color_map.keys()), range=list(genre_color_map.values())),
                  legend=alt.Legend(title='Genre')),
        alt.value('rgba(200, 200, 200, 0.3)')
    ),
    tooltip=[alt.Tooltip('track_genre:N', title='Genre')]
).add_params(
    x_axis,
    y_axis,
    genre_selection
).properties(
    width=600,
    height=400,
    title='Interactive Feature Plot by Genre'
)

# download as png and html
scatter.save("interactive_feature_scatter.html")
scatter.save("interactive_feature_scatter.png")

# download separate html for visualization
# (create the output folder first so the save does not fail when it is missing)
Path("plots").mkdir(parents=True, exist_ok=True)
scatter.save("plots/interactive_feature_scatter.html")

# display the chart: only the last expression of a cell is rendered,
# so this has to come after the save calls
scatter



# Figure: Distribution of Selected Feature Within Genre

In [55]:
# dropdown to select feature
feature_binding = alt.binding_select(options=long_features, name='Feature: ')
feature_param = alt.param(name='feature_choice', bind=feature_binding, value='popularity')

# dropdown to select genre (alphabetical order, first entry preselected)
genre_options = sorted(df_filtered['track_genre'].unique())
genre_param = alt.param(
    name='genre_choice',
    bind=alt.binding_select(options=genre_options, name='Genre: '),
    value=genre_options[0],
)

# histogram
# rows are first narrowed to the genre chosen in the dropdown, then the column
# named by the feature dropdown is copied into feature_value and binned
chart = alt.Chart(df_filtered).transform_filter(
    alt.datum.track_genre == genre_param
).transform_calculate(
    feature_value='datum[feature_choice]'
).mark_bar().encode(
    x=alt.X('feature_value:Q', bin=alt.Bin(maxbins=30), title='Feature Value'),
    y=alt.Y('count():Q', title='Count'),
    color=alt.Color('track_genre:N',
        scale=alt.Scale(
            domain=list(genre_color_map.keys()),
            range=list(genre_color_map.values())
        ),
        legend=None
    ),
    tooltip=[alt.Tooltip('count():Q', title='Count')]
).add_params(
    feature_param,
    genre_param
).properties(
    width=600,
    height=400,
    title='Distribution of Selected Feature Within Genre'
)

# download as html and png
chart.save("feature_distribution.html")
chart.save("feature_distribution.png")

# download separate html for visualization
# (create the output folder first so the save does not fail when it is missing)
Path("plots").mkdir(parents=True, exist_ok=True)
chart.save("plots/feature_distribution.html")

# display the chart: only the last expression of a cell is rendered,
# so this has to come after the save calls
chart

# Preprocessing for D3 Visualizations

In [56]:
# first heatmap: correlation matrix of features

# pairwise correlations of the long feature list, rounded to 2 decimals;
# the feature names move from the index into an 'index' column
subset_df = df[long_features]
corr_matrix = subset_df.corr().round(2).reset_index()

# long format: one row per (group, variable) pair with its correlation value
melted = (
    corr_matrix
    .melt(id_vars='index', var_name='variable', value_name='value')
    .rename(columns={'index': 'group'})
)

# save file
melted.to_csv("corr_data.csv", index=False)

In [57]:
# second heatmap: mean characteristics by genre

# per-genre mean of every short feature; popularity is divided by 100
# to bring it onto a 0-1 scale
genre_feature_means = (
    df_filtered
    .groupby('track_genre')[short_features]
    .mean()
    .reset_index()
    .assign(popularity=lambda means: means['popularity'] / 100)
)

# long format: one row per (genre, feature) pair
melted2 = (
    genre_feature_means
    .melt(id_vars='track_genre', var_name='feature', value_name='value')
    .rename(columns={'track_genre': 'genre'})
)

# save file
melted2.to_csv("genre_feature_means.csv", index=False)