In [None]:
import pandas as pd
import numpy as np
# Load the weekly chart data; `week_id` is parsed as a datetime so the `.dt`
# accessor can be used for the `year` column below.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/top-100-billboard/billboard.csv', parse_dates=['week_id'])
# Turn positions into scores (score = 101 - position) so that a larger number
# means a better chart placement.
df['score'] = 101 - df['week_position']
# Vectorized subtraction: a NaN previous position propagates to a NaN score,
# so no per-row NaN check is needed.
df['previous_score'] = 101 - df['previous_week_position']
df['best_score'] = 101 - df['peak_position']
df['year'] = df['week_id'].dt.year
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage for the loaded frame.
df.info()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
from plotly.express import histogram
# Distribution of `peak_position` over every row of the frame.
peak_position_hist = histogram(df, x='peak_position')
peak_position_hist

In [None]:
# Fraction of rows at each peak position; show only the most frequent one.
peak_position_share = df['peak_position'].value_counts(normalize=True)
peak_position_share.nlargest(1)

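Is it a surprise that fully 5% of rows belong to songs that have reached the top spot? It is to me. Note that each row is one song-week, so this is a share of chart entries rather than of distinct songs: a long-running hit is counted once for every week it spent on the chart.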

In [None]:
# Columns used by the scatter plots that follow.
columns = [
    'score',
    'previous_score',
    'best_score',
    'weeks_on_chart',
    'song',
    'performer',
    'week_id',
]
# Keep only those columns, then discard every row with a missing value in any of them.
not_null_df = df.loc[:, columns].dropna(axis=0, how='any')
not_null_df.shape

For the following plots we are only looking at songs already in the chart, i.e. no weeks where the previous score is nan.

In [None]:
from plotly.express import scatter
# Plot a random subset of rows rather than the full frame. The fixed
# `random_state` makes the same rows (and so the same figure) come back on
# every run, and the min() guard avoids a ValueError when fewer than 10,000
# rows are available.
scatter(data_frame=not_null_df.sample(n=min(10000, len(not_null_df)), random_state=42),
        x='score', y='previous_score', color='best_score',
        hover_name='song', hover_data=['performer', 'week_id'])

Converting positions to scores (score = 101 − position) simply gives the top songs larger numbers instead of smaller ones, so the scatter now matches our intuition: better chart performance sits in the top-right corner rather than the bottom-left.

In [None]:
from random import Random, sample  # `sample` is kept in scope for later cells

# Draw up to 1000 distinct song titles with a dedicated, seeded generator so
# the same subset (and figure) is produced on every run. The min() guard
# avoids a ValueError when there are fewer than 1000 unique titles.
song_titles = not_null_df['song'].unique().tolist()
songs = Random(42).sample(song_titles, k=min(1000, len(song_titles)))
# Matching is on the title alone, so every row whose `song` equals a sampled
# title is kept, whichever performer it belongs to.
songs_df = not_null_df[not_null_df['song'].isin(songs)]
print(songs_df.shape)
scatter(data_frame=songs_df, x='score', y='previous_score', color='best_score', hover_name='song', hover_data=['performer', 'week_id'])

In [None]:
from random import Random

# Draw up to 1000 distinct performers with a dedicated, seeded generator so
# the same subset (and figure) is produced on every run. The min() guard
# avoids a ValueError when there are fewer than 1000 unique performers.
performer_names = not_null_df['performer'].unique().tolist()
performers = Random(42).sample(performer_names, k=min(1000, len(performer_names)))
# Keep every row belonging to one of the sampled performers.
performers_df = not_null_df[not_null_df['performer'].isin(performers)]
print(performers_df.shape)
scatter(data_frame=performers_df, x='score', y='previous_score', color='best_score', hover_name='song', hover_data=['performer', 'week_id'])

In [None]:
from plotly.express import bar
# Count rows per performer and keep the 20 largest counts. The index and the
# count column are named explicitly so the 'performer'/'count' columns exist
# regardless of pandas version (value_counts() only names its result 'count'
# from pandas 2.0 onwards).
top_performers = (
    df['performer']
    .value_counts()
    .nlargest(n=20)
    .rename_axis('performer')
    .reset_index(name='count')
)
bar(data_frame=top_performers, x='performer', y='count')

These are the artists with the most song-weeks in the dataset. With a little more work we can see who had the best year(s).

In [None]:
# Count rows per (performer, year) pair and keep the 50 largest counts. The
# count column is named explicitly so that `y='count'` resolves regardless of
# pandas version (value_counts() only names its result 'count' from pandas 2.0).
performer_year_counts = (
    df[['performer', 'year']]
    .value_counts()
    .nlargest(n=50)
    .reset_index(name='count')
)
bar(data_frame=performer_year_counts, x='performer', y='count', color='year')

Because each performer takes up linear space we can't put a lot of data in this chart, and we probably end up inviting comparisons between artists that are not reasonable. Maybe this data would look better as a scatter plot.

In [None]:
# Count rows per (performer, year) pair and keep the 1000 largest counts. The
# count column is named explicitly so that `y='count'` resolves regardless of
# pandas version (value_counts() only names its result 'count' from pandas 2.0).
performer_year_top = (
    df[['performer', 'year']]
    .value_counts()
    .nlargest(n=1000)
    .reset_index(name='count')
)
scatter(data_frame=performer_year_top, x='year', y='count', color='year',
        hover_name='performer')

We can fit a lot more data into this chart, probably to the point of diminishing returns. This gives us a sense of what chart domination looks like on an annual-bucket basis. I'm not sure using the year for both the x dimension and the color is buying us much here, but we only have three data dimensions to plot and one of those is categorical.