In [1]:
import numpy as np
import pandas as pd
# Load the Discogs electronic-music listing from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/discogs-electronic-music-dataset-1990-2000/discogs_electronic.csv')


def _parse_number(column: pd.Series) -> pd.Series:
    """Parse a column of numeric text into a float Series.

    Dollar signs and thousands separators are removed before parsing. The
    '--' placeholder and missing cells become NaN. Any other non-numeric
    text still raises, so unexpected values are not silently dropped.
    """
    text = column.astype(str).str.strip().str.replace(r'[$,]', '', regex=True)
    # astype(str) turns missing cells into the text 'nan'; mask them (and the
    # '--' placeholder) back to real NaN before the numeric conversion.
    text = text.mask(column.isna() | text.eq('--'))
    return pd.to_numeric(text).astype(float)


# Numeric versions of the text columns; the raw columns are kept as loaded.
df['lowest price (USD)'] = _parse_number(df['lowest_price'])
df['median price (USD)'] = _parse_number(df['median_price'])
df['highest price (USD)'] = _parse_number(df['highest_price'])
df['mean rating'] = _parse_number(df['average_rating'])
# release_date is 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD' in the rows shown below;
# the first four characters are the year. Casting to str first keeps this
# working if pandas parsed the column as numbers. A missing date still raises.
df['release year'] = df['release_date'].astype(str).str[:4].astype('int64')

df.head()

Unnamed: 0,artist,title,label,country,format,release_date,genre,styles,have,want,num_ratings,average_rating,lowest_price,median_price,highest_price,lowest price (USD),median price (USD),highest price (USD),mean rating,release year
0,Subterfuge,The Foundation Series Volume One,Visillusion,US,Vinyl,1997,Electronic,"House,Techno,Electro",93,423,31,3.81,$2.00,$39.02,$86.96,2.0,39.02,86.96,3.81,1997
1,Titiyo,My Body Says Yes,Arista,UK,Vinyl,1991-04-01,Electronic,House,136,30,11,4.36,$0.43,$1.88,$5.43,0.43,1.88,5.43,4.36,1991
2,Mariah Carey,Joy To The World,Columbia,US,Vinyl,1994-11,Electronic,"House,Garage House,Holiday",75,106,5,4.4,$1.99,$16.29,$33.71,1.99,16.29,33.71,4.4,1994
3,Rhythmstate,Everybody,Nitebeat,US,Vinyl,1997,Electronic,"House,Breakbeat",22,57,6,4.0,$2.00,$7.00,$25.00,2.0,7.0,25.0,4.0,1997
4,Exposé,"Stop, Listen, Look & Think",Arista,US,Vinyl,1990,Electronic,House,115,19,12,3.83,$0.79,$1.50,$4.34,0.79,1.5,4.34,3.83,1990


In [2]:
from plotly.express import bar
# Bar chart of the 50 most frequent values for each categorical column.
for column in ['label', 'country', 'format', 'genre']:
    # Name the index and the counts explicitly instead of relying on the
    # labels value_counts() assigns: pandas < 2.0 names them differently,
    # which would break x=column / y='count' below.
    top_counts = (
        df[column]
        .value_counts()
        .head(n=50)
        .rename_axis(column)
        .reset_index(name='count')
    )
    bar(data_frame=top_counts, x=column, y='count').show()

In [3]:
from plotly.express import histogram
# One histogram per numeric column. The count axis is log-scaled for every
# column except 'mean rating' and 'release year'.
for column in [
    'have',
    'want',
    'num_ratings',
    'lowest price (USD)',
    'median price (USD)',
    'highest price (USD)',
    'mean rating',
    'release year',
]:
    use_log_counts = not (column == 'mean rating' or column == 'release year')
    histogram(data_frame=df, x=column, log_y=use_log_counts).show()

In [4]:
from plotly.express import scatter
# Rows with a median price above 20 USD: 'have' against 'want' on log-log
# axes, coloured by median price, one facet per release year (two per row).
scatter(
    data_frame=df.loc[df['median price (USD)'].gt(20)],
    x='want',
    y='have',
    color='median price (USD)',
    hover_name='title',
    facet_col='release year',
    facet_col_wrap=2,
    height=1000,
    log_x=True,
    log_y=True,
)

We expect want and have to be positively correlated, but by and large they do not explain high median prices.

In [5]:
from plotly.express import imshow
# Heatmap of the pairwise correlations between the have, want, median-price
# and mean-rating columns.
imshow(
    img=df.loc[:, ['have', 'want', 'median price (USD)', 'mean rating']].corr(),
)

It is probably not surprising to see that median prices are driven by the number of people who want the item; this plot also shows that median prices are not generally driven by ratings or the number of people who have an item.

In [6]:
# Rows with a median price above 20 USD and a non-null mean rating:
# median price against 'want' on log-log axes, coloured by mean rating,
# faceted by release year (two per row), with a LOWESS trendline.
scatter(
    data_frame=df.loc[df['median price (USD)'].gt(20) & df['mean rating'].notna()],
    x='want',
    y='median price (USD)',
    color='mean rating',
    hover_name='title',
    facet_col='release year',
    facet_col_wrap=2,
    height=1200,
    log_x=True,
    log_y=True,
    trendline='lowess',
)

In [7]:
# Same filter (median price above 20 USD, non-null mean rating) with all
# release years on one set of log-log axes and a LOWESS trendline.
scatter(
    data_frame=df.loc[df['median price (USD)'].gt(20) & df['mean rating'].notna()],
    x='want',
    y='median price (USD)',
    color='mean rating',
    hover_name='title',
    log_x=True,
    log_y=True,
    trendline='lowess',
)

Not surprisingly, high-priced, highly desired items tend to be highly rated.

In [8]:
# Distribution of mean rating over rows with a median price above 20 USD
# and a non-null mean rating.
histogram(
    data_frame=df.loc[df['median price (USD)'].gt(20) & df['mean rating'].notna()],
    x='mean rating',
)

In [9]:
# Average of 'mean rating' over rows with a median price above 20 USD and a
# non-null mean rating.
df.loc[
    df['median price (USD)'].gt(20) & df['mean rating'].notna(),
    'mean rating',
].mean()

4.3993005286701905

In fact, items with a median price above 20 USD have mean ratings that cluster around 4.4.