In [1]:
# import libraries
import pandas as pd
import plotly.graph_objs as go

In [2]:
# load dataset
reviews_df = pd.read_csv('../data/rym_top_5000_all_time.csv', index_col = False)
reviews_df.head()

Unnamed: 0,Ranking,Album,Artist Name,Release Date,Genres,Descriptors,Average Rating,Number of Ratings,Number of Reviews
0,1.0,OK Computer,Radiohead,16 June 1997,"Alternative Rock, Art Rock","melancholic, anxious, futuristic, alienation, ...",4.23,70382,1531
1,2.0,Wish You Were Here,Pink Floyd,12 September 1975,"Progressive Rock, Art Rock","melancholic, atmospheric, progressive, male vo...",4.29,48662,983
2,3.0,In the Court of the Crimson King,King Crimson,10 October 1969,"Progressive Rock, Art Rock","fantasy, epic, progressive, philosophical, com...",4.3,44943,870
3,4.0,Kid A,Radiohead,3 October 2000,"Art Rock, Experimental Rock, Electronic","cold, melancholic, futuristic, atmospheric, an...",4.21,58590,734
4,5.0,To Pimp a Butterfly,Kendrick Lamar,15 March 2015,"Conscious Hip Hop, West Coast Hip Hop, Jazz Rap","political, conscious, poetic, protest, concept...",4.27,44206,379


In [3]:
# dataframe information
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Ranking            5000 non-null   float64
 1   Album              5000 non-null   object 
 2   Artist Name        5000 non-null   object 
 3   Release Date       5000 non-null   object 
 4   Genres             5000 non-null   object 
 5   Descriptors        4886 non-null   object 
 6   Average Rating     5000 non-null   float64
 7   Number of Ratings  5000 non-null   object 
 8   Number of Reviews  5000 non-null   int64  
dtypes: float64(2), int64(1), object(6)
memory usage: 351.7+ KB


In [4]:
# any duplicates
reviews_df.duplicated().sum()

0

In [5]:
# how about null values
null = reviews_df.isna().sum()/len(reviews_df)
null[null > 0].sort_values()

Descriptors    0.0228
dtype: float64

In [6]:
# columns with no missing values
no_nulls = set(reviews_df.columns[reviews_df.isnull().mean()==0])
no_nulls

{'Album',
 'Artist Name',
 'Average Rating',
 'Genres',
 'Number of Ratings',
 'Number of Reviews',
 'Ranking',
 'Release Date'}

# Notes on data
1. Release date is an object field
2. Number of Ratings column is an object field
3. Descriptors field is not needed

In [7]:
def rename_cols(df):
    '''
    rename_cols: rename columns to lower case, changing spaces to underscores
    Input: dataframe
    Output: dataframe
    '''
    df.columns = [i.replace(' ', '_').lower() for i in df.columns]
    
    return(df)

In [8]:
album_rev_df = rename_cols(reviews_df)
album_rev_df.columns

Index(['ranking', 'album', 'artist_name', 'release_date', 'genres',
       'descriptors', 'average_rating', 'number_of_ratings',
       'number_of_reviews'],
      dtype='object')

In [9]:
def change_dtype(df, cols, type):
    '''
    change_dtype: change data type of columns 
    input: dataframe, list of columns, data type
    output: dataframe with changed column data types
    '''
    for col in cols:
        df[col] = df[col].astype(type)
        
    return(df)

In [10]:
# Update datatypes

int_cols = ['number_of_ratings'] # change cols to int

# for number_of_ratings, first remove comma
album_rev_df['number_of_ratings'] = album_rev_df['number_of_ratings'].str.replace(',', '')

change_dtype(album_rev_df, int_cols, 'int64')

# change release_date to datetime
album_rev_df['release_date'] = pd.to_datetime(album_rev_df['release_date'])

# check
album_rev_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   ranking            5000 non-null   float64       
 1   album              5000 non-null   object        
 2   artist_name        5000 non-null   object        
 3   release_date       5000 non-null   datetime64[ns]
 4   genres             5000 non-null   object        
 5   descriptors        4886 non-null   object        
 6   average_rating     5000 non-null   float64       
 7   number_of_ratings  5000 non-null   int64         
 8   number_of_reviews  5000 non-null   int64         
dtypes: datetime64[ns](1), float64(2), int64(2), object(4)
memory usage: 351.7+ KB


In [11]:
# delete unneeded columns
album_rev_df = album_rev_df.drop(columns = ['descriptors'], axis = 1)

# check
album_rev_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   ranking            5000 non-null   float64       
 1   album              5000 non-null   object        
 2   artist_name        5000 non-null   object        
 3   release_date       5000 non-null   datetime64[ns]
 4   genres             5000 non-null   object        
 5   average_rating     5000 non-null   float64       
 6   number_of_ratings  5000 non-null   int64         
 7   number_of_reviews  5000 non-null   int64         
dtypes: datetime64[ns](1), float64(2), int64(2), object(3)
memory usage: 312.6+ KB


In [12]:
top_ranked_albums = album_rev_df.query('ranking >= 1 & ranking <=10')
top_ranked_albums.head()

Unnamed: 0,ranking,album,artist_name,release_date,genres,average_rating,number_of_ratings,number_of_reviews
0,1.0,OK Computer,Radiohead,1997-06-16,"Alternative Rock, Art Rock",4.23,70382,1531
1,2.0,Wish You Were Here,Pink Floyd,1975-09-12,"Progressive Rock, Art Rock",4.29,48662,983
2,3.0,In the Court of the Crimson King,King Crimson,1969-10-10,"Progressive Rock, Art Rock",4.3,44943,870
3,4.0,Kid A,Radiohead,2000-10-03,"Art Rock, Experimental Rock, Electronic",4.21,58590,734
4,5.0,To Pimp a Butterfly,Kendrick Lamar,2015-03-15,"Conscious Hip Hop, West Coast Hip Hop, Jazz Rap",4.27,44206,379


In [66]:
# we'll look at the top 10 ranked albums as of 2021,
# so will take a subset of the data

top_ranked_albums = album_rev_df.query('ranking >= 1 & ranking <=10')

# initialize graph_one list

graph_one = [] 

# define graph_one - top 10 ranked albums as of 2021

graph_one.append(
    go.Bar(
        y = top_ranked_albums.album,
        x = top_ranked_albums.ranking,
        orientation = 'h',
        text=top_ranked_albums.artist_name,
        marker = dict(
                line=dict(
                    width=2,
                    color='white'
                    )
                )
       )
    )

# define layout_one 

layout_one = dict(title = 'Top 10 Ranked Albums of All Time (as of 2021)',
            xaxis_title = 'Ranking',
            yaxis_title = 'Album Name',
            xaxis = dict(
                tickmode = 'linear',
                tick0 = 0,
                dtick = 1.0
                )
            )

print(graph_one)
print(layout_one)


[Bar({
    'marker': {'line': {'color': 'white', 'width': 2}},
    'orientation': 'h',
    'text': array(['Radiohead', 'Pink Floyd', 'King Crimson', 'Radiohead',
                   'Kendrick Lamar', 'My Bloody Valentine', 'Pink Floyd', 'The Beatles',
                   'The Velvet Underground & Nico', 'David Bowie'], dtype=object),
    'x': array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]),
    'y': array(['OK Computer', 'Wish You Were Here', 'In the Court of the Crimson King',
                'Kid A', 'To Pimp a Butterfly', 'Loveless', 'The Dark Side of the Moon',
                'Abbey Road', 'The Velvet Underground & Nico',
                'The Rise and Fall of Ziggy Stardust and the Spiders From Mars'],
               dtype=object)
})]
{'title': 'Top 10 Ranked Albums of All Time (as of 2021)', 'xaxis_title': 'Ranking', 'yaxis_title': 'Album Name', 'xaxis': {'tickmode': 'linear', 'tick0': 0, 'dtick': 1.0}}


In [57]:

fig = go.Figure()

fig.add_trace(
    go.Bar(
    y=top_ranked_albums.album,
    x=top_ranked_albums.ranking,
    orientation='h',
    text=top_ranked_albums.artist_name
   )
)

fig.update_layout(
    xaxis = dict(
        tickmode = 'linear',
        tick0 = 0,
        dtick = 1.0
    )
)

fig.update_traces(marker=dict(line=dict(width=2,
                                        color='white')))

fig.update_layout(
    title = 'Top 10 Ranked Albums of All Time (as of 2021)',
    xaxis_title = 'Ranking',
    yaxis_title = 'Album Name'
)

fig.show()

In [65]:
# initialize graph_two list

graph_two = [] 

# define graph_two - the top 10 ranked albums by release date 

graph_two.append(
    go.Scatter(
        y=top_ranked_albums.album,
        x=top_ranked_albums.release_date,
        orientation='h',
        mode='markers',
        text=top_ranked_albums['artist_name'],
        marker=dict(size=12, 
                    line=dict(width=2, 
                              color='white'
                             )
                   )
    )
)

# define layout_two

layout_two = dict(title = 'Top 10 Ranked Albums of All Time (as of 2021) by Year Released',
                  xaxis_title = 'Year Released',
                  yaxis_title = 'Album Name'
                 )

print(graph_two)
print(layout_two)

[Scatter({
    'marker': {'line': {'color': 'white', 'width': 2}, 'size': 12},
    'mode': 'markers',
    'orientation': 'h',
    'text': array(['Radiohead', 'Pink Floyd', 'King Crimson', 'Radiohead',
                   'Kendrick Lamar', 'My Bloody Valentine', 'Pink Floyd', 'The Beatles',
                   'The Velvet Underground & Nico', 'David Bowie'], dtype=object),
    'x': array([datetime.datetime(1997, 6, 16, 0, 0),
                datetime.datetime(1975, 9, 12, 0, 0),
                datetime.datetime(1969, 10, 10, 0, 0),
                datetime.datetime(2000, 10, 3, 0, 0),
                datetime.datetime(2015, 3, 15, 0, 0),
                datetime.datetime(1991, 11, 4, 0, 0),
                datetime.datetime(1973, 3, 23, 0, 0),
                datetime.datetime(1969, 9, 26, 0, 0),
                datetime.datetime(1967, 3, 12, 0, 0),
                datetime.datetime(1972, 6, 16, 0, 0)], dtype=object),
    'y': array(['OK Computer', 'Wish You Were Here', 'In the Court of 

In [14]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(
    y=top_ranked_albums.album,
    x=top_ranked_albums.release_date,
    orientation='h',
    mode='markers',
    text=top_ranked_albums['artist_name']
   )
)

fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='white')))

fig.update_layout(
    title = 'Top 10 Ranked Albums of All Time (as of 2021) by Year Released',
    xaxis_title = 'Year Released',
    yaxis_title = 'Album Name'
)

fig.show()

In [15]:
most_ranked_artists = album_rev_df.artist_name.value_counts().nlargest(10)
most_ranked_artists.to_frame().reset_index(drop = True, inplace = True)

In [16]:
most_ranked_artists.info()

<class 'pandas.core.series.Series'>
Index: 10 entries, Various Artists to Herbie Hancock
Series name: artist_name
Non-Null Count  Dtype
--------------  -----
10 non-null     int64
dtypes: int64(1)
memory usage: 160.0+ bytes


In [68]:
# for the third graph, we want to find the artists with the most ranked albums,
# so we will subset the data 

most_ranked_artists = album_rev_df.artist_name.value_counts().nlargest(10) # take the top 10 artists
most_ranked_artists.to_frame().reset_index(drop = True, inplace = True) # create dataframe from list

# initialize graph_three

graph_three = []

# define graph_three

graph_three.append(
    go.Bar(
        y=most_ranked_artists,
        x=most_ranked_artists.index,
        text=most_ranked_artists
    )
)

# define layout_three

layout_three = dict(title = 'Artists with most ranked albums (as of 2021)',
                    xaxis_title = 'Artist',
                    yaxis_title = 'Count'
                   )

print(graph_three)
print(layout_three)

[Bar({
    'text': array([25., 17., 14., 13., 12., 11., 10.,  9.,  9.,  9.]),
    'x': array(['Various Artists', 'Miles Davis', 'John Coltrane', 'Tom Waits',
                'John Williams', 'Bob Dylan', 'The Beatles', 'The Fall',
                'Pharoah Sanders', 'Herbie Hancock'], dtype=object),
    'y': array([25, 17, 14, 13, 12, 11, 10,  9,  9,  9])
})]
{'title': 'Artists with most ranked albums (as of 2021)', 'xaxis_title': 'Artist', 'yaxis_title': 'Count'}


In [17]:
fig = go.Figure()

fig.add_trace(
    go.Bar(
        y=most_ranked_artists,
        x=most_ranked_artists.index,
        text=most_ranked_artists
   )
)

fig.update_traces(marker=dict(line=dict(width=2,
                                        color='white')))

fig.update_layout(
    title = 'Artists with most ranked albums (as of 2021)',
    xaxis_title = 'Artist',
    yaxis_title = 'Count'
)

fig.show()

In [152]:
album_rev_df.average_rating.nlargest(15)

13    4.34
2     4.30
15    4.30
1     4.29
4     4.27
9     4.26
11    4.26
7     4.25
12    4.25
5     4.24
0     4.23
8     4.23
10    4.23
18    4.23
3     4.21
Name: average_rating, dtype: float64

In [71]:
# initialize graph_four

graph_four = []

# define graph_four

graph_four.append(
    go.Bar(
        x = album_rev_df.album,
        y = album_rev_df.average_rating.nlargest(10),
        text = album_rev_df.artist_name,
        marker = dict(line = dict(width = 2,
                                  color='white'))
        )
)

# define layout_four

layout_four = dict(title = 'Albums with highest average ratings - Top 10 (as of 2021)',
                  xaxis_title = 'Album',
                  yaxis_title = 'Rating',
                  yaxis = dict(tickmode = 'linear',
                              tick0 = 0.0,
                              dtick = 0.5),
                  autosize=False,
                  width = 1000,
                  height = 700)
                
print(graph_four)
print(layout_four)

[Bar({
    'marker': {'line': {'color': 'white', 'width': 2}},
    'text': array(['Radiohead', 'Pink Floyd', 'King Crimson', ..., 'And Also the Trees',
                   'Studio für elektronische Musik des Westdeutschen Rundfunks, Köln & Karlheinz Stockhausen',
                   'Minnie Riperton'], dtype=object),
    'x': array(['OK Computer', 'Wish You Were Here', 'In the Court of the Crimson King',
                ..., '(Listen For) The Rag and Bone Man',
                'Hymnen für elektronische und konkrete Klänge',
                'Adventures in Paradise'], dtype=object),
    'y': array([4.34, 4.3 , 4.3 , 4.29, 4.27, 4.26, 4.26, 4.25, 4.25, 4.24])
})]
{'title': 'Artists with highest average ratings - Top 10 (as of 2021)', 'xaxis_title': 'Artist', 'yaxis_title': 'Rating', 'yaxis': {'tickmode': 'linear', 'tick0': 0.0, 'dtick': 0.5}, 'autosize': False, 'width': 1000, 'height': 700}


In [72]:
fig = go.Figure()

fig.add_trace(
    go.Bar(
        x=album_rev_df.album,
        y=album_rev_df.average_rating.nlargest(10),
        text=album_rev_df.artist_name
   )
)

fig.update_traces(marker=dict(line=dict(width=2, color='white')))

fig.update_layout(
    yaxis = dict(
        tickmode = 'linear',
        tick0 = 0.0,
        dtick = 0.5
    ),
    autosize=False,
    width=1000,
    height=700
)

fig.update_layout(
    title = 'Artists with highest average ratings (as of 2021)',
    xaxis_title = 'Artist',
    yaxis_title = 'Rating'
)

fig.show()