# CVIT Summer Workshop 2020

Day 01 (Part 01): Introduction to Pandas

Instructor: Karthik Gupta

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

## Part 01: Loading Data

Load the MusicTop50.csv file into a pandas dataframe.

In [2]:
# Root of the raw gist from which MusicTop50.csv is loaded. The path embeds
# a revision hash, so the download target does not move.
base_url = (
    'https://gist.githubusercontent.com/'
    'alti-tude/710cb9d4dfc7ebcd0afb9cf93d8f6a8d/'
    'raw/574730ba009e69b81d6c79f2fef2c3dd5145db0a/'
)

In [3]:
# Read the chart CSV straight from the gist; the file's first column holds
# the row numbers, so it is used as the index rather than kept as data.
music_df = pd.read_csv(f"{base_url}MusicTop50.csv", index_col=0)

## Part 02: Visualizing Data

Visualize the top 5 and bottom 5 rows of the dataset.

In [4]:
# Preview the first five rows of the dataset.
music_df.head(n=5)

Unnamed: 0,Track.Name,Artist.Name,Genre,Beats.Per.Minute,Energy,Danceability..db..,Loudness..,Liveness.,Valence (),Length,Acousticness,Speechiness,Popularity
1,Se�orita,Shawn Mendes,canadian pop,117.0,55.0,76,-6,8.0,75.0,191.0,4,3,79.0
2,China,Anuel AA,reggaeton flow,105.0,81.0,79,-4,8.0,61.0,302.0,8,9,92.0
3,boyfriend (with Social House),Ariana Grande,dance pop,190.0,80.0,40,-4,16.0,70.0,186.0,12,46,85.0
4,Beautiful People (feat. Khalid),Ed Sheeran,pop,93.0,65.0,64,-8,8.0,55.0,198.0,12,19,86.0
5,Goodbyes (Feat. Young Thug),Post Malone,dfw rap,,65.0,58,-4,11.0,18.0,175.0,45,7,94.0


In [5]:
# Preview the last five rows of the dataset.
music_df.tail(n=5)

Unnamed: 0,Track.Name,Artist.Name,Genre,Beats.Per.Minute,Energy,Danceability..db..,Loudness..,Liveness.,Valence (),Length,Acousticness,Speechiness,Popularity
46,One Thing Right,Marshmello,brostep,88.0,62.0,66,-2,,44.0,182.0,7,5,88.0
47,Te Robar�,Nicky Jam,latin,176.0,75.0,67,-4,8.0,80.0,202.0,24,6,88.0
48,Happier,Marshmello,brostep,100.0,79.0,69,-3,17.0,67.0,214.0,19,5,88.0
49,Call You Mine,The Chainsmokers,edm,104.0,70.0,59,-6,41.0,50.0,218.0,23,3,88.0
50,Cross Me (feat. Chance the Rapper & PnB Rock),Ed Sheeran,pop,95.0,79.0,75,-6,7.0,61.0,206.0,21,12,82.0


## Part 03: Cleaning Data

Observe the column names: several contain extraneous characters (for example `Danceability..db..` and `Valence ()`) that make them awkward to reference correctly in later processing. Rename these columns to something more convenient.

In [6]:
# Summarise the frame: per-column non-null counts and dtypes. Any column
# whose non-null count is below the total number of entries has missing values.
music_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 1 to 50
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Track.Name          50 non-null     object 
 1   Artist.Name         50 non-null     object 
 2   Genre               50 non-null     object 
 3   Beats.Per.Minute    48 non-null     float64
 4   Energy              49 non-null     float64
 5   Danceability..db..  50 non-null     int64  
 6   Loudness..          50 non-null     int64  
 7   Liveness.           48 non-null     float64
 8   Valence ()          49 non-null     float64
 9   Length              47 non-null     float64
 10  Acousticness        50 non-null     int64  
 11  Speechiness         50 non-null     int64  
 12  Popularity          49 non-null     float64
dtypes: float64(6), int64(4), object(3)
memory usage: 5.5+ KB


In [7]:
# Drop fully duplicated rows into a scratch frame (the first occurrence is
# kept). Comparing its shape with the original shows whether any were removed.
temp_df = music_df.drop_duplicates(keep="first")
print("Duplicateless DataFrame Shape: ", temp_df.shape)

Duplicateless DataFrame Shape:  (50, 13)


In [8]:
# List the raw column labels to spot the ones carrying stray punctuation.
print("Columns: ", music_df.columns)

Columns:  Index(['Track.Name', 'Artist.Name', 'Genre', 'Beats.Per.Minute', 'Energy',
       'Danceability..db..', 'Loudness..', 'Liveness.', 'Valence ()', 'Length',
       'Acousticness', 'Speechiness', 'Popularity'],
      dtype='object')


In [9]:
# Replace the four labels that carry stray punctuation with clean names.
# rename() returns a new frame, which is rebound to music_df.
music_df = music_df.rename(
    columns={
        "Danceability..db..": "Danceability",
        "Loudness..": "Loudness",
        "Liveness.": "Liveness",
        "Valence ()": "Valence",
    }
)

# Confirm the new labels.
print("Columns: ", music_df.columns)

Columns:  Index(['Track.Name', 'Artist.Name', 'Genre', 'Beats.Per.Minute', 'Energy',
       'Danceability', 'Loudness', 'Liveness', 'Valence', 'Length',
       'Acousticness', 'Speechiness', 'Popularity'],
      dtype='object')


Some values in the dataset are missing. Try experimenting with:

- Dropping the rows that contain missing values
- Filling the missing values with the column mean or median

In [10]:
# Boolean mask of the frame: True marks a missing cell.
music_df.isna()

Unnamed: 0,Track.Name,Artist.Name,Genre,Beats.Per.Minute,Energy,Danceability,Loudness,Liveness,Valence,Length,Acousticness,Speechiness,Popularity
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,True,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,True,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False,False,False
10,False,False,False,False,False,False,False,False,False,False,False,False,False


In [11]:
# Show what remains when every row with at least one missing value is
# dropped. The result is only displayed; music_df itself is not modified.
music_df.dropna(axis=0, how="any")

Unnamed: 0,Track.Name,Artist.Name,Genre,Beats.Per.Minute,Energy,Danceability,Loudness,Liveness,Valence,Length,Acousticness,Speechiness,Popularity
1,Se�orita,Shawn Mendes,canadian pop,117.0,55.0,76,-6,8.0,75.0,191.0,4,3,79.0
2,China,Anuel AA,reggaeton flow,105.0,81.0,79,-4,8.0,61.0,302.0,8,9,92.0
3,boyfriend (with Social House),Ariana Grande,dance pop,190.0,80.0,40,-4,16.0,70.0,186.0,12,46,85.0
4,Beautiful People (feat. Khalid),Ed Sheeran,pop,93.0,65.0,64,-8,8.0,55.0,198.0,12,19,86.0
6,I Don't Care (with Justin Bieber),Ed Sheeran,pop,102.0,68.0,80,-5,9.0,84.0,220.0,9,4,84.0
8,How Do You Sleep?,Sam Smith,pop,111.0,68.0,48,-5,8.0,35.0,202.0,15,9,90.0
9,Old Town Road - Remix,Lil Nas X,country rap,136.0,62.0,88,-6,11.0,64.0,157.0,5,10,87.0
10,bad guy,Billie Eilish,electropop,135.0,43.0,70,-11,10.0,56.0,194.0,33,38,95.0
13,Someone You Loved,Lewis Capaldi,pop,110.0,41.0,50,-6,11.0,45.0,182.0,75,3,88.0
14,Otro Trago - Remix,Sech,panamanian pop,176.0,79.0,73,-2,6.0,76.0,288.0,7,20,87.0


In [12]:
# Impute missing numeric values with the mean of their column.
# numeric_only=True is needed because the frame also holds text columns
# (Track.Name, Artist.Name, Genre): pandas >= 2.0 raises a TypeError when
# asked to average them, while older versions silently skipped them.
# fillna() returns a new frame, which is rebound to music_df.
music_df = music_df.fillna(music_df.mean(numeric_only = True))

# Row 5 had no Beats.Per.Minute value; it should now show the column mean.
music_df.head()

Unnamed: 0,Track.Name,Artist.Name,Genre,Beats.Per.Minute,Energy,Danceability,Loudness,Liveness,Valence,Length,Acousticness,Speechiness,Popularity
1,Se�orita,Shawn Mendes,canadian pop,117.0,55.0,76,-6,8.0,75.0,191.0,4,3,79.0
2,China,Anuel AA,reggaeton flow,105.0,81.0,79,-4,8.0,61.0,302.0,8,9,92.0
3,boyfriend (with Social House),Ariana Grande,dance pop,190.0,80.0,40,-4,16.0,70.0,186.0,12,46,85.0
4,Beautiful People (feat. Khalid),Ed Sheeran,pop,93.0,65.0,64,-8,8.0,55.0,198.0,12,19,86.0
5,Goodbyes (Feat. Young Thug),Post Malone,dfw rap,119.354167,65.0,58,-4,11.0,18.0,175.0,45,7,94.0


## Part 05: Plotting Data

Use Plotly to create:

1. A line plot having two lines.
    - Energy (y-axis) vs Loudness (x-axis)
    - Beats.Per.Minute vs Loudness
    - Also, show the legend and label the x-axis

In [13]:
# Re-check the cleaned column labels before choosing the series to plot.
music_df.columns

Index(['Track.Name', 'Artist.Name', 'Genre', 'Beats.Per.Minute', 'Energy',
       'Danceability', 'Loudness', 'Liveness', 'Valence', 'Length',
       'Acousticness', 'Speechiness', 'Popularity'],
      dtype='object')

In [17]:
# Sort once by loudness so both lines run left to right along the x-axis.
loudness_sorted = music_df.sort_values(by = "Loudness")

fig = go.Figure()

# One trace per metric, each plotted against the same Loudness values.
# The `name` of each trace is what appears in the legend.
for column, label in (("Energy", "Energy"),
                      ("Beats.Per.Minute", "Beats Per Minute")):
    fig.add_trace(go.Scatter(x = loudness_sorted["Loudness"],
                             y = loudness_sorted[column],
                             name = label))

# Last expression of the cell: labels the x-axis and renders the figure.
fig.update_layout(xaxis_title = "Loudness")

In [15]:
# Count how many rows (tracks) each artist has in the dataset.
names, counts = np.unique(music_df['Artist.Name'], return_counts = True)

# One row per artist, with the two parallel arrays as columns.
df2 = pd.DataFrame({"Name": names, "Counts": counts})

# Preview the artists with the most tracks.
df2.sort_values("Counts", ascending = False).head()

Unnamed: 0,Name,Counts
9,Ed Sheeran,4
19,Lil Nas X,2
32,Shawn Mendes,2
25,Marshmello,2
28,Post Malone,2


In [21]:
# Order artists by track count, highest first, and keep the ordering in df2.
df2 = df2.sort_values("Counts", ascending = False)

# Bar chart of the ten artists with the most tracks.
top_artists = df2.iloc[:10]
fig = go.Figure(data = go.Bar(x = top_artists.Name, y = top_artists.Counts))

# Last expression of the cell: labels both axes and renders the figure.
fig.update_layout(xaxis_title = "Artists", yaxis_title = "Number of Songs")