# Module 2: Know your Data

In [1]:
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np

# Read the Spotify tracks dataset from the notebook's working directory.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was saved into the CSV -- confirm, then consider index_col=0.
DATA_PATH = './spotify.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first five rows (five is head()'s default).
df.head(n=5)

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6


# Mean, Median, and Mode

In [None]:
# Central-tendency measures for the `in_spotify_playlists` column.
playlist_counts = df['in_spotify_playlists']

mean = playlist_counts.mean()
median = playlist_counts.median()
mode = stats.mode(playlist_counts)

# Arithmetic average of the column
print(f'Mean: {mean}')

# The median is the middle value once the numbers are sorted
print(f'Median: {median}')

# stats.mode returns a (mode, count) pair; index 0 is the mode itself
print(f'Mode: {mode[0]}')

# Boxplot
I want to construct a boxplot for the ```bpm``` column.

We will need ```matplotlib``` for this.

In [None]:
# Horizontal boxplot of the `bpm` column.
# The explicit figure/axes interface is used so the plot can be given a title
# and an axis label; without them the figure cannot be read on its own.
fig, ax = plt.subplots()
ax.boxplot(df['bpm'], vert = False)
ax.set_title('Boxplot of BPM')
ax.set_xlabel('BPM')
plt.show()

# Five Number Summary
```numpy``` will help here. I want the five-number summary (FNS) for the ```bpm``` column:

- min
- lower quartile
- median
- upper quartile
- max

In [6]:
# Five-number summary of the `bpm` column, one statistic at a time.
# The extremes are stored as bpm_min / bpm_max so that Python's built-in
# min() and max() are not overwritten for the rest of the notebook.
bpm_min = np.min(df['bpm'])
q1 = np.percentile(df['bpm'], 25)
median = np.percentile(df['bpm'], 50)
q3 = np.percentile(df['bpm'], 75)
bpm_max = np.max(df['bpm'])

# We can do this on one line: the 0th and 100th percentiles are the min and max.
# The array gets its own name so it does not clash with the
# five_number_summary() function defined further down the notebook.
bpm_summary = np.percentile(df['bpm'], [0, 25, 50, 75, 100])

print('----Five Number Summary----')
print(f'Min: {bpm_summary[0]}')
print(f'Q1: {bpm_summary[1]}')
print(f'Median: {bpm_summary[2]}')
print(f'Q3: {bpm_summary[3]}')
print(f'Max: {bpm_summary[4]}')

NameError: name 'np' is not defined

# How to Write FNS as a Function

If I am doing an operation (e.g., FNS) multiple times on multiple columns, it might be helpful to have a function to do the heavy lifting, and remove repetition.

#### Functions MUST be defined before you can call them.

In [None]:
def five_number_summary(column):
    """Print the five-number summary (min, Q1, median, Q3, max) of a column.

    Parameters
    ----------
    column : array-like
        Values handed to ``np.percentile``; they must be numeric for the
        percentiles to be computable.

    Returns
    -------
    None
        The summary is printed. If the percentiles cannot be computed from
        the input, 'Please provide a valid column' is printed instead.
    """
    try:
        # Percentiles 0 and 100 are the minimum and maximum.
        low, lower_q, mid, upper_q, high = np.percentile(column, [0, 25, 50, 75, 100])
    except (TypeError, ValueError, IndexError):
        # Only failures caused by unusable input are reported here. A bare
        # `except:` would also swallow KeyboardInterrupt and hide real bugs.
        print('Please provide a valid column')
        return

    print('----Five Number Summary----')
    print(f'Min: {low}')
    print(f'Q1: {lower_q}')
    print(f'Median: {mid}')
    print(f'Q3: {upper_q}')
    print(f'Max: {high}')

# Print the five-number summary of the `streams` column.
# NOTE(review): if `streams` was parsed as text rather than numbers, the
# function prints 'Please provide a valid column' instead -- confirm the dtype.
five_number_summary(df['streams'])

# IQR, Outliers, Standard Deviation, and Variance

In [None]:
# Assuming we have the five number summary
iqr = q3 - q1
upper_bound = q3 + 1.5 * iqr
lower_bound = q1 - 1.5 * iqr

print(f'Inner Quartile Range: {iqr}')
print(f'Upper Bound: {upper_bound}')
print(f'Lower Bound: {lower_bound}')

standard_deviation = df['bpm'].std()
variance = standard_deviation ** 2

print(f'Standard Deviation: {standard_deviation:.3f}')
print(f'Variance: {variance:.3f}')

# Histograms

I want a histogram for the ```released_month``` column.

In [None]:
# Using pyplot to plot a histogram
plt.hist(df.released_month, bins = 12, color = 'orange', alpha = 0.5)
plt.title('Histogram of Released Month')
plt.xlabel('Month')
plt.ylabel('Frequency')
plt.show()

# Scatter Plot

I want a scatter plot to compare ```streams``` to ```in_spotify_playlists```.

In [None]:
# Using pyplot to plot a scatter plot
plt.scatter(x = df.in_spotify_playlists, y = df.streams)
plt.title('Scatter plot of In Spotify Playlists vs Streams')
plt.xlabel('In Spotify Playlists')
plt.ylabel('Streams')
plt.show()

# You can also use pandas to plot a scatter plot
df.plot.scatter(x = 'in_spotify_playlists', y = 'streams')