# Imports and reading data

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Remove the cap on displayed columns so wide frames render in full
pd.options.display.max_columns = None

In [2]:
# Read the dataset and immediately discard three columns, then preview the
# first rows of what remains.
unused_columns = ["Index", "Week of Highest Charting", "Weeks Charted"]
df = pd.read_csv('spotify_dataset.csv').drop(columns=unused_columns)
df.head(n=3)

Unnamed: 0,Highest Charting Position,Number of Times Charted,Song Name,Streams,Artist,Artist Followers,Song ID,Genre,Release Date,Popularity,Danceability,Energy,Loudness,Speechiness,Acousticness,Liveness,Tempo,Duration (ms),Valence,Chord
0,1,8,Beggin',48633449,Måneskin,3377762,3Wrjm47oTz2sjIgck11l5e,"['indie rock italiano', 'italian pop']",2017-12-08,100,0.714,0.8,-4.808,0.0504,0.127,0.359,134.002,211560,0.589,B
1,2,3,STAY (with Justin Bieber),47248719,The Kid LAROI,2230022,5HCyWlXZPP0y6Gqq8TgA20,['australian hip hop'],2021-07-09,99,0.591,0.764,-5.484,0.0483,0.0383,0.103,169.928,141806,0.478,C#/Db
2,1,11,good 4 u,40162559,Olivia Rodrigo,6266514,4ZtFanR9U6ndgddUvNcjcG,['pop'],2021-05-21,99,0.563,0.664,-5.044,0.154,0.335,0.0849,166.928,178147,0.688,A


# Get rid of bad rows

In [3]:
# Number of non-null entries in each column before any cleaning
df.count()

Highest Charting Position    1556
Number of Times Charted      1556
Song Name                    1556
Streams                      1556
Artist                       1556
Artist Followers             1556
Song ID                      1556
Genre                        1556
Release Date                 1556
Popularity                   1556
Danceability                 1556
Energy                       1556
Loudness                     1556
Speechiness                  1556
Acousticness                 1556
Liveness                     1556
Tempo                        1556
Duration (ms)                1556
Valence                      1556
Chord                        1556
dtype: int64

In [4]:
# Convert blank cells to NaN. The pattern matches any empty or
# whitespace-only string, not just a single space, so values such as ""
# or "  " are not left behind as pseudo-values.
df = df.replace(r'^\s*$', np.nan, regex=True)
# Per-column tally of missing values after the conversion
df.isna().sum()

Highest Charting Position     0
Number of Times Charted       0
Song Name                     0
Streams                       0
Artist                        0
Artist Followers             11
Song ID                      11
Genre                        11
Release Date                 11
Popularity                   11
Danceability                 11
Energy                       11
Loudness                     11
Speechiness                  11
Acousticness                 11
Liveness                     11
Tempo                        11
Duration (ms)                11
Valence                      11
Chord                        11
dtype: int64

In [5]:
# Remove every row that contains at least one NaN (modifying df itself),
# then recount the non-null entries per column.
df.dropna(axis=0, how="any", inplace=True)
df.count()

Highest Charting Position    1545
Number of Times Charted      1545
Song Name                    1545
Streams                      1545
Artist                       1545
Artist Followers             1545
Song ID                      1545
Genre                        1545
Release Date                 1545
Popularity                   1545
Danceability                 1545
Energy                       1545
Loudness                     1545
Speechiness                  1545
Acousticness                 1545
Liveness                     1545
Tempo                        1545
Duration (ms)                1545
Valence                      1545
Chord                        1545
dtype: int64

## Check data types

In [6]:
# Show the dtype pandas currently holds for each column
df.dtypes

Highest Charting Position     int64
Number of Times Charted       int64
Song Name                    object
Streams                      object
Artist                       object
Artist Followers             object
Song ID                      object
Genre                        object
Release Date                 object
Popularity                   object
Danceability                 object
Energy                       object
Loudness                     object
Speechiness                  object
Acousticness                 object
Liveness                     object
Tempo                        object
Duration (ms)                object
Valence                      object
Chord                        object
dtype: object

In [7]:
# Remove the commas (presumably thousands separators) from the Streams
# strings so the column can later be parsed as a number. regex=False treats
# the comma as a literal on every pandas version instead of relying on the
# version-dependent default.
df["Streams"] = df["Streams"].str.replace(",", "", regex=False)

In [8]:
# Columns that hold numeric data but are still stored with object dtype
cols_to_change = [
    "Streams", "Artist Followers", "Popularity", "Danceability",
    "Energy", "Loudness", "Speechiness", "Acousticness", "Liveness",
    "Tempo", "Duration (ms)", "Valence",
]
# Iterate over the column names directly; pd.to_numeric chooses an integer
# or float dtype for each column based on its values.
for col in cols_to_change:
    df[col] = pd.to_numeric(df[col])

# Parse the release dates into datetime values
df["Release Date"] = pd.to_datetime(df["Release Date"])
# Display the dtypes again to check the conversions
df.dtypes

Highest Charting Position             int64
Number of Times Charted               int64
Song Name                            object
Streams                               int64
Artist                               object
Artist Followers                      int64
Song ID                              object
Genre                                object
Release Date                 datetime64[ns]
Popularity                            int64
Danceability                        float64
Energy                              float64
Loudness                            float64
Speechiness                         float64
Acousticness                        float64
Liveness                            float64
Tempo                               float64
Duration (ms)                         int64
Valence                             float64
Chord                                object
dtype: object

In [9]:
# Preview the first three rows of the frame
df.head(n=3)

Unnamed: 0,Highest Charting Position,Number of Times Charted,Song Name,Streams,Artist,Artist Followers,Song ID,Genre,Release Date,Popularity,Danceability,Energy,Loudness,Speechiness,Acousticness,Liveness,Tempo,Duration (ms),Valence,Chord
0,1,8,Beggin',48633449,Måneskin,3377762,3Wrjm47oTz2sjIgck11l5e,"['indie rock italiano', 'italian pop']",2017-12-08,100,0.714,0.8,-4.808,0.0504,0.127,0.359,134.002,211560,0.589,B
1,2,3,STAY (with Justin Bieber),47248719,The Kid LAROI,2230022,5HCyWlXZPP0y6Gqq8TgA20,['australian hip hop'],2021-07-09,99,0.591,0.764,-5.484,0.0483,0.0383,0.103,169.928,141806,0.478,C#/Db
2,1,11,good 4 u,40162559,Olivia Rodrigo,6266514,4ZtFanR9U6ndgddUvNcjcG,['pop'],2021-05-21,99,0.563,0.664,-5.044,0.154,0.335,0.0849,166.928,178147,0.688,A


# Open question: split Chord on "/" and keep only the first part, split[0] (e.g. "C#/Db" → "C#")?