**Introduction**
-

**Question:**
- Artist impact: Analyze how artist involvement and attributes relate to a song's success.

**Hypothesis**
- Songs with multiple artists perform better on streaming platforms than songs by solo artists.



`streams` - Total number of streams on Spotify

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns

In [2]:
spotify_data = pd.read_csv('data/spotify-2023.csv', encoding = 'latin1')

# Display data types inferred by pandas
spotify_data.head()


Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6


In [3]:
spotify_data.shape

(953, 24)

In [4]:
spotify_data.nunique()

track_name              943
artist(s)_name          645
artist_count              8
released_year            50
released_month           12
released_day             31
in_spotify_playlists    879
in_spotify_charts        82
streams                 949
in_apple_playlists      234
in_apple_charts         172
in_deezer_playlists     348
in_deezer_charts         34
in_shazam_charts        198
bpm                     124
key                      11
mode                      2
danceability_%           72
valence_%                94
energy_%                 80
acousticness_%           98
instrumentalness_%       39
liveness_%               68
speechiness_%            48
dtype: int64

In [5]:
print(spotify_data.dtypes)

track_name              object
artist(s)_name          object
artist_count             int64
released_year            int64
released_month           int64
released_day             int64
in_spotify_playlists     int64
in_spotify_charts        int64
streams                 object
in_apple_playlists       int64
in_apple_charts          int64
in_deezer_playlists     object
in_deezer_charts         int64
in_shazam_charts        object
bpm                      int64
key                     object
mode                    object
danceability_%           int64
valence_%                int64
energy_%                 int64
acousticness_%           int64
instrumentalness_%       int64
liveness_%               int64
speechiness_%            int64
dtype: object


In [6]:
spotify_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   track_name            953 non-null    object
 1   artist(s)_name        953 non-null    object
 2   artist_count          953 non-null    int64 
 3   released_year         953 non-null    int64 
 4   released_month        953 non-null    int64 
 5   released_day          953 non-null    int64 
 6   in_spotify_playlists  953 non-null    int64 
 7   in_spotify_charts     953 non-null    int64 
 8   streams               953 non-null    object
 9   in_apple_playlists    953 non-null    int64 
 10  in_apple_charts       953 non-null    int64 
 11  in_deezer_playlists   953 non-null    object
 12  in_deezer_charts      953 non-null    int64 
 13  in_shazam_charts      903 non-null    object
 14  bpm                   953 non-null    int64 
 15  key                   858 non-null    ob

In [101]:
for index, stream in spotify_data['streams'].items():
    if stream.isalnum():
        print()

In [88]:
spotify_data['streams'] = pd.to_numeric(spotify_data['streams'])

ValueError: Unable to parse string "BPM110KeyAModeMajorDanceability53Valence75Energy69Acousticness7Instrumentalness0Liveness17Speechiness3" at position 478

In [93]:
spotify_data.iloc[478, :]

track_name                            Love Grows (Where My Rosemary Goes)
artist(s)_name                                          Edison Lighthouse
artist_count                                                            1
released_year                                                        1970
released_month                                                          1
released_day                                                            1
in_spotify_playlists                                                 2877
in_spotify_charts                                                       0
streams                 BPM110KeyAModeMajorDanceability53Valence75Ener...
in_apple_playlists                                                     16
in_apple_charts                                                         0
in_deezer_playlists                                                    54
in_deezer_charts                                                        0
in_shazam_charts                      

In [22]:
# spotify_data.isna().sum()
missing_data = spotify_data.isna()
has_missing_value = missing_data.any()

for column, missing in has_missing_value.items():
    if missing:
        count = missing_data.sum()[column]
        print(f'Column {column} has {count} missing values')

Column in_shazam_charts has 50 missing values
Column key has 95 missing values


In [33]:
spotify_data.dropna(axis = 0, how = 'any', inplace = True)

In [34]:
spotify_data.isna().sum()

track_name              0
artist(s)_name          0
artist_count            0
released_year           0
released_month          0
released_day            0
in_spotify_playlists    0
in_spotify_charts       0
streams                 0
in_apple_playlists      0
in_apple_charts         0
in_deezer_playlists     0
in_deezer_charts        0
in_shazam_charts        0
bpm                     0
key                     0
mode                    0
danceability_%          0
valence_%               0
energy_%                0
acousticness_%          0
instrumentalness_%      0
liveness_%              0
speechiness_%           0
dtype: int64

**Data Preprocessing**
-

Since there are many data, so make it easier I will combine those playlists and charts into one colume, respectively. 

In [35]:
new_spotify_data = spotify_data.loc[:,:]

In [36]:
playlists_col = new_spotify_data.loc[:, ['in_spotify_playlists','in_apple_playlists','in_deezer_playlists']]

# Making new column
new_spotify_data['all_playlists'] = playlists_col.sum(axis=1)

# Drop those columes with playlists
new_spotify_data = new_spotify_data.drop(playlists_col.columns, axis = 1)

  new_spotify_data['all_playlists'] = playlists_col.sum(axis=1)


In [39]:
charts_col = new_spotify_data.loc[:, ['in_spotify_charts','in_apple_charts','in_deezer_charts','in_shazam_charts']]

# Making new Colume
new_spotify_data['all_charts'] = charts_col.sum(axis = 1)

# Drop those columes with chart
new_spotify_data.drop(charts_col.columns, axis = 1, inplace = True)

  new_spotify_data['all_charts'] = charts_col.sum(axis = 1)


In [51]:
# Making new colume
new_spotify_data['released_date'] = new_spotify_data.loc[:, 'released_year':'released_day'].astype(str).apply(lambda x:"-".join(x), axis = 1)

# Drop those date
new_spotify_data.drop(['released_year','released_month','released_day'], axis = 1, inplace = True)

In [55]:
new_spotify_data

Unnamed: 0,track_name,artist(s)_name,artist_count,streams,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%,all_playlists,all_charts,released_date
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,141381703,125,B,Major,80,89,83,31,0,8,4,596,420,2023-7-14
1,LALA,Myke Towers,1,133716286,92,C#,Major,71,61,74,7,0,10,4,1522,188,2023-3-23
2,vampire,Olivia Rodrigo,1,140003974,138,F,Major,51,32,53,17,0,31,6,1491,334,2023-6-30
3,Cruel Summer,Taylor Swift,1,800840817,170,A,Major,55,58,72,11,0,11,15,7974,319,2019-8-23
4,WHERE SHE GOES,Bad Bunny,1,303236322,144,A,Minor,65,23,80,14,63,11,6,3217,198,2023-5-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,My Mind & Me,Selena Gomez,1,91473363,144,A,Major,60,24,39,57,0,8,3,1014,14,2022-11-3
949,Bigger Than The Whole Sky,Taylor Swift,1,121871870,166,F#,Major,42,7,24,83,1,12,6,1184,0,2022-10-21
950,A Veces (feat. Feid),"Feid, Paulo Londra",2,73513683,92,C#,Major,80,81,67,4,0,8,6,575,0,2022-11-3
951,En La De Ella,"Feid, Sech, Jhayco",3,133895612,97,C#,Major,82,67,77,8,0,12,5,1349,26,2022-10-20


**Visualization**
-

**1. Songs with multiple artists perform better on the number of `streams`?**
- In this case, my observe variables are `artist_count`, `all_playlists`, `all_charts`, and `streams`. 

In [29]:
%matplotlib inline

In [86]:
artists_set = new_spotify_data[['artist_count','all_playlists','all_charts','streams']].copy()

def convert_artist_count(count):
    if count == 1:
        return 'Solo'
    elif count == 2:
        return 'Duo'
    else:
        return 'Group'
    
artists_set['streams'] = artists_set.to_numeric()
    
artists_set['artists_type'] = artists_set['artist_count'].apply(convert_artist_count).astype('category')

print(artists_set)

print(artists_set.dtypes)

     artist_count  all_playlists  all_charts    streams artists_type
0               2            596         420  141381703          Duo
1               1           1522         188  133716286         Solo
2               1           1491         334  140003974         Solo
3               1           7974         319  800840817         Solo
4               1           3217         198  303236322         Solo
..            ...            ...         ...        ...          ...
948             1           1014          14   91473363         Solo
949             1           1184           0  121871870         Solo
950             2            575           0   73513683          Duo
951             3           1349          26  133895612        Group
952             1            809          21   96007391         Solo

[817 rows x 5 columns]
artist_count        int64
all_playlists       int64
all_charts          int64
streams            object
artists_type     category
dtype: object


In [78]:
group_artists = artists_set.groupby('artists_type')[['all_playlists','all_charts','streams']].sum(numeric_only=False)
print(group_artists)

fig = plt.figure(figsize=(7, 8))

sns.catplot(data = group_artists, x = 'artists_type', y = 'streams')

              all_playlists  all_charts  \
artists_type                              
Duo                  925225       16020   
Group                401492        4742   
Solo                2684802       31238   

                                                        streams  
artists_type                                                     
Duo           1413817031837062347259801125536340675056714383...  
Group         8644484211536456184395751010471012911077538501...  
Solo          1337162861400039748008408173032363225814937895...  


  self._figure.tight_layout(*args, **kwargs)


<seaborn.axisgrid.FacetGrid at 0x7f882a4e3c40>

Error in callback <function _draw_all_if_interactive at 0x7f883b29f3a0> (for post_execute):


RuntimeError: In draw_glyphs_to_bitmap: Could not convert glyph to bitmap (raster overflow; error code 0x62)

<Figure size 700x800 with 0 Axes>

RuntimeError: In draw_glyphs_to_bitmap: Could not convert glyph to bitmap (raster overflow; error code 0x62)

<Figure size 500x500 with 1 Axes>