In [66]:
import pandas as pd
import numpy as np
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
# Make "browser" the default plotly renderer for every figure shown in this notebook.
pio.renderers.default = "browser"

In [None]:
# Lift the column display limit so every column of this wide dataset is shown.
pd.options.display.max_columns = None

# Read the raw CSV (decoded as Latin-1) and preview the first rows.
spotify_data = pd.read_csv("spotify.csv", encoding="latin1")
display(spotify_data.head())

Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,TikTok Posts,TikTok Likes,TikTok Views,YouTube Playlist Reach,Apple Music Playlist Count,AirPlay Spins,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,TIDAL Popularity,Explicit Track
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,QM24S2402528,1,725.4,390470936,30716,196631588,92.0,84274754,1713126,5767700,651565900.0,5332281936.0,150597040,210.0,40975,684,62.0,17598718,114.0,18004655,22931,4818457.0,2669262,,0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,USUG12400910,2,545.9,323703884,28113,174597137,92.0,116347040,3486739,674700,35223547.0,208339025.0,156380351,188.0,40778,3,67.0,10422430,111.0,7780028,28444,6623075.0,1118279,,1
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,QZJ842400387,3,538.4,601309283,54331,211607669,92.0,122599116,2228730,3025400,275154237.0,3369120610.0,373784955,190.0,74333,536,136.0,36321847,172.0,5022621,5639,7208651.0,5285340,,0
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,USSM12209777,4,444.9,2031280633,269802,136569078,85.0,1096100899,10629796,7189811,1078757968.0,14603725994.0,3351188582,394.0,1474799,2182,264.0,24684248,210.0,190260277,203384,,11822942,,0
4,Houdini,Houdini,Eminem,5/31/2024,USUG12403398,5,423.3,107034922,7223,151469874,88.0,77373957,3670188,16400,,,112763851,182.0,12185,1,82.0,17660624,105.0,4493884,7006,207179.0,457017,,1


In [None]:
# Discard the identifier/metadata columns that the numeric analysis does not use.
# The frame is reassigned (rather than mutated with inplace=True) and missing
# labels are ignored, so re-running this cell after the columns are already gone
# is a no-op instead of a KeyError.
columns_to_drop = [
    'Track', 'Album Name', 'Artist', 'Release Date', 'ISRC',
    'All Time Rank', 'TIDAL Popularity',
]
spotify_data = spotify_data.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Remove every comma from string cells (presumably thousands separators — confirm
# against the raw file).
spotify_d_clean = spotify_data.replace({',': ''}, regex=True)

# Cast the object-typed columns from position 5 onward to float in a single call.
# NOTE(review): the positional slice leaves any object columns among the first
# five untouched — confirm that is intended.
float_casts = {
    name: float
    for name in spotify_d_clean.columns[5:]
    if spotify_d_clean[name].dtype == 'object'
}
spotify_d_clean = spotify_d_clean.astype(float_casts)
spotify_d_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Track Score                 4600 non-null   float64
 1   Spotify Streams             4487 non-null   object 
 2   Spotify Playlist Count      4530 non-null   object 
 3   Spotify Playlist Reach      4528 non-null   object 
 4   Spotify Popularity          3796 non-null   float64
 5   YouTube Views               4292 non-null   float64
 6   YouTube Likes               4285 non-null   float64
 7   TikTok Posts                3427 non-null   float64
 8   TikTok Likes                3620 non-null   float64
 9   TikTok Views                3619 non-null   float64
 10  YouTube Playlist Reach      3591 non-null   float64
 11  Apple Music Playlist Count  4039 non-null   float64
 12  AirPlay Spins               4102 non-null   float64
 13  SiriusXM Spins              2477 

In [None]:
# Convert every column that is still object-typed to numeric; entries that
# cannot be parsed become NaN.
object_columns = spotify_d_clean.select_dtypes(include=['object']).columns
for object_column in object_columns:
    spotify_d_clean[object_column] = pd.to_numeric(spotify_d_clean[object_column], errors='coerce')

# Fill the missing values of each column with that column's median.
column_medians = spotify_d_clean.median()
spotify_d_clean.fillna(column_medians, inplace=True)

### 2. Visualizing Data Distributions


In [68]:
# One row, two panels: a histogram per metric, side by side.
fig = make_subplots(rows=1, cols=2)
columns = ["Spotify Streams", "YouTube Views"]
colors = ["orange", "lightblue"]

# Subplot columns are 1-based, so the panel counter starts at 1.
for panel, (column, fill_color) in enumerate(zip(columns, colors), start=1):
    bar_style = dict(color=fill_color, line=dict(color="black", width=0.2))
    histogram = go.Histogram(x=spotify_d_clean[column], marker=bar_style, name=column)
    fig.add_trace(histogram, row=1, col=panel)

fig.update_layout(
    title="Histograma de Spotify Streams e YouTube Views",
    template="plotly_dark",
)
fig.show()

The distributions of both $\color{green} \text{Spotify Streams}$ and $\color{green} \text{YouTube Views}$ resemble a log-normal distribution with a high $\sigma$ (standard deviation). The long right tail, for both Spotify and YouTube, reflects how disproportionate the split is between the many songs with low popularity and the few songs with high popularity.

Specifically, as the number of streams or views increases, the frequency of songs decreases rapidly, so only a small fraction of songs achieves exceptionally high popularity. The sharp drop in the number of songs beyond roughly half a billion streams highlights the unequal nature of success within the music industry.


### 3. Measures of Central Tendency

In [69]:
# Pull the two series out once, then report the mean and median of each.
spotify_streams = spotify_d_clean['Spotify Streams']
youtube_views = spotify_d_clean['YouTube Views']

print(f"Spotify Streams - Media: {spotify_streams.mean(): .2f}, Mediana: {spotify_streams.median(): .2f}")
print(f"YouTube Views- Media: {youtube_views.mean(): .2f}, Mediana: {youtube_views.median(): .2f}")

Spotify Streams - Media:  442289133.18, Mediana:  239850720.00
YouTube Views- Media:  385754478.40, Mediana:  148269610.00


The difference between the mean and the median comes from the mean's sensitivity to skewness. Since both distributions are skewed to the right, the mean is pulled upward by the extreme values (songs with far more streams or views) and therefore ends up well above the median. The median remains closer to the central cluster of observations, offering a more representative picture of the distribution.

### 4. Measures of Dispersion

In [70]:
# Metrics whose spread is compared, and the matching column slice of the cleaned frame.
columns_measures = ["Spotify Streams", "YouTube Views", "Spotify Popularity"]
data_measures = spotify_d_clean.loc[:, columns_measures]

In [71]:
# Report the standard deviation and the coefficient of variation (std / mean)
# for each selected metric.
for column in columns_measures:
    mean = data_measures[column].mean()
    std = data_measures[column].std()
    cv = std / mean
    summary_line = f"{column} - Deviación estándar: {std: .2f}, Coeficiente de variación: {cv: .2f}"
    # Size the separator to the text being printed, not to the number of rows in
    # the column (which produced a rule thousands of characters wide).
    print("-" * len(summary_line))
    print(summary_line)

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

The most consistent metric is *Spotify Popularity*: both its standard deviation and its coefficient of variation show that the scores cluster closely around the mean popularity, suggesting more stable behavior.

The next most consistent metric, though considerably more volatile, is *Spotify Streams*. Although most songs receive a comparable number of streams, there is a notable subset with disproportionately high stream counts, which creates dispersion.

The most volatile metric is *YouTube Views*: its standard deviation is as high as 680968100.19 and its coefficient of variation is 1.77, a drastic increase over the *Spotify Popularity* figures.

# Parte 2 


### 1. Defining Events

In [72]:
# Event A: the track's Spotify Popularity is at or above the threshold.
# Both labels carry no leading/trailing whitespace, so either one can be
# matched with an exact string comparison.
HIGH_POPULARITY_THRESHOLD = 85
spotify_d_clean["high_popularity"] = (
    spotify_d_clean["Spotify Popularity"]
    .ge(HIGH_POPULARITY_THRESHOLD)
    .map({True: "The song has High Popularity", False: "The song has Low Popularity"})
)

In [73]:
# Event B: the track is flagged as explicit (Explicit Track equals 1).
spotify_d_clean["is_explicit"] = (
    spotify_d_clean["Explicit Track"]
    .eq(1)
    .map({True: "The song is Explicit", False: "The song is Not Explicit"})
)

### 2. Calculating Prior and Conditional Probabilities

In [74]:
# Boolean masks for the two events, built once and reused below.
is_high_popularity = spotify_d_clean["high_popularity"] == "The song has High Popularity"
is_explicit_track = spotify_d_clean["is_explicit"] == "The song is Explicit"

# The mean of a boolean mask is the share of True rows, i.e. the empirical
# probability of the event. This is vectorised, unlike the built-in sum() over a Series.
p_a = float(is_high_popularity.mean())                              # P(A): high popularity
p_b = float(is_explicit_track.mean())                               # P(B): explicit
p_a_and_b = float((is_high_popularity & is_explicit_track).mean())  # P(A and B)

# P(A | B) = P(A and B) / P(B); undefined (NaN) when no track is explicit.
p_a_given_b = p_a_and_b / p_b if p_b > 0 else float("nan")

In [75]:
# Emit the three probabilities, one per line, with four decimals each.
probability_lines = [
    f"P(A) = {p_a: .4f}",
    f"P(B) = {p_b: .4f}",
    f"P(A | B) = {p_a_given_b: .4f}",
]
print("\n".join(probability_lines))

P(A) =  0.0117
P(B) =  0.3589
P(A | B) =  0.0109


### 3. Applying Bayes' Theorem

In [76]:
# Bayes' theorem: P(B | A) = P(A | B) * P(B) / P(A).
# The posterior is undefined (NaN) when no track falls in event A.
p_b_given_a = p_a_given_b * p_b / p_a if p_a > 0 else float("nan")

In [77]:
# Show the posterior with four decimals.
posterior_line = f"P(B | A) = {p_b_given_a: .4f}"
print(posterior_line)

P(B | A) =  0.3333
