<a href="https://colab.research.google.com/github/mayait/ClaseAnalisisDatos/blob/main/python101/spotify_parte2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Vamos a conectarnos a una API externa para procesar datos en vivo de Spotify. Usaremos *pandas* para organizar los datos en tablas y luego los exportaremos a Excel.



**Crea tu API**
1. Regístrate en Spotify para obtener tus credenciales de API:
https://developer.spotify.com/dashboard/
2. Crea una app (*Create an app*) y asígnale un nombre
3. Copia tus credenciales de Spotify: el Client ID y el Client Secret





In [None]:
#@title Tus credenciales de spotify

# Spotify API credentials, entered through the Colab form fields below.
# Keep them empty in any copy of this notebook that you share or commit.
client_id = "" #@param {type:"string"}
client_secret = "" #@param {type:"string"}


**Ahora ve a Spotify y averigua cuál es tu usuario**
https://www.spotify.com/us/account/overview/

*PRO TIP: Prueba el usuario "spotify", ¡tiene listas increíbles!*


In [None]:
#@title Ingresa tu usuario de Spotify
# Spotify user name; used as the default user by get_user_playlist() and passed
# to get_spotify_playlist_data() further down.
mi_username = "spotify" #@param {type:"string"}

In [None]:
#@title Ejecuta el setup


# Instala SPOTIPY
# pip es una herramienta que importa librerias externas que no tengas instaladas
! pip install spotipy

# Vamos a utilizar todas estas librerias
import spotipy
spotify = spotipy.Spotify()
import sys
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials
from bs4 import BeautifulSoup
import requests
import lxml
import json

# Librerías para visualización
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt

# Hay diferentes temas de graficos https://matplotlib.org/3.5.1/gallery/style_sheets/style_sheets_reference.html
matplotlib.style.use('fast') 

In [None]:
#@title Funciones para conectarse a Spotify

def get_spotify_credentials():
    """Return a spotipy client authenticated with the notebook's credentials.

    Reads the module-level ``client_id`` and ``client_secret`` values, wraps
    them in a client-credentials manager and builds a ``spotipy.Spotify``
    client on top of it. The client's ``trace`` attribute is set to True
    before it is returned.
    """
    auth_manager = SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret,
    )
    client = spotipy.Spotify(client_credentials_manager=auth_manager)
    client.trace = True
    return client

def get_spotify_data(artist_name):
    """Return a DataFrame with the audio features of an artist's album tracks.

    The first artist returned by a Spotify search for ``artist_name`` is used.
    All of that artist's albums are listed (following pagination), the tracks
    of each album are collected, and their audio features are requested in
    batches of 100 URIs.

    Args:
        artist_name: Name of the artist to search for.

    Returns:
        pandas.DataFrame indexed by ``track_name`` with one row per track,
        containing the audio-feature columns plus ``album`` and ``artists``.
        Fully duplicated rows and repeated track names are dropped (first
        occurrence wins). Tracks for which the API returns no features are
        left out.

    Raises:
        ValueError: If the search does not return any artist.
    """
    batch_size = 100  # batch size used by the original loop for audio_features

    # Authenticate against Spotify.
    sp = get_spotify_credentials()

    # Look the artist up; fail with a clear message instead of an unbound
    # local variable when nothing matches.
    results = sp.search(q='artist:' + artist_name, type='artist')
    items = results['artists']['items']
    if not items:
        raise ValueError(
            "No se encontró ningún artista con el nombre '{}'".format(artist_name)
        )
    artist_uri = items[0]['uri']

    # Collect every album, following the pagination links.
    results = sp.artist_albums(artist_uri, album_type='album')
    albums = list(results['items'])
    while results['next']:
        results = sp.next(results)
        albums.extend(results['items'])

    # One (uri, track name, album name) tuple per track, so that names can
    # never get out of step with the features fetched below.
    track_rows = []
    for album in albums:
        for t in sp.album(album['uri'])['tracks']['items']:
            track_rows.append((t['uri'], t['name'], album['name']))

    # Fetch audio features batch by batch. Stepping through the list with
    # range(start, stop, step) never produces an empty trailing batch.
    records = []
    for start in range(0, len(track_rows), batch_size):
        batch = track_rows[start:start + batch_size]
        batch_features = sp.audio_features([row[0] for row in batch]) or []
        for (_, track_name, album_name), feat in zip(batch, batch_features):
            if feat is None:
                # No features available for this track: skip only this one.
                continue
            record = dict(feat)
            record['track_name'] = track_name
            record['album'] = album_name
            records.append(record)

    # Build the DataFrame; keep the expected columns even when nothing came back.
    if records:
        dat = pd.DataFrame(records)
    else:
        dat = pd.DataFrame(columns=['track_name', 'album'])
    dat['artists'] = artist_name

    dat = dat.set_index('track_name')
    dat = dat.drop_duplicates()
    dat = dat[~dat.index.duplicated(keep='first')]

    return dat

In [None]:
#@title Función obtener playlists

def get_user_playlist(username=None, sp=None):
    """Print the ID, track count and name of every playlist of a user.

    Args:
        username: Spotify user name. When omitted, the notebook-level
            ``mi_username`` is read at call time, so changing the form field
            takes effect without re-defining this function.
        sp: Client exposing ``user_playlists`` and ``next``. When omitted, a
            new client is created with ``get_spotify_credentials()`` at call
            time rather than once at definition time.

    Returns:
        None. One line per playlist is written to standard output.
    """
    if username is None:
        username = mi_username
    if sp is None:
        sp = get_spotify_credentials()

    playlists = sp.user_playlists(username)
    # Print the current page first and only then move on, so the last page
    # (or the only page) is not skipped.
    while playlists:
        for playlist in playlists['items']:
            print("ID: {} \t Canciones {}\t  Nombre: {} ".
                  format(playlist['id'], playlist['tracks']['total'], playlist['name']))
        playlists = sp.next(playlists) if playlists['next'] else None

# También podemos definir una función que obtenga los datos de la lista de reproducción de un usuario.
def get_spotify_playlist_data(username, playlist_id):
    """Return a DataFrame with track metadata and audio features of a playlist.

    Args:
        username: Spotify user name passed to ``sp.user_playlist``.
        playlist_id: ID of the playlist to download.

    Returns:
        pandas.DataFrame indexed by ``track_name`` with one row per track.
        Columns are the track fields (``album`` reduced to the album name,
        ``artists`` joined into a comma-separated string) followed by the
        audio-feature fields. Repeated track names and repeated column names
        keep their first occurrence. Column dtypes are preserved. Tracks
        without audio features keep their metadata and get missing values in
        the feature columns.
    """
    track_number_limit = 1000  # stop requesting more pages once this many items are loaded
    batch_size = 100           # batch size used by the original loop for audio_features

    # Authenticate against Spotify.
    sp = get_spotify_credentials()

    results = sp.user_playlist(username, playlist_id, fields="tracks,next")['tracks']

    # Follow the pagination links until the limit is reached.
    tracks = list(results['items'])
    while results['next'] and len(tracks) < track_number_limit:
        results = sp.next(results)
        tracks.extend(results['items'])

    # Flatten each playlist item. Work on a shallow copy so the API response
    # is not modified, and skip entries that carry no track object.
    ts = []
    for item in tracks:
        if not item or item.get('track') is None:
            continue
        track = dict(item['track'])
        track['album'] = track['album']['name']
        track['artists'] = ', '.join(a['name'] for a in track['artists'])
        ts.append(track)

    if not ts:
        # Nothing usable in the playlist: return an empty, correctly indexed frame.
        return pd.DataFrame(index=pd.Index([], name='track_name'))

    track_names = [track['name'] for track in ts]

    dat = pd.DataFrame(ts)
    dat = dat.drop(
        columns=['available_markets', 'disc_number', 'external_ids', 'external_urls'],
        errors='ignore',  # these fields may be absent from the response
    )

    # Fetch audio features batch by batch. A missing result becomes an empty
    # record so that row i of the features always belongs to row i of ``dat``.
    uris = dat['uri'].tolist()
    features = []
    for start in range(0, len(uris), batch_size):
        batch = uris[start:start + batch_size]
        batch_features = sp.audio_features(batch) or [None] * len(batch)
        features.extend(f if f is not None else {} for f in batch_features)

    fs = pd.DataFrame(features)

    dat = pd.concat([dat, fs], axis=1)
    dat['track_name'] = track_names

    dat = dat.set_index('track_name')
    dat = dat[~dat.index.duplicated(keep='first')]
    # Track and feature records share some field names; keep the first copy of
    # each column. Selecting by mask (instead of transposing twice) keeps the
    # original dtypes, and .copy() lets callers add columns without warnings.
    dat = dat.loc[:, ~dat.columns.duplicated(keep='first')].copy()

    return dat

In [None]:
# Download the album tracks of one artist and keep the resulting DataFrame in a variable.
canciones_de_artista = get_spotify_data('Queen')

In [None]:
# Show how many songs (rows) were retrieved and which columns are available.
print(canciones_de_artista.shape[0], "canciones")
print(canciones_de_artista.columns)

In [None]:
# Display three randomly chosen rows as a quick sanity check of the data.
canciones_de_artista.sample(3)

In [None]:
# Export the DataFrame to Excel.
# The track names live in the index ('track_name'), so the index must be
# written too; with index=False the sheet would not say which song each row is.
canciones_de_artista.to_excel(r'canciones.xlsx', sheet_name='canciones', index=True)

# Canciones en una playlist

**Awesome!!!**

Como era de esperar, podemos ver que para cada pista hemos obtenido las propiedades acústicas deseadas (junto con alguna información sobre su ubicación en la base de datos de Spotify). 



**Encuentra el ID del playlist que quieres analizar entre todos tus playlists**
La siguiente función imprime tus playlists y los id de cada lista, copia en el portapapeles el ID de la playlist que quieres analizar

In [None]:
# Print the ID, number of tracks and name of the playlists returned for the
# default user (mi_username).
get_user_playlist()

Voy a utilizar el playlist "This is Debussy"
37i9dQZF1DWXnRw5VvmZDb

Puedes ver un playlist en spotify usando el ID de la siguiente forma:
**https://open.spotify.com/playlist/37i9dQZF1DWXnRw5VvmZDb**

In [None]:
#@title Selecciona tu playlist
# IDs of the two playlists to compare, plus a short label for each one. The
# labels are reused later for chart titles, the 'playlist' column and the
# names of the exported Excel files.
playlist_A_id = "37i9dQZF1DWXnRw5VvmZDb" #@param {type:"string"}
playlist_A_shortname = "debussy" #@param {type:"string"}
playlist_B_id = "37i9dQZF1DZ06evO4m87u0" #@param {type:"string"}
playlist_B_shortname = "TheBlackKeys" #@param {type:"string"}


In [None]:
# Download track metadata and audio features for both playlists.
playlist_A = get_spotify_playlist_data(username=mi_username, playlist_id=playlist_A_id)
playlist_B = get_spotify_playlist_data(username=mi_username, playlist_id=playlist_B_id)

# Mira el formato en que viene la información de la API: es un formato JSON, una estructura parecida a un diccionario.

In [None]:
# (number of tracks, number of columns) of playlist A.
playlist_A.shape

In [None]:
# (number of tracks, number of columns) of playlist B.
playlist_B.shape

In [None]:
# List the column names available for each track of playlist A.
print(playlist_A.columns)

**Pasemos esto a Excel y luego a Tableau, y hagamos un Dashboard de nuestro Playlist**

In [None]:
# Export each playlist to its own Excel file, named after its short label.
# The track names are stored in the index ('track_name'), so the index has to
# be written as well; index=False would drop the song names from the sheet.
playlist_A.to_excel(playlist_A_shortname + '.xlsx', sheet_name='playlist_A', index=True)
playlist_B.to_excel(playlist_B_shortname + '.xlsx', sheet_name='playlist_B', index=True)

Below you can find the explanation of each feature (copied from the Spotify website).

**Acousticness:** A confidence measure from 0.0 to 1.0 of whether the track is acoustic. 1.0 represents high confidence the track is acoustic.

**Danceability:** Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.

**Energy:** Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy. For example, death metal has high energy, while a Bach prelude scores low on the scale. Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy.

**Instrumentalness:** Predicts whether a track contains no vocals. “Ooh” and “aah” sounds are treated as instrumental in this context. Rap or spoken word tracks are clearly “vocal”. The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content. Values above 0.5 are intended to represent instrumental tracks, but confidence is higher as the value approaches 1.0.

**Liveness:** Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides a strong likelihood that the track is live.

**Loudness:** The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track and are useful for comparing relative loudness of tracks. Loudness is the quality of a sound that is the primary psychological correlate of physical strength (amplitude). Values typically range between -60 and 0 dB.

**Speechiness:** Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audiobook, poetry), the closer to 1.0 the attribute value. Values above 0.66 describe tracks that are probably made entirely of spoken words. Values between 0.33 and 0.66 describe tracks that may contain both music and speech, either in sections or layered, including such cases as rap music. Values below 0.33 most likely represent music and other non-speech-like tracks.

**Valence:** A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).

**Tempo:** The overall estimated tempo of a track in beats per minute (BPM). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration.

In [None]:
# Inspect the data type pandas assigned to each column.
playlist_A.dtypes

**Convierte en decimal las columnas que vamos a procesar**

In [None]:
# Convert to float the columns that the analysis below works with.
# The same list applies to both playlists, so it is written once and applied
# column by column to each DataFrame, playlist A first and then playlist B.
columnas_numericas = [
    'duration_ms',
    'popularity',
    'danceability',
    'energy',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'valence',
    'tempo',
]

for _playlist in (playlist_A, playlist_B):
    for _columna in columnas_numericas:
        _playlist[_columna] = _playlist[_columna].astype(float)



**Juntemos todas las playlist en una sola base**

In [None]:
# Tag every row with the short label of the playlist it belongs to
# (this adds a 'playlist' column to each DataFrame in place).
playlist_A['playlist'] = playlist_A_shortname
playlist_B['playlist'] = playlist_B_shortname

# Stack both playlists into a single DataFrame, one row per track.
playlist_juntas = pd.concat([playlist_A, playlist_B])

In [None]:
# Display six randomly chosen rows of the combined table.
playlist_juntas.sample(6)

In [None]:
# Histogram of the 'danceability' column of playlist A, split into 12 bins.
playlist_A.hist(column='danceability', bins=12, grid=True, figsize=(10,8), color='#1DB954', zorder=2, rwidth=0.9)
plt.title("Bailabilidad de " + playlist_A_shortname)

In [None]:
# Same histogram for playlist B, drawn in red to tell the two charts apart.
playlist_B.hist(column='danceability', bins=12, grid=True, figsize=(10,8), color='red', zorder=2, rwidth=0.9)
plt.title("Bailabilidad de " + playlist_B_shortname)



In [None]:
# Use seaborn to overlay the valence distribution of both playlists.
# sns.distplot is deprecated (and removed in recent seaborn releases);
# histplot with stat='density' and kde=True draws the equivalent chart:
# a normalized histogram plus a kernel density estimate.
fig, ax = plt.subplots()

sns.histplot(playlist_A['valence'], kde=True, stat='density',
             color='#1DB954', label=playlist_A_shortname, ax=ax)
sns.histplot(playlist_B['valence'], kde=True, stat='density',
             color='red', label=playlist_B_shortname, ax=ax)

ax.set_title("Comparación de la valencia musical")
ax.set_ylabel("Frecuencia Relativa")
ax.legend()

plt.show()

In [None]:
# Scatter plot of valence against energy for the combined table,
# with the points colored by the playlist they come from.
g = sns.FacetGrid(playlist_juntas, height=6, aspect=1)
g.map_dataframe(sns.scatterplot, x="valence", y="energy", hue="playlist")
g.add_legend()

In [None]:
# Overlaid valence histograms, one color per playlist.
# FacetGrid's `size` argument was renamed to `height`; current seaborn
# versions reject `size`.
g = sns.FacetGrid(playlist_juntas, hue='playlist', palette='coolwarm', height=4, aspect=1)

g.map(plt.hist, 'valence', bins=20, alpha=0.7)
# Without a legend the two colors cannot be told apart.
g.add_legend()

In [None]:
# Box plots of five audio features of playlist A, side by side.
playlist_A.boxplot(column=['valence', 'danceability', 'energy', 'speechiness', 'acousticness', ],figsize=(15,3));

In [None]:
# Box plots of five audio features of playlist A.
# NOTE(review): this call is identical to the one in the preceding cell; one of the two can go.
playlist_A.boxplot(column=['valence', 'danceability', 'energy', 'speechiness', 'acousticness', ],figsize=(15,3));

In [None]:
# Box plots of the same five audio features for playlist B.
playlist_B.boxplot(column=['valence', 'danceability', 'energy', 'speechiness', 'acousticness', ],figsize=(15,3));

**Exploremos las medias de cada caracteristica**

In [None]:
# Audio-feature columns to average. The list has to exist at notebook level:
# without it this cell fails with NameError on a fresh kernel.
columnas = ['valence', 'speechiness', 'instrumentalness', 'energy', 'danceability', 'acousticness']

# Bar chart with the mean of each feature for playlist A.
ax = playlist_A[columnas].mean().plot(kind='bar')
ax.set_title("Media de las caracteristicas musicales de " + playlist_A_shortname);

In [None]:
# Audio-feature columns to average. Defined here so the cell runs on its own
# instead of failing with NameError.
columnas = ['valence', 'speechiness', 'instrumentalness', 'energy', 'danceability', 'acousticness']

# Bar chart with the mean of each feature for playlist B.
ax = playlist_B[columnas].mean().plot(kind='bar')
ax.set_title("Media de las caracteristicas musicales de " + playlist_B_shortname);

**¿Puedo hacer un resumen de la media de cada una de las caracteristicas de los playlist?**

In [None]:
# Helper that averages the audio features of a playlist.

def playlist_features_to_mean(playlist, playlist_name):
    """Return a one-row DataFrame with the mean of each audio feature.

    Args:
        playlist: DataFrame containing the columns 'valence', 'speechiness',
            'instrumentalness', 'energy', 'danceability' and 'acousticness'.
        playlist_name: Label stored in the extra 'playlist' column so rows of
            different playlists can be told apart after concatenation.

    Returns:
        DataFrame with a single row: the six feature means (arithmetic mean,
        not the median) followed by the 'playlist' label.
    """
    feature_columns = [
        'valence',
        'speechiness',
        'instrumentalness',
        'energy',
        'danceability',
        'acousticness',
    ]
    # .mean() yields a Series indexed by feature name; turn it into a frame
    # and flip it so that every feature becomes a column of one single row.
    feature_means = playlist[feature_columns].mean()
    one_row = feature_means.to_frame().T
    # Append the label as the last column.
    return one_row.assign(playlist=playlist_name)

In [None]:
# One-row table with the mean of each audio feature of playlist A.
playlist_A_means = playlist_features_to_mean(playlist_A, playlist_A_shortname)

In [None]:
# One-row table with the mean of each audio feature of playlist B.
playlist_B_means = playlist_features_to_mean(playlist_B, playlist_B_shortname)

In [None]:
# Display the feature means of playlist A.
playlist_A_means

In [None]:
# Display the feature means of playlist B.
playlist_B_means

**¿Puedo juntar las dos dataframes?**

In [None]:
# Stack the two one-row tables into a single DataFrame (one row per playlist)
# and display it.
all_means = pd.concat([playlist_A_means,playlist_B_means])
all_means

In [None]:
# Use the playlist label as the row index.
# NOTE(review): set_index consumes the 'playlist' column, so running this cell
# a second time without rebuilding all_means raises KeyError.
all_means = all_means.set_index('playlist')

In [None]:
# Grouped bar chart: one group per playlist, one bar per audio feature.
all_means.plot(kind='bar', figsize=(10,8))
plt.legend(loc='upper left', ncol=1, title="Caracteristica", bbox_to_anchor=(1.0, 0.5))
plt.title("Medias de las caracteristicas musicales")
plt.xlabel("Playlist")
plt.ylabel("Media")
# plt.show() renders the figure. The bare plt.plot() used before drew nothing
# and only echoed an empty list as the cell output.
plt.show()

**Podemos usar transpose para cambiar los ejes de la tabla y del chart**

In [None]:
# Transposed view: audio features become rows and playlists become columns.
all_means.transpose()

In [None]:
# Grouped bar chart of the transposed table: one group per audio feature,
# one bar per playlist.
all_means.transpose().plot(kind='bar')
# After transposing, the legend entries are the playlists, so the legend is
# titled accordingly (it previously said "Caracteristica").
plt.legend(loc='upper left', ncol=1, title="Playlist", bbox_to_anchor=(1.0, 0.5))
# plt.show() renders the figure; a bare plt.plot() draws nothing.
plt.show()

# Ejercicio
Unifica todos los **playlists** en un único playlist llamado **allplaylist**, con una nueva columna que tenga el nombre del playlist

In [None]:
# Reference solution for the exercise (the bare "allplaylist =" placeholder is
# a syntax error and leaves every following cell without its input).
# Each playlist gets a 'playlist' column with its short label, the two tables
# are stacked, and the track name is moved from the index to a regular column
# so that the resulting index has no repeated labels.
allplaylist = pd.concat([
    playlist_A.assign(playlist=playlist_A_shortname),
    playlist_B.assign(playlist=playlist_B_shortname),
]).reset_index()

In [None]:

# Box plot of valence, one box per playlist.
sns.boxplot(y=allplaylist['valence'], x=allplaylist['playlist']) 

plt.show()

In [None]:
# Horizontal box plot of popularity, one box per playlist.
ax = sns.boxplot(data=allplaylist, x=allplaylist['popularity'], orient="h", y="playlist")

In [None]:
# Same comparison drawn with pandas: popularity grouped by playlist.
allplaylist.boxplot(column=['popularity', ], by='playlist', figsize=(10,6),  fontsize=12);
plt.show()

In [None]:

# Box plots of four audio features, each one grouped by playlist.
allplaylist.boxplot(column=['energy', 'acousticness','valence', 'danceability' ], by='playlist', figsize=(30,9) );
plt.show()

In [None]:
# A scatter plot can encode a third variable: here popularity sets the color.
# The axes are created first and handed to pandas through `ax=`; otherwise
# pandas opens its own figure and the one created here stays empty.
fig, ax = plt.subplots(figsize=(10, 10), dpi=120)

# `by` is not a scatter option (pandas documents it for hist and box plots
# only), so it is not passed; use hue/style in seaborn to split by playlist.
allplaylist.plot.scatter(
    x='valence',
    y='danceability',
    c='popularity',
    colormap='viridis',
    ax=ax,
);

In [None]:
# Energy against popularity; color and marker shape both identify the playlist.
plt.figure(figsize=(10,10), dpi=120)

sns.scatterplot(x="energy", y="popularity", data=allplaylist, hue="playlist", style="playlist")
