# Procesamiento de datos de spotify
En este notebook se realizará la consulta y procesamiento de los datos que serán utilizados para el proyecto

In [1]:
import glob
import pandas as pd
import numpy as np

In [2]:
path = "user_streaming_history"
csv_files = glob.glob(path + "/*.json")

# Read each CSV file into DataFrame
# This creates a list of dataframes
df_list = (pd.read_json(file) for file in csv_files)

# Concatenate all DataFrames
big_df   = pd.concat(df_list, ignore_index=True)

In [3]:
big_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51189 entries, 0 to 51188
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   ts                                 51189 non-null  object
 1   username                           51189 non-null  object
 2   platform                           51189 non-null  object
 3   ms_played                          51189 non-null  int64 
 4   conn_country                       51189 non-null  object
 5   ip_addr_decrypted                  51189 non-null  object
 6   user_agent_decrypted               51189 non-null  object
 7   master_metadata_track_name         51186 non-null  object
 8   master_metadata_album_artist_name  51186 non-null  object
 9   master_metadata_album_album_name   51186 non-null  object
 10  spotify_track_uri                  51186 non-null  object
 11  episode_name                       3 non-null      object
 12  epis

## Transformación y limpieza del dataset original

In [4]:
big_df[["ts"]] = big_df[["ts"]].apply(pd.to_datetime)
big_df["year_played"] = big_df["ts"].dt.year
big_df["month_played"] = big_df["ts"].dt.month
big_df["min"] = big_df["ms_played"]/60000
big_df=big_df.rename(columns={"ts": "timestamp", "conn_country": "country", "master_metadata_track_name": "song_name", "master_metadata_album_artist_name": "artist_name"})
column_filtered_df = big_df[["timestamp","year_played","month_played","min","country", "song_name", "artist_name","spotify_track_uri"]]

In [60]:
row_filtered_df = column_filtered_df.dropna()
row_filtered_df = row_filtered_df.loc[row_filtered_df["year_played"]==2023]
row_filtered_df = row_filtered_df.loc[row_filtered_df["min"]>(30/60)]

## Integración API

In [6]:
import sys
!{sys.executable} -m pip install spotipy



In [35]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
client_id = ""
client_secret = ""
auth_manager = SpotifyClientCredentials(client_id=client_id,client_secret=client_secret)
sp = spotipy.Spotify(auth_manager=auth_manager)

def get_complementaries(tracks_uris):
    tracks_df = pd.DataFrame(columns = ["track_uri","artist_uri","year","track_pop"])
    artists_df = pd.DataFrame(columns = ["artist_uri","genre","artist_pop"])
    artist_uris = set()
    tracks_uris_chunks = [tracks_uris[x:x+30] for x in range(0, len(tracks_uris), 30)]
    for tracks_uris_chunk in tracks_uris_chunks:
        tracks = sp.tracks(tracks_uris_chunk)
        for track in tracks.get("tracks"):
            track_uri = track.get("uri")
            artist_uri = None
            artists = track.get("artists")
            if artists:
                artist_uri = artists[0].get("uri")
                artist_uris.add(artist_uri)
            popularity = track.get("popularity")
            album = track.get("album")
            release_date = album.get("release_date")
            year = release_date[0:4] if release_date else None
            tracks_df = pd.concat([tracks_df, pd.Series({"track_uri":track_uri,"artist_uri":artist_uri,"year":year,"track_pop":popularity}).to_frame().T], ignore_index=True)
    artist_uris = list(artist_uris)
    artists_uris_chunks = [artist_uris[x:x+30] for x in range(0, len(artist_uris), 30)]
    for artists_uris_chunk in artists_uris_chunks:
        artists = sp.artists(artists_uris_chunk)
        for artist in artists.get("artists"):
            artist_uri = artist.get("uri")
            genres = artist.get("genres")
            genre = genres[0] if genres else None
            popularity = artist.get("popularity")
            artists_df = pd.concat([artists_df, pd.Series({"artist_uri":artist_uri,"genre":genre,"artist_pop":popularity}).to_frame().T], ignore_index=True)
    result = pd.merge(tracks_df, artists_df, on="artist_uri")
    return result

In [27]:
track_uri_list = row_filtered_df.groupby(["song_name", "artist_name"])["spotify_track_uri"].first().reset_index()
track_uri_list = track_uri_list.rename(columns={"spotify_track_uri":"track_uri"})

In [36]:
test_df = track_uri_list.head(300)
uris=list(test_df["track_uri"])
aux_df = get_complementaries(uris)

In [37]:
pd.merge(test_df, aux_df, on="track_uri")

Unnamed: 0,song_name,artist_name,track_uri,artist_uri,year,track_pop,genre,artist_pop
0,!Viva el Mal Viva el Capital!,Elektroduendes,spotify:track:6LrefSCg9Vnn1ckCeJkmbA,spotify:artist:7oboTHVarRYloS0WJxGSJW,2004,4,spanish punk,7
1,"""COSAS DEL QUERER""",Rxnde Akozta,spotify:track:6vbUg2l6GvTwCMqnalorz0,spotify:artist:5YO3yQx97KLLE9zJYa4jaU,2023,42,boom bap espanol,42
2,"""F""",MAXIMUM THE HORMONE,spotify:track:51br5d5nIel2Dm0PNP5X4N,spotify:artist:3AMut7lAb1JjINkn8Fmkhu,2013,45,japanese metalcore,51
3,"""Liar, Liar""",The Castaways,spotify:track:4ggveFmRU0REGh6bVwxHqg,spotify:artist:3PbCnPQUMUHJXAI9g2Ld7V,1965,24,beach music,28
4,#Ysya2020 Vol. 5 - Silbando,YSY A,spotify:track:6TI490muJ6RgFsYJc626BZ,spotify:artist:2qWK8K2Jfh67UqtwY8tCW6,2020,63,argentine hip hop,74
...,...,...,...,...,...,...,...,...
295,Abriendo Trocha,Kiño,spotify:track:7GHu2XWZvQP7B3Ps5xzVlv,spotify:artist:4mSxhskrhEcz1mZNVf1GWa,2019,16,colombian hip hop,42
296,Abrázame Muy Fuerte,Juan Gabriel,spotify:track:2nejvFyJeTDtMRP2nUMt0J,spotify:artist:2MRBDr0crHWE5JwPceFncq,2000,67,cancion melodica,73
297,Absentee,Cass McCombs,spotify:track:23fEZLdzHoZlNZZu4E1pSp,spotify:artist:2iUVQjheBnvOt8vaBrxXJz,2019,51,baroque pop,43
298,Absinthe and Rue,Symphony X,spotify:track:77nnNbzXAAY3vRPwUMfioM,spotify:artist:4MnZkh4dpNmTMPxkl4Ev5L,1994,11,neo classical metal,44


## Agregación de registros por canción

In [64]:
df_by_song = pd.pivot_table(
   row_filtered_df,
   index=["country","song_name", "artist_name"],
   aggfunc={'min': np.sum, 'country': len}
).rename(columns={'country': 'count'})
df_by_song_month.sort_values(by=['min', 'count'],ascending=False, inplace=True)
df_by_song_month.reset_index(inplace=True)

In [65]:
df_by_song.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 5168 entries, ('CO', '!Viva el Mal Viva el Capital!', 'Elektroduendes') to ('CO', '黄昏のBAY CITY', 'Junko Yagami')
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   count   5168 non-null   int64  
 1   min     5168 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 162.5+ KB


In [66]:
df_by_song

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,min
country,song_name,artist_name,Unnamed: 3_level_1,Unnamed: 4_level_1
CO,!Viva el Mal Viva el Capital!,Elektroduendes,2,4.039000
CO,#Ysya2020 Vol. 5 - Silbando,YSY A,1,0.602483
CO,'Merican,Descendents,2,3.706667
CO,(It Goes Like) Nanana - Edit,Peggy Gou,11,32.939850
CO,(Sittin' On) the Dock of the Bay,Otis Redding,1,2.729250
CO,...,...,...,...
CO,自爆,Utsu-P,2,2.219700
CO,遥か彼方,ASIAN KUNG-FU GENERATION,9,20.658300
CO,銀翼の凶星 〜 バルファルク,裏谷玲央,1,2.435317
CO,閃光,[Alexandros],1,0.960850
