# Procesamiento de datos de Spotify
En este notebook se realizan la consulta y el procesamiento de los datos que se utilizarán en el proyecto.

In [1]:
import glob
import pandas as pd
import numpy as np

In [2]:
path = "user_streaming_history"
# The streaming history export is made of JSON files (the `csv_files` name is
# kept for compatibility with any code that refers to it). glob returns
# matches in arbitrary order, so sort them to make the concatenation order,
# and therefore the resulting row index, reproducible between runs.
csv_files = sorted(glob.glob(path + "/*.json"))
if not csv_files:
    # Fail early with a clear message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No JSON files found in '{path}'")

# Lazily read each JSON file into a DataFrame.
# This is a generator (not a list): it is consumed once by pd.concat below.
df_list = (pd.read_json(file) for file in csv_files)

# Concatenate all DataFrames into one frame with a fresh 0..n-1 index
big_df   = pd.concat(df_list, ignore_index=True)

In [3]:
# Summarise the concatenated frame: column names, non-null counts and dtypes.
big_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51189 entries, 0 to 51188
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   ts                                 51189 non-null  object
 1   username                           51189 non-null  object
 2   platform                           51189 non-null  object
 3   ms_played                          51189 non-null  int64 
 4   conn_country                       51189 non-null  object
 5   ip_addr_decrypted                  51189 non-null  object
 6   user_agent_decrypted               51189 non-null  object
 7   master_metadata_track_name         51186 non-null  object
 8   master_metadata_album_artist_name  51186 non-null  object
 9   master_metadata_album_album_name   51186 non-null  object
 10  spotify_track_uri                  51186 non-null  object
 11  episode_name                       3 non-null      object
 12  epis

## Transformación y limpieza del dataset original

In [4]:
# Parse the raw timestamp column, then derive the calendar fields and the
# listening time in minutes (ms_played is in milliseconds: 60000 ms = 1 min).
big_df["ts"] = pd.to_datetime(big_df["ts"])
played_at = big_df["ts"].dt
big_df["year_played"] = played_at.year
big_df["month_played"] = played_at.month
big_df["min"] = big_df["ms_played"].div(60000)

# Replace the verbose export column names with shorter ones.
friendly_names = {
    "ts": "timestamp",
    "conn_country": "country",
    "master_metadata_track_name": "song_name",
    "master_metadata_album_artist_name": "artist_name",
}
big_df = big_df.rename(columns=friendly_names)

# Keep only the columns used by the rest of the notebook.
kept_columns = [
    "timestamp",
    "year_played",
    "month_played",
    "min",
    "country",
    "song_name",
    "artist_name",
    "spotify_track_uri",
]
column_filtered_df = big_df[kept_columns]

In [5]:
# Drop rows with any missing value, then keep only plays from 2023 that have
# a strictly positive listening time.
complete_rows = column_filtered_df.dropna()
played_in_2023 = complete_rows["year_played"] == 2023
has_play_time = complete_rows["min"] > 0
row_filtered_df = complete_rows.loc[played_in_2023 & has_play_time]

In [26]:
# Display the cleaned and filtered frame as the cell output.
row_filtered_df

Unnamed: 0,timestamp,year_played,month_played,min,country,song_name,artist_name,spotify_track_uri
2448,2023-01-01 03:03:57+00:00,2023,1,3.715267,CO,Diciembre,Eslabon Armado,spotify:track:5kPEpa2Z7jWjfA6YMq9CGE
2449,2023-01-01 03:07:12+00:00,2023,1,3.231250,CO,Buena Vibra,Jose Villarreal,spotify:track:6Y2qNukfWKFGbAv2iONhvy
2450,2023-01-01 03:10:08+00:00,2023,1,2.921600,CO,El Gavilán,Luis R Conriquez,spotify:track:4ygQ5wSuzzNRevVjup1qV2
2451,2023-01-01 03:12:41+00:00,2023,1,2.533350,CO,4:00 A.M.,Marco Soriano,spotify:track:3ltvVEGIsoN9FXHmrfIclp
2452,2023-01-01 03:15:44+00:00,2023,1,3.031917,CO,Amanecer,Absa G.,spotify:track:5w5TXv6P9WNf1bnlu7nbqK
...,...,...,...,...,...,...,...,...
51183,2023-09-01 05:45:55+00:00,2023,9,0.047800,CO,Super Smash Bros Brawl Drill Beat (Life Ain't ...,Shae OT,spotify:track:73PPGDE65hW7SaZ1zO5mud
51185,2023-09-01 05:46:00+00:00,2023,9,0.061083,CO,HOLIDAY,Lil Nas X,spotify:track:6zFMeegAMYQo0mt8rXtrli
51186,2023-09-01 05:46:08+00:00,2023,9,0.127717,CO,Dont Like,Rkm Legend,spotify:track:3aopgnjfN8lBngYzXinNyx
51187,2023-09-01 05:46:45+00:00,2023,9,0.622500,CO,6IXSPEED,7oh2,spotify:track:56Kb55Pzodbj5cGJcyZW1W


## Agregación por mes

In [6]:
# Number of plays ("count") and total minutes listened ("min") per country and
# month, ordered from most to least listened.
#
# Named aggregation replaces the previous pivot_table call: it avoids passing
# np.sum as a callable (deprecated in recent pandas, which asks for the string
# "sum" instead) and counts rows with "size" instead of running len() over the
# grouping key itself. Chaining also avoids inplace=True mutations, so the cell
# can be re-run safely.
df_by_month = (
    row_filtered_df
    .groupby(["country", "month_played"])
    .agg(count=("min", "size"), min=("min", "sum"))
    .sort_values(by=["min", "count"], ascending=False)
    .reset_index()
)

In [7]:
# Show up to the first 12 rows of the monthly aggregation.
df_by_month.head(12)

Unnamed: 0,country,month_played,count,min
0,CO,5,6029,5773.560783
1,CO,8,4560,5004.908217
2,CO,1,4480,4742.0083
3,CO,6,3082,4125.589583
4,CO,4,3590,4003.57965
5,CO,7,3446,3871.57155
6,CO,10,3049,3808.322417
7,CO,3,2890,3447.676717
8,CO,9,3148,3401.5362
9,CO,12,2188,3198.0873


## Agregación de registros por canción

In [8]:
# Number of plays ("count") and total minutes listened ("min") per country,
# song and artist, ordered from most to least listened.
#
# Named aggregation replaces the previous pivot_table call: it avoids passing
# np.sum as a callable (deprecated in recent pandas, which asks for the string
# "sum" instead) and counts rows with "size" instead of running len() over the
# grouping key itself. Chaining also avoids inplace=True mutations, so the cell
# can be re-run safely.
df_by_song = (
    row_filtered_df
    .groupby(["country", "song_name", "artist_name"])
    .agg(count=("min", "size"), min=("min", "sum"))
    .sort_values(by=["min", "count"], ascending=False)
    .reset_index()
)

In [9]:
# Summarise the per-song aggregation: row count, columns and dtypes.
df_by_song.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11338 entries, 0 to 11337
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   country      11338 non-null  object 
 1   song_name    11338 non-null  object 
 2   artist_name  11338 non-null  object 
 3   count        11338 non-null  int64  
 4   min          11338 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 443.0+ KB


In [10]:
# Number of plays ("count") and total minutes listened ("min") per country,
# song, artist and month, ordered from most to least listened.
#
# Named aggregation replaces the previous pivot_table call: it avoids passing
# np.sum as a callable (deprecated in recent pandas, which asks for the string
# "sum" instead) and counts rows with "size" instead of running len() over the
# grouping key itself. Chaining also avoids inplace=True mutations, so the cell
# can be re-run safely.
df_by_song_month = (
    row_filtered_df
    .groupby(["country", "song_name", "artist_name", "month_played"])
    .agg(count=("min", "size"), min=("min", "sum"))
    .sort_values(by=["min", "count"], ascending=False)
    .reset_index()
)

In [11]:
# Summarise the per-song, per-month aggregation: row count, columns and dtypes.
df_by_song_month.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21885 entries, 0 to 21884
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country       21885 non-null  object 
 1   song_name     21885 non-null  object 
 2   artist_name   21885 non-null  object 
 3   month_played  21885 non-null  int64  
 4   count         21885 non-null  int64  
 5   min           21885 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 1.0+ MB


## Agregación de registros por artista

In [12]:
# Number of plays ("count") and total minutes listened ("min") per country and
# artist, ordered from most to least listened.
#
# Named aggregation replaces the previous pivot_table call: it avoids passing
# np.sum as a callable (deprecated in recent pandas, which asks for the string
# "sum" instead) and counts rows with "size" instead of running len() over the
# grouping key itself. Chaining also avoids inplace=True mutations, so the cell
# can be re-run safely.
df_by_artist = (
    row_filtered_df
    .groupby(["country", "artist_name"])
    .agg(count=("min", "size"), min=("min", "sum"))
    .sort_values(by=["min", "count"], ascending=False)
    .reset_index()
)

In [13]:
# Summarise the per-artist aggregation: row count, columns and dtypes.
df_by_artist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4118 entries, 0 to 4117
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   country      4118 non-null   object 
 1   artist_name  4118 non-null   object 
 2   count        4118 non-null   int64  
 3   min          4118 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 128.8+ KB


In [14]:
# Number of plays ("count") and total minutes listened ("min") per country,
# artist and month, ordered from most to least listened.
#
# Named aggregation replaces the previous pivot_table call: it avoids passing
# np.sum as a callable (deprecated in recent pandas, which asks for the string
# "sum" instead) and counts rows with "size" instead of running len() over the
# grouping key itself. Chaining also avoids inplace=True mutations, so the cell
# can be re-run safely.
df_by_artist_month = (
    row_filtered_df
    .groupby(["country", "artist_name", "month_played"])
    .agg(count=("min", "size"), min=("min", "sum"))
    .sort_values(by=["min", "count"], ascending=False)
    .reset_index()
)

In [15]:
# Summarise the per-artist, per-month aggregation: row count, columns and dtypes.
df_by_artist_month.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10330 entries, 0 to 10329
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country       10330 non-null  object 
 1   artist_name   10330 non-null  object 
 2   month_played  10330 non-null  int64  
 3   count         10330 non-null  int64  
 4   min           10330 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 403.6+ KB


In [18]:
import sys
!{sys.executable} -m pip install spotipy

Collecting spotipy
  Downloading spotipy-2.24.0-py3-none-any.whl (30 kB)
Collecting redis>=3.5.3
  Downloading redis-5.0.6-py3-none-any.whl (252 kB)
     -------------------------------------- 252.0/252.0 kB 2.2 MB/s eta 0:00:00
Collecting async-timeout>=4.0.3
  Downloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)
Installing collected packages: async-timeout, redis, spotipy
Successfully installed async-timeout-4.0.3 redis-5.0.6 spotipy-2.24.0


In [47]:
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Read the API credentials from the environment so that secrets never have to
# be typed into (or committed with) the notebook. When a variable is unset the
# value falls back to the same empty string the notebook used before.
client_id = os.environ.get("SPOTIPY_CLIENT_ID", "")
client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET", "")

# Client-credentials flow: app-level authentication, no user login involved.
auth_manager = SpotifyClientCredentials(client_id=client_id,client_secret=client_secret)
# Shared API client used by the helper functions defined in this notebook.
sp = spotipy.Spotify(auth_manager=auth_manager)

def get_track_complementaries(track_uri:str):
    """Look up a track with the module-level ``sp`` client and extract extra fields.

    Args:
        track_uri: Track identifier passed unchanged to ``sp.track``.

    Returns:
        A tuple ``(artist_uri, popularity, release_date, release_precision)``:
        the URI of the first listed artist, the track popularity, and the
        release date of the track's album together with that date's precision.
        Any field missing from the response is returned as ``None``.
    """
    track = sp.track(track_uri)
    artists = track.get("artists")
    # Only the first listed artist is used; None when the track lists none.
    artist_uri = artists[0].get("uri") if artists else None
    popularity = track.get("popularity")
    # The release fields live inside the nested "album" object. dict.get does
    # not resolve dotted paths, so looking up "album.release_date" directly on
    # the track always returned None.
    album = track.get("album") or {}
    release_date = album.get("release_date")
    release_precision = album.get("release_date_precision")
    return (artist_uri,popularity,release_date,release_precision)

def get_artist_complementaries(artist_uri:str):
    """Look up an artist with the module-level ``sp`` client.

    Args:
        artist_uri: Artist identifier passed unchanged to ``sp.artist``.

    Returns:
        A tuple ``(genre, popularity)`` where ``genre`` is the first entry of
        the artist's genre list (``None`` when the list is missing or empty)
        and ``popularity`` is the artist's popularity value, if present.
    """
    artist_info = sp.artist(artist_uri)
    genre_list = artist_info.get("genres")
    # Only the first listed genre is kept.
    first_genre = genre_list[0] if genre_list else None
    return (first_genre, artist_info.get("popularity"))

def get_complementaries(track_uri:str):
    """Collect complementary track and artist data for a single track.

    Calls ``get_track_complementaries`` for the track and then
    ``get_artist_complementaries`` for the track's first artist.

    Args:
        track_uri: Track identifier forwarded to ``get_track_complementaries``.

    Returns:
        A tuple ``(genre, rel_date, track_pop, artist_pop)``. ``genre`` and
        ``artist_pop`` are ``None`` when the track lookup yields no artist URI.
    """
    artist_uri, track_pop, rel_date, _rel_date_precision = get_track_complementaries(track_uri)
    if artist_uri is None:
        # The track lookup can legitimately return no artist; skip the artist
        # request instead of sending None to the API client.
        genre, artist_pop = None, None
    else:
        genre, artist_pop = get_artist_complementaries(artist_uri)
    return (genre, rel_date, track_pop, artist_pop)

In [50]:
# Smoke-test the helpers on a single track and print the genre and the
# release date, one per line.
genre, rel_date, track_pop, artist_pop = get_complementaries(
    "spotify:track:5w5TXv6P9WNf1bnlu7nbqK"
)
print(genre, rel_date, sep="\n")

mexican hip hop
