# Objetivo

O objetivo final aqui é gerar o csv que vai servir pro dashboard no Tableau

O que eu quero é:
1. Pegar os dados que vieram classificados do Label Studio
2. Organizar o user_data levando em conta a tag dos tracks
3. Finalizar com um único csv tendo toda a informação que preciso

Para isso vou:
1. Organizar o JSON que agrupa as tags em gêneros
2. Organizar o JSON que classifica o gênero das músicas que não tinham sido encontradas no Last.fm
3. Substituir as tags de cada música pelo gênero
4. Sobrescrever as tags de todas as músicas de cada artista pelo gênero, caso o artista tenha sido classificado em "classified_artists.json"
5. Juntar tudo num CSV compacto para exportar

In [1]:
import pandas as pd
import sqlite3
import json

In [2]:
# Open a connection to the local SQLite database file 'music.db'.
# NOTE(review): the connection is never closed in the visible cells.
conn = sqlite3.connect('music.db')

## Dados gerais

In [3]:
# Select every user_data row together with a tag name looked up through
# artists -> tracks -> top_track_tags -> tags.
# Every join is a LEFT JOIN, so user_data rows are kept even when no artist,
# track or tag matches; tag_name is NULL for those rows.
# Tracks are matched by name AND by the mbid of the matched artist.
# NOTE(review): if a track has several rows in top_track_tags, this query
# returns one output row per tag -- confirm that table holds one tag per track.
query = """
SELECT
    end_time, artist_name, track_name, ms_played, tags.tag_name
FROM
    user_data
LEFT JOIN
    artists
    ON user_data.artist_name = artists.name
LEFT JOIN
    tracks
    ON user_data.track_name = tracks.name AND artists.mbid = tracks.artist_mbid
LEFT JOIN
    top_track_tags ON tracks.mbid = top_track_tags.track_mbid
LEFT JOIN
    tags ON top_track_tags.tag_id = tags.rowid
"""

# Run the query on the open connection and load the result into a DataFrame.
df = pd.read_sql(query, con=conn)
# Preview the first rows (notebook display).
df.head()

Unnamed: 0,end_time,artist_name,track_name,ms_played,tag_name
0,2020-10-02 23:34:00,Lenny Kravitz,Fly Away,221332,rock
1,2020-10-02 23:36:00,The Yardbirds,For Your Love,150626,classic rock
2,2020-10-02 23:40:00,Fleetwood Mac,Little Lies - 2017 Remaster,221053,80s
3,2020-10-02 23:45:00,Eagles,One of These Nights - 2013 Remaster,291685,classic rock
4,2020-10-02 23:50:00,Audioslave,Show Me How to Live,277840,rock


## Lendo os dados classificados que relacionam tags e gêneros

In [4]:
# Load the classified tag -> genre annotations from disk.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the
# result does not depend on the platform's default locale encoding.
with open('classified_top_tags.json', 'r', encoding='utf-8') as f:
    # The file holds a list of dictionaries, one per annotated item.
    top_tag_genres = json.load(f)

In [5]:
# Flatten every annotated item into a {'tag_name', 'genre'} record.
# Only the first choice of the first result of the first annotation is read.
tag_to_genre_data = [
    {
        'tag_name': task['data']['item']['tag_name'],
        'genre': task['annotations'][0]['result'][0]['value']['choices'][0],
    }
    for task in top_tag_genres
]

# Lookup table with one row per classified tag.
tag_to_genre = pd.DataFrame(tag_to_genre_data)
tag_to_genre

Unnamed: 0,tag_name,genre
0,50s,Other
1,goeiepoep,Other
2,liquid funk,Other
3,2pac,Rap
4,seen live,Other
...,...,...
161,80s,Rock
162,hard rock,Rock
163,Progressive rock,Rock
164,rock,Rock


## Lendo os dados dos artistas previamente sem tags

In [6]:
# Load the classified artist -> genre annotations from disk.
# The encoding is pinned to UTF-8 (the JSON interchange encoding): artist
# names can contain non-ASCII characters, which would be mis-decoded on
# platforms whose default locale encoding is not UTF-8.
with open('classified_artists.json', 'r', encoding='utf-8') as f:
    # The file holds a list of dictionaries, one per annotated item.
    untagged_artists_genres = json.load(f)

In [7]:
# Flatten every annotated item into an {'artist_name', 'genre'} record.
# Only the first choice of the first result of the first annotation is read.
artist_to_genre_data = [
    {
        'artist_name': task['data']['item']['artist_name'],
        'genre': task['annotations'][0]['result'][0]['value']['choices'][0],
    }
    for task in untagged_artists_genres
]

# Lookup table with one row per classified artist.
artist_to_genre = pd.DataFrame(artist_to_genre_data)
artist_to_genre

Unnamed: 0,artist_name,genre
0,Florence Bellon,Classical
1,Snarky Puppy,Electronic
2,A Beacon School,Electronic
3,Antonio Reyes,Other
4,Warren Haynes,Rock
...,...,...
380,Blue Öyster Cult,Rock
381,Led Zeppelin,Rock
382,Mikel,LoFi
383,Nujabes,Jazz


In [13]:
# Left-join the tag -> genre table and then the artist -> genre table, so each
# row picks up whatever genre information exists for it. Both lookup tables
# carry a 'genre' column, so the second merge yields genre_x (from the tag)
# and genre_y (from the artist).
# The row count is printed before and after to confirm it is unchanged: rows
# without a genre must survive as NULL here, since they have not been
# classified yet and are meant to default to 'Other'.
print(len(df))
with_tag_genre = df.merge(tag_to_genre, on='tag_name', how='left')
merged = with_tag_genre.merge(artist_to_genre, on='artist_name', how='left')
print(len(merged))
merged

56907
56907


Unnamed: 0,end_time,artist_name,track_name,ms_played,tag_name,genre_x,genre_y
0,2020-10-02 23:34:00,Lenny Kravitz,Fly Away,221332,rock,Rock,
1,2020-10-02 23:36:00,The Yardbirds,For Your Love,150626,classic rock,Rock,
2,2020-10-02 23:40:00,Fleetwood Mac,Little Lies - 2017 Remaster,221053,80s,Rock,Rock
3,2020-10-02 23:45:00,Eagles,One of These Nights - 2013 Remaster,291685,classic rock,Rock,Rock
4,2020-10-02 23:50:00,Audioslave,Show Me How to Live,277840,rock,Rock,Rock
...,...,...,...,...,...,...,...
56902,2020-12-15 14:29:00,String Player Gamer,"Theme Of Morroc (from ""Ragnarok Online"")",27309,,,
56903,2020-12-18 18:56:00,String Player Gamer,"Theme Of Morroc (from ""Ragnarok Online"")",8800,,,
56904,2021-01-02 17:47:00,Nas,Got Ur Self A Gun,58350,,,Rap
56905,2021-01-03 19:58:00,Nas,Watch Dem Niggas (feat. Foxy Brown),231733,,,Rap


In [14]:
# Collapse genre_x (derived from the track tag) and genre_y (derived from the
# artist name) into a single 'genre' column. The artist-level genre wins
# whenever it is present; otherwise the tag-level genre is used, and rows with
# neither stay missing.
# This is done column-wise with fillna instead of calling a Python function
# once per row, which is the idiomatic (and much faster) pandas form.
merged['genre'] = merged['genre_y'].fillna(merged['genre_x'])

# Drop the intermediate columns that are no longer needed.
merged.drop(columns=['genre_x', 'genre_y', 'tag_name'], inplace=True)

In [19]:
# Display the frame (notebook output) to inspect the consolidated 'genre' column.
merged

Unnamed: 0,end_time,artist_name,track_name,ms_played,genre
0,2020-10-02 23:34:00,Lenny Kravitz,Fly Away,221332,Rock
1,2020-10-02 23:36:00,The Yardbirds,For Your Love,150626,Rock
2,2020-10-02 23:40:00,Fleetwood Mac,Little Lies - 2017 Remaster,221053,Rock
3,2020-10-02 23:45:00,Eagles,One of These Nights - 2013 Remaster,291685,Rock
4,2020-10-02 23:50:00,Audioslave,Show Me How to Live,277840,Rock
...,...,...,...,...,...
56902,2020-12-15 14:29:00,String Player Gamer,"Theme Of Morroc (from ""Ragnarok Online"")",27309,
56903,2020-12-18 18:56:00,String Player Gamer,"Theme Of Morroc (from ""Ragnarok Online"")",8800,
56904,2021-01-02 17:47:00,Nas,Got Ur Self A Gun,58350,Rap
56905,2021-01-03 19:58:00,Nas,Watch Dem Niggas (feat. Foxy Brown),231733,Rap


In [21]:
# Rows that still have no genre fall back to 'Other'.
merged['genre'] = merged['genre'].fillna('Other')

In [22]:
# Write the final table to 'my_spotify_data.csv'; index=False leaves the
# DataFrame index out of the file.
merged.to_csv('my_spotify_data.csv', index=False)