In [None]:
import sqlite3
import pandas as pd

In [None]:
# Notebook configuration
# How many of the most popular tracks the extraction query keeps.
SAMPLE_SIZE = 200_000
# Location of the SQLite database file the queries read from.
PATH_DB = './spotify.sqlite'
# Lift pandas' display truncation limits so rendered frames show every row,
# every column and the full content of each cell.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [None]:
# Inspect the database schema: one row per table, holding the table name and
# the CREATE statement stored for it in sqlite_master.
# The URI form with mode=ro opens the file read-only, so this cell cannot
# modify the database.
con = sqlite3.connect(f'file:{PATH_DB}?mode=ro', uri=True)
try:
    df_tables = pd.read_sql_query('''
    SELECT
        tbl_name, sql
    FROM
        sqlite_master
    WHERE
        type = 'table'
''', con)
finally:
    # Release the connection even when the query raises. A `with` block on a
    # sqlite3 connection only manages transactions and would leave it open,
    # hence the explicit try/finally.
    con.close()
# Uncomment the line below to see all the tables and their columns
#df_tables

In [None]:
# Build the sample: the SAMPLE_SIZE most popular tracks, each joined to its
# audio features and to comma-separated lists describing its artists, albums
# and artist genres. The GROUP BY yields one output row per track id.
#
# Tracks without a matching audio_features_indexed row are dropped by the
# INNER JOIN, so the result may hold fewer than SAMPLE_SIZE rows.
#
# NOTE(review): every GROUP_CONCAT(DISTINCT ...) de-duplicates its own column
# independently, so the artist_* (and album_*) lists of one track are not
# guaranteed to have the same length or to be position-aligned -- e.g. two
# artists sharing the same popularity collapse into a single entry. Names that
# contain a comma also make the concatenated lists ambiguous to split.
con = sqlite3.connect(f'file:{PATH_DB}?mode=ro', uri=True)
try:
    # Decode every TEXT value as Latin-1, which accepts any byte sequence and
    # therefore never raises a decode error.
    # NOTE(review): if the database stores UTF-8 text, non-ASCII names will
    # come out garbled -- confirm the file's actual encoding.
    con.text_factory = lambda x: str(x, 'latin1')
    # SAMPLE_SIZE is bound as a query parameter (the `?` after LIMIT) instead
    # of being spliced into the SQL text.
    df = pd.read_sql_query('''
    SELECT
        t.id,
        t.name,
        t.duration,
        t.explicit,
        t.popularity,
        af.analysis_url,
        af.loudness,
        af.tempo,
        af.time_signature,
        af.key,
        af.mode,
        af.acousticness,
        af.danceability,
        af.energy,
        af.instrumentalness,
        af.liveness,
        af.speechiness,
        af.valence,
        GROUP_CONCAT(DISTINCT art.id) AS artist_ids,
        GROUP_CONCAT(DISTINCT art.name) AS artist_names,
        GROUP_CONCAT(DISTINCT art.followers) AS artist_followers,
        GROUP_CONCAT(DISTINCT art.popularity) AS artist_popularity,
        GROUP_CONCAT(DISTINCT alb.id) AS album_ids,
        GROUP_CONCAT(DISTINCT alb.name) AS album_names,
        GROUP_CONCAT(DISTINCT alb.album_type) AS album_types,
        GROUP_CONCAT(DISTINCT alb.release_date) AS album_release_dates,
        GROUP_CONCAT(DISTINCT alb.popularity) AS album_popularities,
        GROUP_CONCAT(DISTINCT g.id) AS genres
    FROM
        (
            SELECT
                *
            FROM
                tracks_indexed
            ORDER BY
                popularity DESC
            LIMIT
                ?
        ) AS t
    INNER JOIN
        audio_features_indexed AS af ON t.id = af.id
    LEFT JOIN
        r_track_artist AS rta ON t.id = rta.track_id
    LEFT JOIN
        artists_indexed AS art ON rta.artist_id = art.id
    LEFT JOIN
        r_albums_tracks AS rat ON t.id = rat.track_id
    LEFT JOIN
        albums_indexed AS alb ON rat.album_id = alb.id
    LEFT JOIN
        r_artist_genre AS rag ON rag.artist_id = art.id
    LEFT JOIN
        genres AS g ON rag.genre_id = g.id
    GROUP BY
        t.id
''', con, params=(SAMPLE_SIZE,))
finally:
    # Release the connection even when the (long-running) query fails.
    con.close()
df.head()

In [None]:
# Write the sample to a CSV file in the working directory. The file name
# embeds the requested sample size; the DataFrame index is not written.
df.to_csv(
    path_or_buf=f'top_{SAMPLE_SIZE}_songs.csv',
    index=False,
)