# DS 22/23 2A Project

### Load downloaded Spotify Streaming data
The data is split up in multiple JSON files. We need to all read in all files and merge them into one big DataFrame for further processing.

In [28]:
# Load raw streaming history data
import os
import pandas as pd

DATA_PATH = './data/'

# List of all relevant files
file_list = [
    DATA_PATH + filename for filename in os.listdir(DATA_PATH) if filename != '.gitkeep']

# List of Dataframes for each file
df_list = [pd.read_json(file) for file in file_list]

# Concatenating all df's together
df = pd.concat(df_list)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 174902 entries, 0 to 16487
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   ts                                 174902 non-null  object 
 1   username                           174902 non-null  object 
 2   platform                           174902 non-null  object 
 3   ms_played                          174902 non-null  int64  
 4   conn_country                       174902 non-null  object 
 5   ip_addr_decrypted                  174902 non-null  object 
 6   user_agent_decrypted               172199 non-null  object 
 7   master_metadata_track_name         166423 non-null  object 
 8   master_metadata_album_artist_name  166423 non-null  object 
 9   master_metadata_album_album_name   166423 non-null  object 
 10  spotify_track_uri                  166423 non-null  object 
 11  episode_name                       7856 

## Setup Database and insatntiate engine
The util functions help us to create all required tables for the designed star schema. The used database credentials can be set in the .env file via the following variables:
- DB_DRIVER
- DB_NAME
- DB_USERNAME
- DB_PASSWORD
- DB_SERVER
- DB_PORT

In [29]:
from database import get_sql_engine, create_tables

create_tables()
engine = get_sql_engine()

## Setup Spotify API connection
We make use of the spotipy package to interact with the public spotify web api. This is required to enrich the streaming history data with information on songs, genres, artist and musical features. To initiialize the connection the credentials must be set in the .env file via the variables:
- SPOTIFY_CLIENT_ID
- SPOTIFY_CLIENT_SECRET

In [30]:
# Spotify API Integration
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

CLIENT_ID = os.environ['SPOTIFY_CLIENT_ID']
CLIENT_SECRET = os.environ['SPOTIFY_CLIENT_SECRET']

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET), retries=5, backoff_factor=0.5)

## Song Data preprocessing

In [31]:
song_df = df[['master_metadata_track_name', 'spotify_track_uri']]
song_df = song_df.dropna()
song_df = song_df.drop_duplicates()
song_df = song_df.rename(columns={
    'master_metadata_track_name': 'name',
    'spotify_track_uri': 'spotify_id',
})
song_df = song_df.reset_index(drop=True)

### Song data enrichment

In [32]:
from tqdm.auto import tqdm

chunk_size = 50

for n in tqdm(range(0, len(song_df), chunk_size)):
    # Prepare all Spotify internal Song IDs of the chunk for querrying
    song_ids = song_df.loc[n:n+chunk_size-1, 'spotify_id'].tolist()
    
    # Query Spotify API
    try:
        tracks_result = sp.tracks(song_ids)
        audio_features_result = sp.audio_features(song_ids)
    except spotipy.client.SpotifyException as e:
        print("Spotipy Exception: "+e)
        continue

    tracks = tracks_result['tracks']
    features = audio_features_result

    artist_names = [track['artists'][0]['name']
                    if track is not None else None for track in tracks]
    artist_uris = [track['artists'][0]['uri']
                   if track is not None else None for track in tracks]
    track_uris = [track['uri']
                  if track is not None else None for track in tracks]
    track_durations = [track['duration_ms']
                       if track is not None else None for track in tracks]
    track_explicit = [track['explicit']
                      if track is not None else None for track in tracks]
    track_popularities = [track['popularity']
                          if track is not None else None for track in tracks]
    features_key = [feat['key']
                    if feat is not None else None for feat in features]
    features_mode = [feat['mode']
                     if feat is not None else None for feat in features]
    features_danceability = [feat['danceability']
                             if feat is not None else None for feat in features]
    features_energy = [feat['energy']
                       if feat is not None else None for feat in features]
    features_loudness = [feat['loudness']
                         if feat is not None else None for feat in features]
    features_speechiness = [feat['speechiness']
                            if feat is not None else None for feat in features]
    features_acousticness = [feat['acousticness']
                              if feat is not None else None for feat in features]
    features_instrumentalness = [feat['instrumentalness']
                                 if feat is not None else None for feat in features]
    features_liveness = [feat['liveness']
                         if feat is not None else None for feat in features]
    features_valence = [feat['valence']
                        if feat is not None else None for feat in features]
    features_tempo = [feat['tempo']
                      if feat is not None else None for feat in features]

    song_df.loc[n:n+chunk_size-1, 'artist_name'] = artist_names
    song_df.loc[n:n+chunk_size-1, 'artist_uri'] = artist_uris
    song_df.loc[n:n+chunk_size-1, 'uri'] = track_uris
    song_df.loc[n:n+chunk_size-1, 'duration'] = track_durations
    song_df.loc[n:n+chunk_size-1, 'explicitness'] = track_explicit
    song_df.loc[n:n+chunk_size-1, 'popularity'] = track_popularities
    song_df.loc[n:n+chunk_size-1, 'key'] = features_key
    song_df.loc[n:n+chunk_size-1, 'mode'] = features_mode
    song_df.loc[n:n+chunk_size-1, 'danceability'] = features_danceability
    song_df.loc[n:n+chunk_size-1, 'energy'] = features_energy
    song_df.loc[n:n+chunk_size-1, 'loudness'] = features_loudness
    song_df.loc[n:n+chunk_size-1, 'speechiness'] = features_speechiness
    song_df.loc[n:n+chunk_size-1, 'acousticness'] = features_acousticness
    song_df.loc[n:n+chunk_size-1, 'instrumentalness'] = features_instrumentalness
    song_df.loc[n:n+chunk_size-1, 'liveness'] = features_liveness
    song_df.loc[n:n+chunk_size-1, 'valence'] = features_valence
    song_df.loc[n:n+chunk_size-1, 'tempo'] = features_tempo

song_df


  0%|          | 0/918 [00:00<?, ?it/s]

Unnamed: 0,name,spotify_id,artist_name,artist_uri,uri,duration,explicitness,popularity,key,mode,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Like I Love You - Video Edit,spotify:track:0RGmR9MxM5vhEgNlFhBEDo,R.I.O.,spotify:artist:0Ol3Jol2T3lZZVLNNzWPhj,spotify:track:0RGmR9MxM5vhEgNlFhBEDo,203000.0,False,0.0,9.0,1.0,0.702,0.648,-8.059,0.0291,0.0228,0.000191,0.193,0.742,130.012
1,Rather Be (feat. Jess Glynne),spotify:track:0TVV2gFROJaB3kIZyCUvIY,Clean Bandit,spotify:artist:6MDME20pz9RveH9rEXvrOM,spotify:track:0TVV2gFROJaB3kIZyCUvIY,227833.0,False,61.0,11.0,1.0,0.799,0.586,-6.735,0.0377,0.162,0.000002,0.193,0.549,120.97
2,Feel Your Love Tonight - 2015 Remaster,spotify:track:2fZTVlNbTrOAt05ohKJ3ym,Van Halen,spotify:artist:2cnMpRsOVqtPMfq7YiFE6K,spotify:track:2fZTVlNbTrOAt05ohKJ3ym,220960.0,False,49.0,8.0,1.0,0.46,0.862,-5.225,0.0594,0.0661,0.0,0.188,0.724,135.464
3,CRUDELIA - I nervi,spotify:track:5dsz7MTrNdN9aMTrnp7sOG,Marracash,spotify:artist:5AZuEF0feCXMkUCwQiQlW7,spotify:track:5dsz7MTrNdN9aMTrnp7sOG,230922.0,True,64.0,7.0,1.0,0.547,0.554,-7.563,0.102,0.149,0.0,0.344,0.0702,82.178
4,Mantra,spotify:track:0MOY9Mg3kcHj7vwKquG8WD,Mädness,spotify:artist:7r29hnM0AiX10xqC7WKXh0,spotify:track:0MOY9Mg3kcHj7vwKquG8WD,160617.0,True,0.0,5.0,1.0,0.52,0.32,-14.123,0.0583,0.862,0.0556,0.0939,0.368,73.381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45856,Lean Back,spotify:track:4K0ifpgptOfb3eHHiiwjyT,Terror Squad,spotify:artist:2BvzbqWWwLN11XGBYgDZzx,spotify:track:4K0ifpgptOfb3eHHiiwjyT,247426,True,36,1,1,0.799,0.916,-3.344,0.471,0.11,0,0.0746,0.697,126.687
45857,Acker jeden Tag,spotify:track:4lgP6VX9RwEkV2UG6Aeway,Ufo361,spotify:artist:5pVRwX5ZQR7hfJ18w8ZYkl,spotify:track:4lgP6VX9RwEkV2UG6Aeway,257233,True,43,11,0,0.88,0.724,-4.047,0.0758,0.218,0.00099,0.301,0.444,100.982
45858,OFF,spotify:track:7eg9RsETLgMaYcobok7y8B,Chakuza,spotify:artist:7sGj46FhEm7c19hArcGtlY,spotify:track:7eg9RsETLgMaYcobok7y8B,225360,False,9,1,0,0.768,0.494,-8.777,0.153,0.252,0.000001,0.115,0.197,82.985
45859,Die To Live,spotify:track:7u9r4j7KSMW5yLsgjLRZGZ,Volbeat,spotify:artist:0L5fC7Ogm2YwgqVCRcF1bT,spotify:track:7u9r4j7KSMW5yLsgjLRZGZ,182466,False,57,9,1,0.523,0.975,-2.925,0.119,0.0186,0.449,0.353,0.356,92.994


In [33]:
emotions = [
    {"label": "angry", "x": -0.42, "y": 0.79},
    {"label": "hateful", "x": -0.58, "y": 0.85},
    {"label": "defient", "x": -0.61, "y": 0.72},
    {"label": "contemptuous", "x": -0.57, "y": 0.66},
    {"label": "afraid", "x": -0.11, "y": 0.79},
    {"label": "enraged", "x": -0.17, "y": 0.72},
    {"label": "annoyed", "x": -0.44, "y": 0.67},
    {"label": "alarmed", "x": -0.08, "y": 0.89},
    {"label": "tense", "x": -0.02, "y": 0.86},
    {"label": "bellicose", "x": -0.11, "y": 0.96},
    {"label": "hostile", "x": -0.28, "y": 0.89},
    {"label": "envious", "x": -0.28, "y": 0.82},
    {"label": "distressed", "x": -0.69, "y": 0.56},
    {"label": "disgusted", "x": -0.67, "y": 0.49},
    {"label": "loathing", "x": -0.8, "y": 0.42},
    {"label": "frustrated", "x": -0.6, "y": 0.39},
    {"label": "discontented", "x": -0.68, "y": 0.32},
    {"label": "bitter", "x": -0.8, "y": 0.26},
    {"label": "startled", "x": -0.91, "y": 0.02},
    {"label": "jealous", "x": -0.08, "y": 0.56},
    {"label": "indignant", "x": -0.24, "y": 0.46},
    {"label": "impatient", "x": -0.04, "y": 0.3},
    {"label": "insulted", "x": -0.73, "y": 0.2},
    {"label": "suspicious", "x": -0.32, "y": 0.27},
    {"label": "distrustful", "x": -0.48, "y": 0.1},
    {"label": "miserable", "x": -0.92, "y": -0.13},
    {"label": "disappointed", "x": -0.8, "y": -0.03},
    {"label": "dissatisfied", "x": -0.61, "y": -0.17},
    {"label": "uncomfortable", "x": -0.68, "y": -0.37},
    {"label": "worried", "x": -0.08, "y": -0.33},
    {"label": "apathetic", "x": -0.2, "y": -0.12},
    {"label": "taken aback", "x": -0.41, "y": -0.22},
    {"label": "guilty", "x": -0.4, "y": -0.42},
    {"label": "languid", "x": -0.23, "y": -0.5},
    {"label": "sad", "x": -0.82, "y": -0.4},
    {"label": "despondent", "x": -0.57, "y": -0.42},
    {"label": "gloomy", "x": -0.88, "y": -0.46},
    {"label": "depressed", "x": -0.81, "y": -0.46},
    {"label": "desperate", "x": -0.8, "y": -0.5},
    {"label": "wavering", "x": -0.57, "y": -0.7},
    {"label": "anxious", "x": -0.72, "y": -0.8},
    {"label": "dejected", "x": -0.5, "y": -0.86},
    {"label": "ashamed", "x": -0.42, "y": -0.5},
    {"label": "embarrassed", "x": -0.31, "y": -0.59},
    {"label": "melancholic", "x": -0.04, "y": -0.67},
    {"label": "hesistant", "x": -0.31, "y": -0.72},
    {"label": "bored", "x": -0.33, "y": -0.79},
    {"label": "droopy", "x": -0.32, "y": -0.92},
    {"label": "doubtful", "x": -0.28, "y": -0.95},
    {"label": "tired", "x": -0.02, "y": -0.97},
    {"label": "astonished", "x": 0.42, "y": 0.89},
    {"label": "aroused", "x": 0.38, "y": 0.92},
    {"label": "adventurous", "x": 0.49, "y": 0.92},
    {"label": "lusting", "x": 0.22, "y": 0.85},
    {"label": "conceited", "x": 0.19, "y": 0.67},
    {"label": "ambituous", "x": 0.41, "y": 0.67},
    {"label": "feeling superior", "x": 0.32, "y": 0.56},
    {"label": "excited", "x": 0.7, "y": 0.72},
    {"label": "triumphant", "x": 0.65, "y": 0.79},
    {"label": "selfconfident", "x": 0.81, "y": 0.67},
    {"label": "courageous", "x": 0.81, "y": 0.59},
    {"label": "convinced", "x": 0.42, "y": 0.42},
    {"label": "light hearted", "x": 0.41, "y": 0.29},
    {"label": "passionate", "x": 0.32, "y": 0.13},
    {"label": "expectant", "x": 0.32, "y": 0.06},
    {"label": "delighted", "x": 0.88, "y": 0.35},
    {"label": "enthusiastic", "x": 0.5, "y": 0.32},
    {"label": "determined", "x": 0.74, "y": 0.27},
    {"label": "happy", "x": 0.9, "y": 0.16},
    {"label": "joyous", "x": 0.94, "y": 0.12},
    {"label": "interested", "x": 0.65, "y": 0.02},
    {"label": "impressed", "x": 0.38, "y": -0.07},
    {"label": "longing", "x": 0.22, "y": -0.43},
    {"label": "pleased", "x": 0.88, "y": -0.1},
    {"label": "feel well", "x": 0.91, "y": -0.07},
    {"label": "amorous", "x": 0.86, "y": -0.14},
    {"label": "glad", "x": 0.94, "y": -0.17},
    {"label": "hopeful", "x": 0.61, "y": -0.3},
    {"label": "solemn", "x": 0.81, "y": -0.47},
    {"label": "attentive", "x": 0.48, "y": -0.47},
    {"label": "serene", "x": 0.83, "y": -0.5},
    {"label": "content", "x": 0.81, "y": -0.56},
    {"label": "at ease", "x": 0.77, "y": -0.59},
    {"label": "friendly", "x": 0.75, "y": -0.59},
    {"label": "contemplative", "x": 0.59, "y": -0.59},
    {"label": "satisfied", "x": 0.76, "y": 0.63},
    {"label": "serious", "x": 0.23, "y": -0.67},
    {"label": "pensive", "x": 0.03, "y": -0.59},
    {"label": "polite", "x": 0.37, "y": -0.67},
    {"label": "conscientious", "x": 0.31, "y": -0.79},
    {"label": "relaxed", "x": 0.72, "y": -0.66},
    {"label": "peaceful", "x": 0.55, "y": -0.79},
    {"label": "compassionate", "x": 0.37, "y": -0.92},
    {"label": "reverent", "x": 0.22, "y": -0.96},
    {"label": "sleepy", "x": 0.02, "y": -0.97}
]

emotions_df = pd.DataFrame(emotions)
emotions_df['x'] = (emotions_df['x'] + 1) / 2
emotions_df['y'] = (emotions_df['y'] + 1) / 2

song_df = song_df.dropna(subset=['energy', 'valence'])
song_df['emotion'] = song_df[['energy', 'valence']].apply(lambda x: emotions_df.iloc[(
    emotions_df['x'] - x[0]).abs().argsort()[:1]]['label'].values[0], axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  song_df['emotion'] = song_df[['energy', 'valence']].apply(lambda x: emotions_df.iloc[(


In [34]:
song_table_df = song_df.drop(
    columns=['artist_name', 'artist_uri', 'uri'], axis=1)

song_table_df['name'] = song_table_df['name'].str.encode(
    'latin-1', 'ignore').str.decode('latin-1')

song_table_df.to_sql('songs', engine, schema='project',
                     index=True, index_label='id', if_exists='append')


InternalError: (psycopg2.errors.DependentObjectsStillExist) cannot drop table project.songs because other objects depend on it
DETAIL:  constraint song_fkey on table project.streams depends on table project.songs
HINT:  Use DROP ... CASCADE to drop the dependent objects too.

[SQL: 
DROP TABLE project.songs]
(Background on this error at: https://sqlalche.me/e/20/2j85)

## Artist Data Preprocessing

In [None]:
from ast import literal_eval

artist_df = song_df[['artist_name', 'artist_uri']]
artist_df = artist_df.drop_duplicates().dropna()
artist_df = artist_df.reset_index(drop=True)

chunk_size = 50

for n in tqdm(range(0, len(artist_df), chunk_size)):
    artists_ids = artist_df.loc[n:n+chunk_size-1, 'artist_uri'].tolist()
    try:
        artists_result = sp.artists(artists_ids)
        artists_result = artists_result['artists']
    except spotipy.client.SpotifyException as e:
        print("Spotipy Exception: "+e)
        continue

    popularities = [artist['popularity']
                    if artist is not None else None for artist in artists_result]
    genres = [str(artist['genres'])
                    if artist is not None else None for artist in artists_result]

    artist_df.loc[n:n+chunk_size-1, 'popularity'] = popularities
    artist_df.loc[n:n+chunk_size-1, 'genres'] = genres

artist_df['genres'] = artist_df['genres'].apply(literal_eval)

  0%|          | 0/285 [00:00<?, ?it/s]

In [36]:
artist_table_df = artist_df[['artist_name', 'artist_uri', 'popularity']]
artist_table_df = artist_table_df.rename(columns={
    'artist_name': 'name',
    'artist_uri': 'spotify_id'
})
artist_table_df['name'] = artist_table_df['name'].str.encode(
    'latin-1', 'ignore').str.decode('latin-1')
artist_table_df.to_sql('artists', engine, schema='project',
                       index=True, index_label='id', if_exists='append')


219

## Genre Data Preprocessing

In [40]:
genre_df = artist_df['genres'].to_frame().explode('genres')
genre_df = genre_df.drop_duplicates().dropna().reset_index(drop=True)
genre_df = genre_df.rename(columns={'genres':'name'})

enao_df = pd.read_csv('enao.csv')
enao_df = enao_df[['name', 'cluster_name']]

genre_df = genre_df.join(enao_df.set_index('name'), on='name', how='left')
genre_df = genre_df.rename(columns={'cluster_name':'cluster'})

genre_df.to_sql('genres', engine, schema='project',
                index=True, index_label='id', if_exists='append')


439

In [41]:
link_df = artist_df.explode('genres')
link_df = link_df.rename(columns={'genres': 'name'})

genre_df['genre_id'] = genre_df.index

artist_genre_id_df = pd.merge(link_df, genre_df, on='name', how='left')
artist_genre_id_df

Unnamed: 0,artist_name,artist_uri,popularity,name,cluster,genre_id
0,R.I.O.,spotify:artist:0Ol3Jol2T3lZZVLNNzWPhj,62.0,dance pop,latin pop,0.0
1,R.I.O.,spotify:artist:0Ol3Jol2T3lZZVLNNzWPhj,62.0,german dance,pop rock,1.0
2,R.I.O.,spotify:artist:0Ol3Jol2T3lZZVLNNzWPhj,62.0,pop house,pop rock,2.0
3,Clean Bandit,spotify:artist:6MDME20pz9RveH9rEXvrOM,76.0,dance pop,latin pop,0.0
4,Clean Bandit,spotify:artist:6MDME20pz9RveH9rEXvrOM,76.0,edm,pop rock,3.0
...,...,...,...,...,...,...
35071,Vluestar,spotify:artist:2FcgwIGEPmPyItFPscscDO,58.0,aesthetic rap,r&b,2076.0
35072,Vluestar,spotify:artist:2FcgwIGEPmPyItFPscscDO,58.0,lo-fi chill,desi pop,456.0
35073,Vluestar,spotify:artist:2FcgwIGEPmPyItFPscscDO,58.0,sad lo-fi,desi pop,457.0
35074,Vluestar,spotify:artist:2FcgwIGEPmPyItFPscscDO,58.0,sad rap,r&b,458.0


## Stream Data preprocessing

In [42]:
stream_df = df[['ts', 'spotify_track_uri', 'shuffle',
                'skipped', 'conn_country', 'ms_played']]
stream_df = stream_df.rename(columns={'conn_country': 'country', 'ms_played': 'duration'})
stream_df = stream_df.dropna(axis=0, subset=['spotify_track_uri'])

song_id_df = song_df[['spotify_id', 'artist_uri']]
song_id_df['song_id'] = song_df.index

artist_id_df = artist_df[['artist_uri', 'genres']]
artist_id_df['artist_id'] = artist_df.index

song_id_df = song_id_df.merge(artist_id_df, on='artist_uri', how='inner')

stream_df = stream_df.merge(song_id_df, left_on='spotify_track_uri', right_on="spotify_id")
stream_df = stream_df.drop(
    columns=['spotify_track_uri', 'spotify_id', 'artist_uri'], axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  song_id_df['song_id'] = song_df.index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_id_df['artist_id'] = artist_df.index


In [43]:
stream_date_df = stream_df[['ts']]
stream_date_df['ts'] = pd.to_datetime(stream_date_df['ts'])
stream_date_df['stream_id'] = stream_df.index

stream_date_df = stream_date_df.rename(columns={'ts': 'start_time'})
stream_date_df['year'] = stream_date_df['start_time'].dt.year
stream_date_df['month'] = stream_date_df['start_time'].dt.month
stream_date_df['day'] = stream_date_df['start_time'].dt.day
stream_date_df['hour'] = stream_date_df['start_time'].dt.hour
stream_date_df['weekday'] = stream_date_df['start_time'].dt.weekday

stream_date_df = stream_date_df.drop(columns=['stream_id'], axis=1)
stream_date_df.to_sql('dates', engine,
                      schema='project', if_exists='append', index=True, index_label='id')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stream_date_df['ts'] = pd.to_datetime(stream_date_df['ts'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stream_date_df['stream_id'] = stream_df.index


636

In [44]:
stream_table_df = stream_df.drop(columns=['genres', 'ts'], axis=1)
stream_table_df['date_id'] = stream_date_df.index
stream_table_df['skipped'] = [True if skip ==
                              1 else False for skip in stream_table_df['skipped'].tolist()]
stream_table_df.to_sql('streams', engine, schema='project',
                 index=True, index_label='id', if_exists='append')


636

In [45]:
stream_genre_df = stream_df[['genres']]
stream_genre_df['stream_id'] = stream_df.index

stream_genre_df = stream_genre_df.explode('genres').dropna()
stream_genre_df = stream_genre_df.rename(columns={'genres': 'name'})
stream_genre_df = stream_genre_df.merge(genre_df, on="name", how='left').drop(columns=['name', 'cluster'], axis=1)

stream_genre_df.to_sql('streams_genres', engine,
                       schema='project', if_exists='append', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stream_genre_df['stream_id'] = stream_df.index


116

In [None]:
stream_table_df.sample()

Unnamed: 0,shuffle,skipped,country,duration,song_id,artist_id,date_id
115126,True,False,DE,9277,15319,773,115126


In [49]:
stream_table_df.groupby('song_id').agg({'duration': 'sum'}).sort_values(by='duration', ascending=False).head(10).join(song_df[['name', 'popularity']], on='song_id', how='left')


Unnamed: 0_level_0,duration,name,popularity
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
127,39559770,Lass uns scheinen,28.0
866,33649709,Checkpoint (Nie Game Over),42.0
588,29646355,You Can't Always Get What You Want,67.0
1066,27678335,Steine & Draht,25.0
103,27586877,Ypsilon,0.0
657,27108165,Vor Adams Zeiten,22.0
955,25153557,Eine Million,0.0
1333,24724859,Ramble On - 1990 Remaster,72.0
247,24157201,Alles zieht vorbei,39.0
913,23817506,Acker jeden Tag,3.0
