### Join Spotify datasets

In [1]:
import pandas as pd

In [None]:

def load_spotify_data(data_dir='data'):
    """Load all Spotify CSV files into dataframes.

    Parameters
    ----------
    data_dir : str or os.PathLike, default 'data'
        Directory that contains the Spotify CSV files. The default keeps the
        original behaviour of reading from ``data/`` relative to the current
        working directory.

    Returns
    -------
    dict
        Maps the table keys 'artists', 'artist_release', 'artist_track',
        'releases' and 'tracks' to the dataframe read from the matching CSV.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when one of the CSV files is missing.
    """
    # Local import keeps the function self-contained without touching the
    # notebook's import cell.
    from pathlib import Path

    base_dir = Path(data_dir)
    filenames = {
        'artists': 'sp_artist.csv',
        'artist_release': 'sp_artist_release.csv',
        'artist_track': 'sp_artist_track.csv',
        'releases': 'sp_release.csv',
        'tracks': 'sp_track.csv'
    }

    return {key: pd.read_csv(base_dir / filename) for key, filename in filenames.items()}

def create_spotify_dataset():
    """Build one combined dataframe from the individual Spotify tables.

    The tables come from ``load_spotify_data``. Starting from the tracks
    table, three left joins are applied in order: releases on ``release_id``,
    the artist/track link table on ``track_id``, and artists on ``artist_id``.
    When a joined table brings a column name that already exists, the incoming
    column gets a suffix; the suffixed ``updated_on_*`` copies are dropped
    before returning.

    Returns
    -------
    pandas.DataFrame
        The joined dataframe.
    """
    print("Loading Spotify data...")
    tables = load_spotify_data()

    print("Starting joins...")

    # Tracks are the base table; everything else is left-joined onto it.
    print("Processing track information...")
    combined = tables['tracks']

    # Release details for every track.
    print("Adding release information...")
    combined = combined.merge(
        tables['releases'], on='release_id', how='left', suffixes=('', '_release')
    )

    # Link table first (track -> artist ids), then the artist details.
    print("Adding artist information...")
    combined = combined.merge(
        tables['artist_track'], on=['track_id'], how='left', suffixes=('', '_artist_track')
    )
    combined = combined.merge(
        tables['artists'], on='artist_id', how='left', suffixes=('', '_artist')
    )

    print("Cleaning up the dataset...")
    # Only the suffixed timestamp copies go; the unsuffixed `updated_on`
    # column does not match the prefix and is kept.
    redundant = [name for name in combined.columns if name.startswith('updated_on_')]
    return combined.drop(columns=redundant)

def generate_spotify_report(df):
    """Print a plain-text summary of the joined Spotify dataframe.

    The report lists the row and column totals, the number of distinct
    ``track_id``, ``artist_id`` and ``release_id`` values, the value counts of
    ``album_type``, descriptive statistics of ``duration_ms`` divided by 1000,
    and the null count of every column that has at least one null.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named above.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    n_rows, n_cols = df.shape

    print("\nSpotify Dataset Report:")
    print("-" * 50)
    print(f"Total number of rows: {n_rows}")
    print(f"Total number of columns: {n_cols}")

    print("\nUnique counts:")
    id_columns = (
        ("Tracks", "track_id"),
        ("Artists", "artist_id"),
        ("Releases", "release_id"),
    )
    for label, column in id_columns:
        print(f"{label}: {df[column].nunique()}")

    print("\nRelease Types Distribution:")
    release_type_counts = df['album_type'].value_counts()
    print(release_type_counts)

    print("\nTrack Duration Statistics (in seconds):")
    # Milliseconds -> seconds before summarising.
    duration_seconds = df['duration_ms'] / 1000
    print(duration_seconds.describe())

    print("\nMissing values summary:")
    null_counts = df.isna().sum()
    # Show only the columns that actually have nulls.
    print(null_counts.loc[null_counts > 0])

def main():
    """Run the integration end to end and return the joined dataframe.

    Builds the dataset with ``create_spotify_dataset``, prints the summary
    from ``generate_spotify_report``, and writes the result to
    ``spotify_complete_data.csv`` in the working directory without the index.

    Returns
    -------
    pandas.DataFrame
        The joined dataset that was written to disk.
    """
    print("Starting Spotify data integration process...")
    dataset = create_spotify_dataset()

    # Summary goes to stdout before anything is written to disk.
    generate_spotify_report(dataset)

    print("\nSaving complete Spotify dataset...")
    dataset.to_csv('spotify_complete_data.csv', index=False)
    print("Dataset saved successfully!")

    return dataset

# Run the full pipeline when this code is executed directly, and keep the
# joined dataframe available as `spotify_df`.
if __name__ == "__main__":
    spotify_df = main()

Starting Spotify data integration process...
Loading Spotify data...
Starting joins...
Processing track information...
Adding release information...
Adding artist information...
Cleaning up the dataset...

Spotify Dataset Report:
--------------------------------------------------
Total number of rows: 7716591
Total number of columns: 20

Unique counts:
Tracks: 5777707
Artists: 676911
Releases: 713563

Release Types Distribution:
album_type
compilation    3200510
single         2541820
album          1974261
Name: count, dtype: int64

Track Duration Statistics (in seconds):
count    7.716591e+06
mean     3.401985e+02
std      2.312070e+02
min      0.000000e+00
25%      2.470010e+02
50%      3.440000e+02
75%      4.135380e+02
max      3.890000e+05
Name: duration_ms, dtype: float64

Missing values summary:
track_title         29
isrc                61
preview_url      48684
release_title       21
upc                 60
release_img       6288
label_name          71
artist_name         84
d

In [4]:
def merge_spotify_audio_features(
    spotify_path='spotify_complete_data.csv',
    audio_features_path='data/audio_features.csv',
):
    """Inner-join the Spotify dataset with the audio features on ISRC.

    Parameters
    ----------
    spotify_path : str or os.PathLike, default 'spotify_complete_data.csv'
        CSV holding the joined Spotify dataset.
    audio_features_path : str or os.PathLike, default 'data/audio_features.csv'
        CSV holding the audio features; must contain an ``isrc`` column.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``isrc`` appears in both inputs. When both inputs share a
        column name other than ``isrc``, the Spotify column is kept and the
        audio-features copy is dropped.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when an input file is missing.
    KeyError
        If either input lacks an ``isrc`` column.
    """
    # Read CSV files
    spotify_df = pd.read_csv(spotify_path)
    audio_features_df = pd.read_csv(audio_features_path)

    # pandas matches null join keys against each other, so rows without an
    # ISRC on both sides would be paired up as if they were the same
    # recording. Remove them before joining.
    spotify_df = spotify_df.dropna(subset=['isrc'])
    audio_features_df = audio_features_df.dropna(subset=['isrc'])

    # Merge dataframes on ISRC; clashing right-hand columns get '_audio'
    merged_df = pd.merge(
        spotify_df,
        audio_features_df,
        on='isrc',
        how='inner',
        suffixes=('', '_audio')
    )

    # Drop the duplicate columns that came from the audio-features side
    duplicate_cols = [col for col in merged_df.columns if col.endswith('_audio')]
    return merged_df.drop(columns=duplicate_cols)

# Run the ISRC merge and keep the result as `merged_spotify`, which the
# following cells reuse.
merged_spotify = merge_spotify_audio_features()
# Plain-text sanity check: (rows, columns) followed by the first rows.
print(f"Merged dataframe shape: {merged_spotify.shape}")
print("\nFirst few rows of merged data:")
print(merged_spotify.head())

Merged dataframe shape: (7523890, 32)

First few rows of merged data:
                 track_id                                   track_title  \
0  05h7hLxcBXnM3dgUZi7YmC                                      Suckhole   
1  07E5VE2mJxaEESXkOHIq4J                                       Terrain   
2  0kg1lYyW3yAGgF7skgn2nx                        Groove with You - Live   
3  0xvdwCZkGpfA2kJ4rsIW8M  Livin' in the Life / Go for Your Guns - Live   
4  1hTE9UU9I5XThxw1wurbLb                                        L.g. 1   

   duration_ms          isrc  track_number              release_id explicit  \
0       146878  DELJ82099977            23  2MDhYMdgU5GHMj70kc8Ja3        f   
1       260000  DELJ82099961             7  2MDhYMdgU5GHMj70kc8Ja3        f   
2       255666  USSM11501101             8  72MfvP136wxG7aeTUKzyJ7        f   
3       322573  USSM11501105            12  72MfvP136wxG7aeTUKzyJ7        f   
4        92000  DELJ82099960             6  2MDhYMdgU5GHMj70kc8Ja3        f   

   d

In [5]:
# Rich (table) preview of the first rows of the merged dataframe.
merged_spotify.head()

Unnamed: 0,track_id,track_title,duration_ms,isrc,track_number,release_id,explicit,disc_number,preview_url,updated_on,...,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,05h7hLxcBXnM3dgUZi7YmC,Suckhole,146878,DELJ82099977,23,2MDhYMdgU5GHMj70kc8Ja3,f,1,https://p.scdn.co/mp3-preview/1d7e30dbe20c810a...,2023-08-22 18:08:48,...,0.585,0.771,1,0.169,-15.897,1,0.0356,106,4,0.321
1,07E5VE2mJxaEESXkOHIq4J,Terrain,260000,DELJ82099961,7,2MDhYMdgU5GHMj70kc8Ja3,f,1,https://p.scdn.co/mp3-preview/d4ec1529eed983b2...,2023-08-22 18:08:48,...,0.846,0.311,9,0.482,-13.267,1,0.116,208,4,0.0505
2,0kg1lYyW3yAGgF7skgn2nx,Groove with You - Live,255666,USSM11501101,8,72MfvP136wxG7aeTUKzyJ7,f,1,https://p.scdn.co/mp3-preview/5f365a24295b2abc...,2023-08-22 18:08:48,...,0.436,2e-06,11,0.158,-10.25,0,0.0753,90,4,0.525
3,0xvdwCZkGpfA2kJ4rsIW8M,Livin' in the Life / Go for Your Guns - Live,322573,USSM11501105,12,72MfvP136wxG7aeTUKzyJ7,f,1,https://p.scdn.co/mp3-preview/132a1a5867131286...,2023-08-22 18:08:48,...,0.822,0.00513,10,0.0909,-6.74,0,0.0564,147,4,0.944
4,1hTE9UU9I5XThxw1wurbLb,L.g. 1,92000,DELJ82099960,6,2MDhYMdgU5GHMj70kc8Ja3,f,1,https://p.scdn.co/mp3-preview/b4590d953e661c6e...,2023-08-22 18:08:48,...,1.0,0.934,6,0.11,-19.087,1,0.0,0,0,0.0


In [6]:
# Remove columns that are not carried into the saved dataset.
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.
merged_spotify = merged_spotify.drop(
    columns=["track_number", "explicit", "disc_number", "updated_on"],
    errors="ignore",
)

In [7]:
# Display the merged dataframe after the column drop (pandas truncates the
# rendering to the first and last rows).
merged_spotify

Unnamed: 0,track_id,track_title,duration_ms,isrc,release_id,preview_url,release_title,release_date,upc,popularity,...,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,05h7hLxcBXnM3dgUZi7YmC,Suckhole,146878,DELJ82099977,2MDhYMdgU5GHMj70kc8Ja3,https://p.scdn.co/mp3-preview/1d7e30dbe20c810a...,Ese Modal,2020-03-27,4.050216e+12,0,...,0.585,0.771000,1,0.1690,-15.897,1,0.0356,106,4,0.3210
1,07E5VE2mJxaEESXkOHIq4J,Terrain,260000,DELJ82099961,2MDhYMdgU5GHMj70kc8Ja3,https://p.scdn.co/mp3-preview/d4ec1529eed983b2...,Ese Modal,2020-03-27,4.050216e+12,0,...,0.846,0.311000,9,0.4820,-13.267,1,0.1160,208,4,0.0505
2,0kg1lYyW3yAGgF7skgn2nx,Groove with You - Live,255666,USSM11501101,72MfvP136wxG7aeTUKzyJ7,https://p.scdn.co/mp3-preview/5f365a24295b2abc...,Wild in Woodstock: The Isley Brothers Live at ...,1999,8.864454e+11,20,...,0.436,0.000002,11,0.1580,-10.250,0,0.0753,90,4,0.5250
3,0xvdwCZkGpfA2kJ4rsIW8M,Livin' in the Life / Go for Your Guns - Live,322573,USSM11501105,72MfvP136wxG7aeTUKzyJ7,https://p.scdn.co/mp3-preview/132a1a5867131286...,Wild in Woodstock: The Isley Brothers Live at ...,1999,8.864454e+11,20,...,0.822,0.005130,10,0.0909,-6.740,0,0.0564,147,4,0.9440
4,1hTE9UU9I5XThxw1wurbLb,L.g. 1,92000,DELJ82099960,2MDhYMdgU5GHMj70kc8Ja3,https://p.scdn.co/mp3-preview/b4590d953e661c6e...,Ese Modal,2020-03-27,4.050216e+12,0,...,1.000,0.934000,6,0.1100,-19.087,1,0.0000,0,0,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7523885,34mF9n0g9JBt5RGWp5xDRf,Tension - Fractious RAW Remix,394262,GBKQU2093246,2YJLOe3ULjqNLS0cs2JzCy,https://p.scdn.co/mp3-preview/a699563541dbc561...,Tension EP,2020-10-02,5.054285e+12,0,...,0.669,0.875000,1,0.1090,-9.699,1,0.0572,132,4,0.1840
7523886,34mF9n0g9JBt5RGWp5xDRf,Tension - Fractious RAW Remix,394262,GBKQU2093246,2YJLOe3ULjqNLS0cs2JzCy,https://p.scdn.co/mp3-preview/a699563541dbc561...,Tension EP,2020-10-02,5.054285e+12,0,...,0.669,0.875000,1,0.1090,-9.699,1,0.0572,132,4,0.1840
7523887,2wcQIBlX6WvGDiL1fOGrkG,Burner - Original Mix,624048,FR6V80669207,3bYQi64hromnkTMrJ5Py2I,https://p.scdn.co/mp3-preview/cd2e76a9e55c1b35...,Ways Ep,2011,3.661586e+12,4,...,0.741,0.852000,1,0.1300,-7.747,1,0.0493,122,4,0.1270
7523888,15ma7Dqf0sR9CVoNeh6FlC,The Force - Club Mix Version,367595,FR6V80660645,18mmsjqmLsYX2nO8kESeGH,https://p.scdn.co/mp3-preview/c53e1ddc3843056b...,The Force,2011-05-29,3.661586e+12,0,...,0.786,0.748000,0,0.0868,-8.322,1,0.1490,128,4,0.4120


In [8]:
# List the remaining column names.
merged_spotify.columns

Index(['track_id', 'track_title', 'duration_ms', 'isrc', 'release_id',
       'preview_url', 'release_title', 'release_date', 'upc', 'popularity',
       'total_tracks', 'album_type', 'release_img', 'label_name', 'artist_id',
       'artist_name', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')

In [1]:
# Persist the merged dataset. index=False matches how main() writes this same
# file and avoids an extra unnamed index column when the CSV is read back.
# NOTE(review): this overwrites 'spotify_complete_data.csv', which is also the
# default input of merge_spotify_audio_features(); consider a separate output
# name if the merge needs to be re-runnable from the original join.
merged_spotify.to_csv('spotify_complete_data.csv', index=False)
merged_spotify.head()

NameError: name 'merged_spotify' is not defined

In [11]:
# Save the first 100 rows as a small extract. index=False leaves out the
# positional index so the extract has the same columns as the dataframe.
merged_spotify.head(100).to_csv('spotify_complete_data_extract.csv', index=False)