### Join the beatport dataframes

In [2]:
import pandas as pd

In [None]:
def load_beatport_data():
    """Load all Beatport CSV files into dataframes"""
    files = {
        'artists': 'data/bp_artist.csv',
        'artist_media': 'data/bp_artist_media.csv',
        'artist_release': 'data/bp_artist_release.csv',
        'artist_track': 'data/bp_artist_track.csv',
        'genres': 'data/bp_genre.csv',
        'keys': 'data/bp_key.csv',
        'label_artist': 'data/bp_label_artist.csv',
        'labels': 'data/bp_label.csv',
        'label_media': 'data/bp_label_media.csv',
        'releases': 'data/bp_release.csv',
        'release_media': 'data/bp_release_media.csv',
        'subgenres': 'data/bp_subgenre.csv',
        'tracks': 'data/bp_track.csv',
        'track_media': 'data/bp_track_media.csv'
}
    
    dfs = {}
    for key, file in files.items():
        dfs[key] = pd.read_csv(file)
    return dfs

def create_complete_dataset():
    """Create complete dataset by joining all Beatport data"""
    # Load all dataframes
    print("Loading data...")
    dfs = load_beatport_data()
    
    print("Starting joins...")
    
    # 1. Start with tracks and add track media
    print("Processing track information...")
    df = pd.merge(
        dfs['tracks'],
        dfs['track_media'],
        on='track_id',
        how='left',
        suffixes=('', '_track_media')
    )
    
    # 2. Add key information
    print("Adding key information...")
    df = pd.merge(
        df,
        dfs['keys'],
        on='key_id',
        how='left'
    )
    
    # 3. Add genre information
    print("Adding genre and subgenre information...")
    df = pd.merge(
        df,
        dfs['genres'],
        on='genre_id',
        how='left'
    )
    
    df = pd.merge(
        df,
        dfs['subgenres'],
        on=['genre_id', 'genre_url'],
        how='left',
        suffixes=('', '_subgenre')
    )
    
    # 4. Add release information
    print("Adding release information...")
    df = pd.merge(
        df,
        dfs['releases'],
        on=['release_id', 'label_id'],
        how='left',
        suffixes=('', '_release')
    )
    
    # 5. Add release media
    df = pd.merge(
        df,
        dfs['release_media'],
        on='release_id',
        how='left',
        suffixes=('', '_release_media')
    )
    
    # 6. Add label information
    print("Adding label information...")
    df = pd.merge(
        df,
        dfs['labels'],
        on='label_id',
        how='left',
        suffixes=('', '_label')
    )
    
    df = pd.merge(
        df,
        dfs['label_media'],
        on='label_id',
        how='left',
        suffixes=('', '_label_media')
    )
    
    # 7. Add artist relationships through artist_track
    print("Adding artist information...")
    df = pd.merge(
        df,
        dfs['artist_track'],
        on='track_id',
        how='left',
        suffixes=('', '_artist_track')
    )
    
    # 8. Add artist details
    df = pd.merge(
        df,
        dfs['artists'],
        on='artist_id',
        how='left',
        suffixes=('', '_artist')
    )
    
    # 9. Add artist media
    df = pd.merge(
        df,
        dfs['artist_media'],
        on='artist_id',
        how='left',
        suffixes=('', '_artist_media')
    )
    
    # Clean up columns
    print("Cleaning up the dataset...")
    # Remove duplicate updated_on columns
    columns_to_drop = [col for col in df.columns if col.startswith('updated_on_')]
    # Remove duplicate URL columns and other duplicates
    columns_to_drop.extend([col for col in df.columns if col.endswith(('_x', '_y'))])
    df = df.drop(columns=columns_to_drop)
    
    return df

def generate_dataset_report(df):
    """Generate a report about the dataset"""
    print("\nDataset Report:")
    print("-" * 50)
    print(f"Total number of rows: {len(df)}")
    print(f"Total number of columns: {len(df.columns)}")
    print("\nUnique counts:")
    print(f"Tracks: {df['track_id'].nunique()}")
    print(f"Artists: {df['artist_id'].nunique()}")
    print(f"Labels: {df['label_id'].nunique()}")
    print(f"Releases: {df['release_id'].nunique()}")
    print(f"Genres: {df['genre_id'].nunique()}")
    print(f"Subgenres: {df['subgenre_id'].nunique()}")
    print(f"Keys: {df['key_id'].nunique()}")
    
    print("\nMissing values summary:")
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    print(missing)

def main():
    # Create the complete dataset
    print("Starting data integration process...")
    final_df = create_complete_dataset()
    
    # Generate and display report
    generate_dataset_report(final_df)
    
    # Save the final dataset
    print("\nSaving complete dataset...")
    final_df.to_csv('beatport_complete_data.csv', index=False)
    print("Dataset saved successfully!")
    
    return final_df

if __name__ == "__main__":
    final_df = main()

Starting data integration process...
Loading data...


  dfs[key] = pd.read_csv(file)


Starting joins...
Processing track information...
Adding key information...
Adding genre and subgenre information...
Adding release information...
Adding label information...
Adding artist information...
Cleaning up the dataset...

Dataset Report:
--------------------------------------------------
Total number of rows: 36900320
Total number of columns: 52

Unique counts:
Tracks: 10685331
Artists: 824763
Labels: 78283
Releases: 2598492
Genres: 32
Subgenres: 72
Keys: 34

Missing values summary:
title                        157
mix                            9
subgenre_id             32031014
duration                   10062
duration_ms                10062
isrc                      914141
key_id                     20539
is_matched_spot         12760617
wave_img_id                55648
wave_img_uuid              55648
sample_start               10696
sample_end                 10696
key_letter                 20539
key_name                   20539
camelot_num                20539
camelot

In [1]:

import pandas as pd
import os

# Create output directory
output_dir = "split_by_genre"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Dictionary to track files
genre_files = {}
chunk_number = 0

# Process the file in chunks
print("Starting to process file in chunks...")
for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):
    chunk_number += 1
    print(f"Processing chunk {chunk_number}...")
    
    # Process each genre in the chunk
    for genre in chunk['genre_name'].unique():
        # Create safe filename
        safe_genre = "".join(c if c.isalnum() or c in (' ', '-') else '_' for c in genre).rstrip()
        filename = os.path.join(output_dir, f"beatport_{safe_genre}.csv")
        
        # Get rows for this genre
        genre_chunk = chunk[chunk['genre_name'] == genre]
        
        # Append or create new file
        if genre in genre_files:
            genre_chunk.to_csv(filename, mode='a', header=False, index=False)
        else:
            genre_chunk.to_csv(filename, mode='w', header=True, index=False)
            genre_files[genre] = filename
            print(f"Created new file for genre: {genre}")
    
    # Clear memory
    del chunk

print("\nFinished processing!")
print(f"Split into {len(genre_files)} files:")
for genre, filepath in genre_files.items():
    file_size = os.path.getsize(filepath) / (1024 * 1024)  # Convert to MB
    print(f"- {genre}: {file_size:.2f} MB")


Starting to process file in chunks...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 1...
Created new file for genre: Trance (Raw / Deep / Hypnotic)
Created new file for genre: Techno (Raw / Deep / Hypnotic)
Created new file for genre: Tech House
Created new file for genre: Amapiano
Created new file for genre: Deep House
Created new file for genre: Electro (Classic / Detroit / Modern)
Created new file for genre: 140 / Deep Dubstep / Grime
Created new file for genre: Bass / Club
Created new file for genre: Jackin House
Created new file for genre: UK Garage / Bassline


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 2...
Created new file for genre: Bass House
Created new file for genre: Afro House
Created new file for genre: Hard Techno


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 3...
Created new file for genre: Funky House
Created new file for genre: Organic House / Downtempo
Created new file for genre: Techno (Peak Time / Driving)


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 4...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 5...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 6...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 7...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 8...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 9...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 10...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 11...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 12...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 13...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 14...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 15...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 16...
Created new file for genre: Dubstep


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 17...
Created new file for genre: Psy-Trance
Processing chunk 18...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 19...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 20...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 21...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 22...
Created new file for genre: Nu Disco / Disco


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 23...
Created new file for genre: Hard Dance / Hardcore
Processing chunk 24...
Processing chunk 25...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 26...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 27...
Processing chunk 28...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 29...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 30...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 31...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 32...
Created new file for genre: Indie Dance
Created new file for genre: House


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 33...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 34...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 35...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 36...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 37...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 38...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 39...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 40...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 41...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 42...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 43...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 44...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 45...
Created new file for genre: Melodic House & Techno
Processing chunk 46...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 47...
Created new file for genre: Breaks / Breakbeat / UK Bass


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 48...
Created new file for genre: Drum & Bass


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 49...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 50...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 51...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 52...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 53...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 54...
Processing chunk 55...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 56...
Created new file for genre: Progressive House


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 57...
Created new file for genre: Trap / Wave
Created new file for genre: Mainstage


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 58...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 59...
Created new file for genre: Electronica


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 60...
Processing chunk 61...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 62...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 63...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 64...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 65...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 66...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 67...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 68...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 69...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 70...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 71...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 72...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 73...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 74...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 75...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 76...
Processing chunk 77...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 78...
Processing chunk 79...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 80...
Processing chunk 81...
Processing chunk 82...
Processing chunk 83...
Processing chunk 84...
Processing chunk 85...
Processing chunk 86...
Processing chunk 87...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 88...
Processing chunk 89...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 90...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 91...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 92...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 93...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 94...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 95...
Created new file for genre: Trance (Main Floor)
Processing chunk 96...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 97...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 98...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 99...
Processing chunk 100...
Processing chunk 101...
Processing chunk 102...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 103...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 104...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 105...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 106...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 107...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 108...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 109...
Created new file for genre: Minimal / Deep Tech


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 110...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 111...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 112...
Created new file for genre: Dance / Electro Pop
Processing chunk 113...
Processing chunk 114...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 115...
Processing chunk 116...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 117...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 118...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 119...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 120...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 121...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 122...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 123...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 124...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 125...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 126...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 127...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 128...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 129...
Processing chunk 130...
Processing chunk 131...
Processing chunk 132...
Processing chunk 133...
Processing chunk 134...
Processing chunk 135...
Processing chunk 136...
Processing chunk 137...
Processing chunk 138...
Processing chunk 139...
Processing chunk 140...
Processing chunk 141...
Processing chunk 142...
Processing chunk 143...
Processing chunk 144...
Processing chunk 145...
Processing chunk 146...
Processing chunk 147...
Processing chunk 148...
Processing chunk 149...
Processing chunk 150...
Processing chunk 151...
Processing chunk 152...
Processing chunk 153...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 154...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 155...
Processing chunk 156...
Processing chunk 157...
Processing chunk 158...
Processing chunk 159...
Processing chunk 160...
Processing chunk 161...
Processing chunk 162...
Processing chunk 163...
Processing chunk 164...
Processing chunk 165...
Processing chunk 166...
Processing chunk 167...
Processing chunk 168...
Processing chunk 169...
Processing chunk 170...
Processing chunk 171...
Processing chunk 172...
Processing chunk 173...
Processing chunk 174...
Processing chunk 175...
Processing chunk 176...
Processing chunk 177...
Processing chunk 178...
Processing chunk 179...
Processing chunk 180...
Processing chunk 181...
Processing chunk 182...
Processing chunk 183...
Processing chunk 184...
Processing chunk 185...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 186...
Processing chunk 187...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 188...
Processing chunk 189...
Processing chunk 190...
Processing chunk 191...
Processing chunk 192...
Processing chunk 193...
Processing chunk 194...
Processing chunk 195...
Processing chunk 196...
Processing chunk 197...
Processing chunk 198...
Processing chunk 199...
Processing chunk 200...
Processing chunk 201...
Processing chunk 202...
Processing chunk 203...
Processing chunk 204...
Processing chunk 205...
Processing chunk 206...
Processing chunk 207...
Processing chunk 208...
Processing chunk 209...
Processing chunk 210...
Processing chunk 211...
Processing chunk 212...
Processing chunk 213...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 214...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 215...
Processing chunk 216...
Processing chunk 217...
Processing chunk 218...
Processing chunk 219...
Processing chunk 220...
Processing chunk 221...
Processing chunk 222...
Processing chunk 223...
Processing chunk 224...
Processing chunk 225...
Processing chunk 226...
Processing chunk 227...
Processing chunk 228...
Processing chunk 229...
Processing chunk 230...
Processing chunk 231...
Processing chunk 232...
Processing chunk 233...
Processing chunk 234...
Processing chunk 235...
Processing chunk 236...
Processing chunk 237...
Processing chunk 238...
Processing chunk 239...
Processing chunk 240...
Processing chunk 241...
Processing chunk 242...
Processing chunk 243...
Processing chunk 244...
Processing chunk 245...
Processing chunk 246...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 247...
Processing chunk 248...
Processing chunk 249...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 250...
Processing chunk 251...
Processing chunk 252...
Processing chunk 253...
Processing chunk 254...
Processing chunk 255...
Processing chunk 256...
Processing chunk 257...
Processing chunk 258...
Processing chunk 259...
Processing chunk 260...
Processing chunk 261...
Processing chunk 262...
Processing chunk 263...
Processing chunk 264...
Processing chunk 265...
Processing chunk 266...
Processing chunk 267...
Processing chunk 268...
Processing chunk 269...
Processing chunk 270...
Processing chunk 271...
Processing chunk 272...
Processing chunk 273...
Processing chunk 274...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 275...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 276...
Processing chunk 277...
Processing chunk 278...
Processing chunk 279...
Processing chunk 280...
Processing chunk 281...
Processing chunk 282...
Processing chunk 283...
Processing chunk 284...
Processing chunk 285...
Processing chunk 286...
Processing chunk 287...
Processing chunk 288...
Processing chunk 289...
Processing chunk 290...
Processing chunk 291...
Processing chunk 292...
Processing chunk 293...
Processing chunk 294...
Processing chunk 295...
Processing chunk 296...
Processing chunk 297...
Processing chunk 298...
Processing chunk 299...
Processing chunk 300...
Processing chunk 301...
Processing chunk 302...
Processing chunk 303...
Processing chunk 304...
Processing chunk 305...
Processing chunk 306...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 307...
Processing chunk 308...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 309...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 310...
Processing chunk 311...
Processing chunk 312...
Processing chunk 313...
Processing chunk 314...
Processing chunk 315...
Processing chunk 316...
Processing chunk 317...
Processing chunk 318...
Processing chunk 319...
Processing chunk 320...
Processing chunk 321...
Processing chunk 322...
Processing chunk 323...
Processing chunk 324...
Processing chunk 325...
Processing chunk 326...
Processing chunk 327...
Processing chunk 328...
Processing chunk 329...
Processing chunk 330...
Processing chunk 331...
Processing chunk 332...
Processing chunk 333...
Processing chunk 334...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 335...


  for chunk in pd.read_csv("beatport_complete_data.csv", chunksize=100000):


Processing chunk 336...
Processing chunk 337...
Processing chunk 338...
Processing chunk 339...
Processing chunk 340...
Processing chunk 341...
Processing chunk 342...
Processing chunk 343...
Processing chunk 344...
Processing chunk 345...
Processing chunk 346...
Processing chunk 347...
Processing chunk 348...
Processing chunk 349...
Processing chunk 350...
Processing chunk 351...
Processing chunk 352...
Processing chunk 353...
Processing chunk 354...
Processing chunk 355...
Processing chunk 356...
Processing chunk 357...
Processing chunk 358...
Processing chunk 359...
Processing chunk 360...
Processing chunk 361...
Processing chunk 362...
Processing chunk 363...
Processing chunk 364...
Processing chunk 365...
Processing chunk 366...
Processing chunk 367...
Processing chunk 368...
Processing chunk 369...
Processing chunk 370...

Finished processing!
Split into 32 files:
- Trance (Raw / Deep / Hypnotic): 5.27 MB
- Techno (Raw / Deep / Hypnotic): 524.03 MB
- Tech House: 927.32 MB
- Amapi

In [3]:
techno_raw_deep_hypnotic_df = pd.read_csv("split_by_genre/beatport_techno_raw_deep_hypnotic.csv")

  techno_raw_deep_hypnotic_df = pd.read_csv("split_by_genre/beatport_techno_raw_deep_hypnotic.csv")


In [7]:
techno_raw_deep_hypnotic_df.head(5)

Unnamed: 0,track_id,title,mix,is_remixed,release_date,genre_id,subgenre_id,track_url,bpm,duration,...,label_name,label_url,label_img_id,label_img_uuid,artist_id,is_remixer,artist_name,artist_url,artist_img_id,artist_img_uuid
0,387142,Avion,Original Mix,f,1994-06-22,92,,beatport.com/track/avion/387142,130,5:52,...,Synewave,beatport.com/label/synewave/2587,3231998,0d3916f7-e37e-4514-a952-a0187452096f,8004,f,Damon Wild,beatport.com/artist/damon-wild/8004,245396,48f7f2f3-cc67-450f-9e47-1c0269dac7ae
1,387142,Avion,Original Mix,f,1994-06-22,92,,beatport.com/track/avion/387142,130,5:52,...,Synewave,beatport.com/label/synewave/2587,3231998,0d3916f7-e37e-4514-a952-a0187452096f,8004,f,Damon Wild,beatport.com/artist/damon-wild/8004,245396,48f7f2f3-cc67-450f-9e47-1c0269dac7ae
2,387142,Avion,Original Mix,f,1994-06-22,92,,beatport.com/track/avion/387142,130,5:52,...,Synewave,beatport.com/label/synewave/2587,3231998,0d3916f7-e37e-4514-a952-a0187452096f,8004,f,Damon Wild,beatport.com/artist/damon-wild/8004,245396,48f7f2f3-cc67-450f-9e47-1c0269dac7ae
3,387142,Avion,Original Mix,f,1994-06-22,92,,beatport.com/track/avion/387142,130,5:52,...,Synewave,beatport.com/label/synewave/2587,3231998,0d3916f7-e37e-4514-a952-a0187452096f,8004,f,Damon Wild,beatport.com/artist/damon-wild/8004,245396,48f7f2f3-cc67-450f-9e47-1c0269dac7ae
4,387142,Avion,Original Mix,f,1994-06-22,92,,beatport.com/track/avion/387142,130,5:52,...,Synewave,beatport.com/label/synewave/2587,3231998,0d3916f7-e37e-4514-a952-a0187452096f,8004,f,Damon Wild,beatport.com/artist/damon-wild/8004,245396,48f7f2f3-cc67-450f-9e47-1c0269dac7ae


In [4]:
techno_raw_deep_hypnotic_df.columns

Index(['track_id', 'title', 'mix', 'is_remixed', 'release_date', 'genre_id',
       'subgenre_id', 'track_url', 'bpm', 'duration', 'duration_ms', 'isrc',
       'key_id', 'label_id', 'release_id', 'is_matched_spot', 'wave_img_id',
       'wave_img_uuid', 'sample_uuid', 'sample_start', 'sample_end',
       'key_letter', 'key_name', 'camelot_num', 'camelot_letter', 'is_sharp',
       'is_flat', 'chord_id', 'chord_name', 'genre_name', 'song_count',
       'genre_url', 'subgenre_id_subgenre', 'subgenre_name',
       'song_count_subgenre', 'subgenre_url', 'updated_on', 'release_title',
       'release_date_release', 'release_url', 'release_img_id',
       'release_img_uuid', 'label_name', 'label_url', 'label_img_id',
       'label_img_uuid', 'artist_id', 'is_remixer', 'artist_name',
       'artist_url', 'artist_img_id', 'artist_img_uuid'],
      dtype='object')

In [9]:
techno_raw_deep_hypnotic_df["is_matched_spot"].dtypes


dtype('O')

In [10]:
def remove_sub_genre(filename):
    df = pd.read_csv(filename)
    df = df.drop(columns=['subgenre_id', 'subgenre_id_subgenre', 'subgenre_name', 'song_count_subgenre', 'subgenre_url', 'updated_on', "is_matched_spot"])
    df = df.drop_duplicates(subset=['track_id'])
    df.to_csv(filename, '_no_subgenre', index=False)


In [11]:
import pandas as pd
import os

def remove_sub_genre(filename):
    df = pd.read_csv(filename)
    df = df.drop(columns=['subgenre_id', 'subgenre_id_subgenre', 'subgenre_name', 'song_count_subgenre', 'subgenre_url', 'updated_on', "is_matched_spot"])
    df = df.drop_duplicates(subset=['track_id'])
    # Fix the output filename creation - using os.path.join and string formatting
    output_filename = os.path.join(os.path.dirname(filename), f"{os.path.splitext(os.path.basename(filename))[0]}_no_subgenre.csv")
    df.to_csv(output_filename, index=False)
    print(f"Processed {filename} -> {output_filename}")

# Path to your folder
folder_path = "split_by_genre"

# Iterate through all CSV files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        full_path = os.path.join(folder_path, filename)
        remove_sub_genre(full_path)

  df = pd.read_csv(filename)


Processed split_by_genre\beatport_afro_house.csv -> split_by_genre\beatport_afro_house_no_subgenre.csv
Processed split_by_genre\beatport_amapiano.csv -> split_by_genre\beatport_amapiano_no_subgenre.csv
Processed split_by_genre\beatport_Bass House.csv -> split_by_genre\beatport_Bass House_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_bass_club.csv -> split_by_genre\beatport_bass_club_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_breaks__breakbeat_UKBass.csv -> split_by_genre\beatport_breaks__breakbeat_UKBass_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_dance_electro_pop.csv -> split_by_genre\beatport_dance_electro_pop_no_subgenre.csv
Processed split_by_genre\beatport_deep_dubstep_grime.csv -> split_by_genre\beatport_deep_dubstep_grime_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_deep_house.csv -> split_by_genre\beatport_deep_house_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_drum_bass.csv -> split_by_genre\beatport_drum_bass_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_dubstep.csv -> split_by_genre\beatport_dubstep_no_subgenre.csv
Processed split_by_genre\beatport_electro _classic_detroit .csv -> split_by_genre\beatport_electro _classic_detroit _no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_electronica.csv -> split_by_genre\beatport_electronica_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_funky_house.csv -> split_by_genre\beatport_funky_house_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_hard_dance_hardcore.csv -> split_by_genre\beatport_hard_dance_hardcore_no_subgenre.csv
Processed split_by_genre\beatport_hard_techno.csv -> split_by_genre\beatport_hard_techno_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_house.csv -> split_by_genre\beatport_house_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_indie_dance.csv -> split_by_genre\beatport_indie_dance_no_subgenre.csv
Processed split_by_genre\beatport_jackin_house.csv -> split_by_genre\beatport_jackin_house_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_mainstage.csv -> split_by_genre\beatport_mainstage_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_melodic_house_techno.csv -> split_by_genre\beatport_melodic_house_techno_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_minimal_deep_tech.csv -> split_by_genre\beatport_minimal_deep_tech_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_nu_disco.csv -> split_by_genre\beatport_nu_disco_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_organic_house_downtempo.csv -> split_by_genre\beatport_organic_house_downtempo_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_progressive_house.csv -> split_by_genre\beatport_progressive_house_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_psy_trance.csv -> split_by_genre\beatport_psy_trance_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_techno_peak_time.csv -> split_by_genre\beatport_techno_peak_time_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_techno_raw_deep_hypnotic.csv -> split_by_genre\beatport_techno_raw_deep_hypnotic_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_tech_house.csv -> split_by_genre\beatport_tech_house_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_trance.csv -> split_by_genre\beatport_trance_no_subgenre.csv
Processed split_by_genre\beatport_trance_raw_deep_hypnotic_.csv -> split_by_genre\beatport_trance_raw_deep_hypnotic__no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_trap_wave.csv -> split_by_genre\beatport_trap_wave_no_subgenre.csv


  df = pd.read_csv(filename)


Processed split_by_genre\beatport_uk_garage_bassline.csv -> split_by_genre\beatport_uk_garage_bassline_no_subgenre.csv


In [15]:
techno_raw_deep_hypnotic_no_subgenre_df = pd.read_csv('split_by_genre_no_subgenre/beatport_techno_raw_deep_hypnotic_no_subgenre.csv')
techno_raw_deep_hypnotic_no_subgenre_df

Unnamed: 0,track_id,title,mix,is_remixed,release_date,genre_id,track_url,bpm,duration,duration_ms,...,label_name,label_url,label_img_id,label_img_uuid,artist_id,is_remixer,artist_name,artist_url,artist_img_id,artist_img_uuid
0,387142,Avion,Original Mix,f,1994-06-22,92,beatport.com/track/avion/387142,130,5:52,352466.0,...,Synewave,beatport.com/label/synewave/2587,3231998,0d3916f7-e37e-4514-a952-a0187452096f,8004,f,Damon Wild,beatport.com/artist/damon-wild/8004,245396,48f7f2f3-cc67-450f-9e47-1c0269dac7ae
1,473092,Rotary,Regis Mix,t,1994-06-22,92,beatport.com/track/rotary/473092,131,4:15,255012.0,...,Synewave,beatport.com/label/synewave/2587,3231998,0d3916f7-e37e-4514-a952-a0187452096f,8004,f,Damon Wild,beatport.com/artist/damon-wild/8004,245396,48f7f2f3-cc67-450f-9e47-1c0269dac7ae
2,3402856,001#5,Original Mix,f,2012-03-30,92,beatport.com/track/0015/3402856,116,7:44,464318.0,...,Rawax,beatport.com/label/rawax/22353,3942313,8c92d23c-54f7-4d27-a380-c8e9d73ed9d2,256549,f,Vualitron,beatport.com/artist/vualitron/256549,5539565,0dc61986-bccf-49d4-8fad-6b147ea8f327
3,3402852,001#1,Original Mix,f,2012-03-30,92,beatport.com/track/0011/3402852,116,6:25,385100.0,...,Rawax,beatport.com/label/rawax/22353,3942313,8c92d23c-54f7-4d27-a380-c8e9d73ed9d2,256549,f,Vualitron,beatport.com/artist/vualitron/256549,5539565,0dc61986-bccf-49d4-8fad-6b147ea8f327
4,3332498,Czt,Original Mix,f,2012-03-02,92,beatport.com/track/czt/3332498,125,11:15,675840.0,...,Silent Steps,beatport.com/label/silent-steps/18855,10060408,5dcc3ced-d42f-4a3a-9a35-5e8479ee664a,132943,f,The Noisemaker,beatport.com/artist/the-noisemaker/132943,6484800,7d660daa-a9aa-4ede-b92f-84bb1c18279c
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125386,649022,Bienfait 2,Original Mix,f,2000-05-24,92,beatport.com/track/bienfait-2/649022,71,5:31,331386.0,...,Kompakt,beatport.com/label/kompakt/4530,11537448,2a4c2937-e97c-40a7-8be8-605c7a9b5169,4984,f,Jonas Bering,beatport.com/artist/jonas-bering/4984,12330837,7cb378b7-c612-48aa-82a2-15a86549ebd1
125387,5635314,Prowler,Original Mix,f,2001-12-17,92,beatport.com/track/prowler/5635314,67,7:40,460846.0,...,Counterbalance,beatport.com/label/counterbalance/119,324,af7e018a-3213-44c7-801c-8597077cc75b,4036,f,Surgeon,beatport.com/artist/surgeon/4036,42975,0c86b134-ccc0-4268-b7f9-e4fadd9cc00a
125388,5635344,"La Real, Pt. 1",Original Mix,f,2000-03-13,92,beatport.com/track/la-real-pt-1/5635344,134,6:12,372700.0,...,Counterbalance,beatport.com/label/counterbalance/119,324,af7e018a-3213-44c7-801c-8597077cc75b,4036,f,Surgeon,beatport.com/artist/surgeon/4036,42975,0c86b134-ccc0-4268-b7f9-e4fadd9cc00a
125389,17248613,Blow That Shit Out,Original Mix,f,1995-02-02,92,beatport.com/track/blow-that-shit-out/17248613,135,3:40,220800.0,...,Traxmen Records,beatport.com/label/traxmen-records/41074,9469993,2eb45127-56ea-4d36-a077-2f9eff0dcd13,6374,f,Robert Armani,beatport.com/artist/robert-armani/6374,31603800,99dde20c-2ece-4d40-a857-b286d0102b1a


In [16]:
df_complete_no_subgenre = pd.read_csv("beatport_complete_data.csv")
df_complete_no_subgenre = df_complete_no_subgenre.drop(columns=['subgenre_id', 'subgenre_id_subgenre', 'subgenre_name', 'song_count_subgenre', 'subgenre_url', 'updated_on', "is_matched_spot"])
df_complete_no_subgenre = df_complete_no_subgenre.drop_duplicates(subset=['track_id'])
df_complete_no_subgenre.to_csv("beatport_complete_data_no_subgenre.csv", index=False)

  df_complete_no_subgenre = pd.read_csv("beatport_complete_data.csv")


In [18]:
df_complete_no_subgenre.head(50).to_csv('beatport_complete_no_subgenre_sample.csv')

In [19]:
df_complete_spotify = pd.read_csv("complete/spotify_complete_data.csv")
df_complete_spotify.head(50).to_csv('complete/spotify_complete_sample.csv')
