In [2]:
import numpy as np
import pypianoroll
import os
from collections import defaultdict
import pickle

In [3]:
nfiles_to_read=200
tempo_criteria=5

In [4]:
#load and extract timing information from a pianoroll file
def extract_timing_info(file_path):
    multitrack = pypianoroll.load(file_path)
    resolution = multitrack.resolution
    # Use of mean tempo
    tempo = np.mean(multitrack.tempo)  
    return resolution, tempo

# similarity criteria (same resolution and tempo within a certain range)
def is_similar_timing(resolution1, tempo1, resolution2, tempo2, tempo_threshold=5):
    if resolution1 != resolution2:
        return False
    if abs(tempo1 - tempo2) > tempo_threshold:
        return False
    return True



In [5]:
# Directory containing the npz files
root_directory = 'lpd_5_cleansed'

# Dictionary to group files by timing settings
groups = defaultdict(list)
npz_files = []

# Walk through the directory tree
for dirpath, _, filenames in os.walk(root_directory):
    for filename in filenames:
        if filename.endswith('.npz'):
            file_path = os.path.join(dirpath, filename)
            npz_files.append(file_path)
            if len(npz_files) >= nfiles_to_read:
                break
    if len(npz_files) >= nfiles_to_read:
        break

# Process the first nfiles_to_read npz files
for file_path in npz_files:
    resolution, tempo = extract_timing_info(file_path)
    
    # Check if this file can be grouped within an existing group
    grouped = False
    for key in groups:
        if is_similar_timing(resolution, tempo, *key,tempo_criteria):
            groups[key].append(file_path)
            grouped = True
            break
    
    # If not grouped, create a new group wiht new resolution and tempo
    if not grouped:
        groups[(resolution, tempo)].append(file_path)

# Output the grouped files
for key, file_list in groups.items():
    print(f"Group with Resolution: {key[0]}, Tempo: {key[1]:.2f} BPM : {len(file_list)} files")
    # for file_path in file_list:
    #     print(f" - {file_path}")

Group with Resolution: 24, Tempo: 122.00 BPM : 42 files
Group with Resolution: 24, Tempo: 105.00 BPM : 23 files
Group with Resolution: 24, Tempo: 67.00 BPM : 16 files
Group with Resolution: 24, Tempo: 114.40 BPM : 14 files
Group with Resolution: 24, Tempo: 91.00 BPM : 14 files
Group with Resolution: 24, Tempo: 76.02 BPM : 15 files
Group with Resolution: 24, Tempo: 180.00 BPM : 3 files
Group with Resolution: 24, Tempo: 130.00 BPM : 25 files
Group with Resolution: 24, Tempo: 85.00 BPM : 5 files
Group with Resolution: 24, Tempo: 172.18 BPM : 3 files
Group with Resolution: 24, Tempo: 156.00 BPM : 3 files
Group with Resolution: 24, Tempo: 100.00 BPM : 12 files
Group with Resolution: 24, Tempo: 166.00 BPM : 1 files
Group with Resolution: 24, Tempo: 137.23 BPM : 13 files
Group with Resolution: 24, Tempo: 148.01 BPM : 1 files
Group with Resolution: 24, Tempo: 60.00 BPM : 5 files
Group with Resolution: 24, Tempo: 190.00 BPM : 2 files
Group with Resolution: 24, Tempo: 200.00 BPM : 2 files
Group 

In [6]:
# Find the key with the biggest file numbers
key_with_most_files = max(groups.items(), key=lambda x: len(x[1]))

# Extract the key and the number of files
key, file_list = key_with_most_files
num_files = len(file_list)

print(f"Group with Resolution: {key[0]}, Tempo: {key[1]:.2f} BPM has the most files: {num_files} files")

Group with Resolution: 24, Tempo: 122.00 BPM has the most files: 42 files


In [7]:
# Get the first group which includes the 71 files
# first_group_key = next(iter(groups))
first_group_files = groups[key]

# Dictionary to group tracks by name
tracks_by_name = defaultdict(list)

# Process each file in the first group
for file_path in first_group_files:
    multitrack = pypianoroll.load(file_path)
    
    # Extract and group tracks by name
    for track in multitrack.tracks:
        tracks_by_name[track.name].append(track)

# Output the grouped tracks by name
for name, track_list in tracks_by_name.items():
    print(f"Track Name: {name}")
    for track in track_list:
        print(f" - Track from file tempo array length {len(track.pianoroll)}")


Track Name: Drums
 - Track from file tempo array length 11136
 - Track from file tempo array length 12096
 - Track from file tempo array length 10872
 - Track from file tempo array length 15552
 - Track from file tempo array length 1560
 - Track from file tempo array length 11328
 - Track from file tempo array length 14784
 - Track from file tempo array length 8376
 - Track from file tempo array length 16896
 - Track from file tempo array length 0
 - Track from file tempo array length 0
 - Track from file tempo array length 7488
 - Track from file tempo array length 11160
 - Track from file tempo array length 8376
 - Track from file tempo array length 10368
 - Track from file tempo array length 11904
 - Track from file tempo array length 10848
 - Track from file tempo array length 0
 - Track from file tempo array length 9432
 - Track from file tempo array length 14040
 - Track from file tempo array length 9240
 - Track from file tempo array length 11520
 - Track from file tempo array l

In [8]:
tracks_by_name['Drums']

[StandardTrack(name='Drums', program=0, is_drum=True, pianoroll=array(shape=(11136, 128), dtype=uint8)),
 StandardTrack(name='Drums', program=0, is_drum=True, pianoroll=array(shape=(12096, 128), dtype=uint8)),
 StandardTrack(name='Drums', program=0, is_drum=True, pianoroll=array(shape=(10872, 128), dtype=uint8)),
 StandardTrack(name='Drums', program=0, is_drum=True, pianoroll=array(shape=(15552, 128), dtype=uint8)),
 StandardTrack(name='Drums', program=0, is_drum=True, pianoroll=array(shape=(1560, 128), dtype=uint8)),
 StandardTrack(name='Drums', program=0, is_drum=True, pianoroll=array(shape=(11328, 128), dtype=uint8)),
 StandardTrack(name='Drums', program=0, is_drum=True, pianoroll=array(shape=(14784, 128), dtype=uint8)),
 StandardTrack(name='Drums', program=0, is_drum=True, pianoroll=array(shape=(8376, 128), dtype=uint8)),
 StandardTrack(name='Drums', program=0, is_drum=True, pianoroll=array(shape=(16896, 128), dtype=uint8)),
 BinaryTrack(name='Drums', program=0, is_drum=True, piano

#### Important Note

##### We may also need to further group the tracks from the same instrument which has similar resoltion together

In [9]:
drums_list=list()
piano_list=list()
guitar_list=list()
bass_list=list()
strings_list=list()

for track in tracks_by_name['Drums']:
    if len(track.pianoroll)!= 0:
        drums_list.append(track)
for track in tracks_by_name['Piano']:
    if len(track.pianoroll)!= 0:
        piano_list.append(track)
for track in tracks_by_name['Guitar']:
    if len(track.pianoroll)!= 0:
        guitar_list.append(track)
for track in tracks_by_name['Bass']:
    if len(track.pianoroll)!= 0:
        bass_list.append(track)
for track in tracks_by_name['Strings']:
    if len(track.pianoroll)!= 0:
        strings_list.append(track)

In [10]:
dataset={}
dataset["description"]="A dataset of 5 instruments Drums,Piano,Guitar,Bass, and Strings, these files have Resolution: 24 &  Tempo: 122.00 BPM with difference of 5% "
dataset["Drums"]=drums_list
dataset["Piano"]=piano_list
dataset["Guitar"]=guitar_list
dataset["Bass"]=bass_list
dataset["Strings"]=strings_list


In [11]:
pickle.dump( dataset, open( "dataset.pickle", "wb" ) )