In [3]:
# Root folder of the local DadaGP v1.1 dataset (trailing slash kept on purpose).
dadagp_path = '/Users/pedro/Desktop/DadaGP-v1.1/'
# JSON index listing every file in the dataset; lives directly under the root.
all_files_path = dadagp_path + '_DadaGP_all_filenames.json'

## Get all .gp files

In [12]:
import json

# Read the JSON index of the dataset (expected to decode to a list of path strings)
with open(all_files_path, 'r') as index_file:
    file_paths = json.load(index_file)

# Each entry carries a ".tokens.txt" marker; dropping it leaves the Guitar Pro file name
all_gps_paths = [
    token_path.replace(".tokens.txt", "")
    for token_path in file_paths
]

# How many files the index lists
print(len(all_gps_paths))

26181


## Get all .gp files with >= 4 tracks

In [25]:
import guitarpro as gp
import os
from tqdm import tqdm

# Full paths of every file that parses and has at least four tracks
gp_files_4plus = []

for gp_file_path in tqdm(all_gps_paths):
    # Index entries are joined onto the dataset root to get a usable path
    file_path = os.path.join(dadagp_path, gp_file_path)
    try:
        # Parse with PyGuitarPro, then keep the file only if it has 4+ tracks
        gp_file = gp.parse(file_path)
        n_tracks = len(gp_file.tracks)
        if not n_tracks < 4:
            gp_files_4plus.append(file_path)
    except Exception as e:
        # Best effort: report the unparsable file and move on to the next one
        print('Could not parse {}: {}'.format(file_path, e))
    

100%|██████████| 26181/26181 [30:38<00:00, 14.24it/s]  


In [33]:
# Report how many files passed the track-count filter and their share of the whole index
n_files_4plus = len(gp_files_4plus)
pct_files_4plus = 100 * n_files_4plus / len(all_gps_paths)
print('There are {} files with 4 or more tracks, which is {:.2f}% of the dataset.'.format(n_files_4plus, pct_files_4plus))

There are 13413 files with 4 or more tracks, which is 51.23% of the dataset.


In [34]:
# Persist the list of 4+-track file paths so the slow parsing pass need not be repeated
with open("gp_paths_4plustracks.json", "w") as paths_out:
    json.dump(gp_files_4plus, paths_out)

## Remove extra tracks from .gp files

In [38]:
# Plan for this section:
#   - a file whose only tracks are 2 distorted guitars, bass and drums is kept as it is;
#   - a file with more instruments is kept only if it contains 2 DGs, B and D,
#     and just those four tracks are saved to a new file.

# Reload the list of candidate paths saved to JSON
with open('gp_paths_4plustracks.json', 'r') as paths_in:
    gp_files_4plus = json.load(paths_in)

13413


In [220]:
import os
import shutil
from tqdm import tqdm

# Destination folder for the filtered 4-track files
output_directory = "/Users/pedro/Desktop/DadaGP-v1.1/data-prep-acc-gen/gps_with_4/"

# Create the folder (and any missing parents); do nothing if it is already there
os.makedirs(name=output_directory, exist_ok=True)

# Values compared against `track.channel.instrument`.
# NOTE(review): these look like 0-based General MIDI program numbers rather than
# MIDI channel numbers, despite the constant names - confirm against PyGuitarPro.
DISTORTED_GUITAR_CHANNELS = [29, 30]  # Distorted/Overdriven guitars
BASS_CHANNEL = list(range(32, 40))  # 32..39 inclusive: bass instruments
# Drums need no constant: they are detected through `isPercussionTrack`


def is_required_instrument(track):
    """Classify a track as one of the instruments this notebook keeps.

    Args:
        track: object exposing `channel.instrument` and `isPercussionTrack`.

    Returns:
        'drums' for a percussion track (checked first, whatever the instrument
        value is), 'distorted_guitar' or 'bass' when the instrument value is in
        the matching list, and None for anything else.
    """
    program = track.channel.instrument
    if track.isPercussionTrack:
        return 'drums'
    if program in DISTORTED_GUITAR_CHANNELS:
        return 'distorted_guitar'
    return 'bass' if program in BASS_CHANNEL else None

# For every candidate file, keep it only if it contains 2 distorted guitars,
# a bass and a drum track, and write a version holding exactly those 4 tracks.
for file in tqdm(gp_files_4plus):
    try:
        # Parse the Guitar Pro file
        gp_file = gp.parse(file)

        # Output keeps the original base name.
        # NOTE: two inputs sharing a base name overwrite each other in the output folder.
        filename = os.path.basename(file)
        output_file = os.path.join(output_directory, filename)

        # Slots for the required instruments; the first matching tracks win
        distorted_guitars = []
        bass = None
        drums = None

        for track in gp_file.tracks:
            instrument_type = is_required_instrument(track)
            if instrument_type == 'distorted_guitar' and len(distorted_guitars) < 2:
                distorted_guitars.append(track)
            elif instrument_type == 'bass' and bass is None:
                bass = track
            elif instrument_type == 'drums' and drums is None:
                drums = track

        # Skip files that lack any of the required tracks
        if len(distorted_guitars) != 2 or bass is None or drums is None:
            continue

        if len(gp_file.tracks) == 4:
            # Already exactly the 4 required tracks: copy the file untouched
            shutil.copy(file, output_file)
        else:
            # Replace the track list of the parsed song (same object, not a copy)
            # with the required tracks, ordered guitars, bass, drums, then write it
            required_tracks = distorted_guitars + [bass, drums]
            gp_file.tracks = required_tracks
            gp.write(gp_file, output_file)

    except Exception as e:
        # Report the failing path; `gp_file` may be unset or stale when parsing failed
        print("Could not process {}: {}".format(file, e))



  0%|          | 0/13413 [00:00<?, ?it/s]

100%|██████████| 13413/13413 [35:32<00:00,  6.29it/s] 


## Assert all files have four correct tracks

In [221]:
import os

# Folder that received the filtered files
gps_path = '/Users/pedro/Desktop/DadaGP-v1.1/data-prep-acc-gen/gps_with_4'

# Every entry in that folder counts as one filtered file
files_4tracks = os.listdir(gps_path)

n_files_4tracks = len(files_4tracks)
print('There are {} files with 4 tracks, which is {:.2f}% of the dataset.'.format(n_files_4tracks, 100*n_files_4tracks/len(all_gps_paths)))

There are 7725 files with 4 tracks, which is 29.51% of the dataset.


In [228]:
# Scratch check of comparing track kinds as sets.
# A set drops the duplicate 'dg', so this compares which kinds occur, not how many of each.
x = ['dg', 'b', 'dg', 'd']

print(set(x) == {'dg', 'b', 'd'})

True


In [None]:
import os

# Folder holding the filtered files to verify
gps_path = '/Users/pedro/Desktop/DadaGP-v1.1/data-prep-acc-gen/gps_with_4/'

# Instrument values used to recognise each kind of track
DISTORTED_GUITAR_CHANNELS = [29, 30]  # Distorted/Overdriven guitars
BASS_CHANNEL = [32, 33, 34, 35, 36, 37, 38, 39]  # Bass instruments

# Expected track kinds, sorted so they can be compared as a multiset.
# A set comparison would accept e.g. dg/b/b/d because sets ignore duplicates.
EXPECTED_TRACK_KINDS = ['b', 'd', 'dg', 'dg']

# Names of files that fail to parse or do not have the expected tracks
corrupted_list = []

for file in tqdm(os.listdir(gps_path)):
    try:
        gp_ = gp.parse(os.path.join(gps_path, file))
        if len(gp_.tracks) != 4:
            raise ValueError('expected 4 tracks, found {}'.format(len(gp_.tracks)))

        track_list = []
        for track in gp_.tracks:
            # Percussion is checked first, in the same order as is_required_instrument
            if track.isPercussionTrack:
                track_list.append('d')
            elif track.channel.instrument in DISTORTED_GUITAR_CHANNELS:
                track_list.append('dg')
            elif track.channel.instrument in BASS_CHANNEL:
                track_list.append('b')
            else:
                # Record unrecognised tracks so they cannot slip through unnoticed
                track_list.append('other')

        if sorted(track_list) != EXPECTED_TRACK_KINDS:
            raise ValueError('unexpected track kinds: {}'.format(track_list))

    except Exception as e:
        # Any failure (parse error or wrong tracks) marks the file as corrupted
        print("Could not process {}: {}".format(file, e))
        corrupted_list.append(file)



In [238]:
# Number of file names collected in corrupted_list
len(corrupted_list)

64

## Removing corrupted entries

In [239]:
import os

# Folder containing the filtered files
folder_path = "/Users/pedro/Desktop/DadaGP-v1.1/data-prep-acc-gen/gps_with_4"

# Count what is on disk right now, so the figure stays correct when the cell is re-run
print('Before: {} files'.format(len(os.listdir(folder_path))))

# Delete every file flagged as corrupted, reporting each outcome
for file_name in corrupted_list:
    file_path = os.path.join(folder_path, file_name)
    if os.path.exists(file_path):
        os.remove(file_path)
        print("Removed: {}".format(file_name))
    else:
        print("File not found: {}".format(file_name))

# Re-list the same folder the files were removed from
files_4tracks_clean = os.listdir(folder_path)

print('After: {} files.'.format(len(files_4tracks_clean)))

Before: 7725 files
Removed: Black Sabbath - Paranoid (5).gp3
Removed: Disturbed - Conflict (2).gp3
Removed: All That Remains - Regret Not.gp4
Removed: Whitesnake - Fool For Your Loving.gp3
Removed: Megadeth - Promises (3).gp4
Removed: System Of A Down - Chop Suey!.gp4
Removed: Osbourne, Ozzy - I Don_'t Know (2).gp3
Removed: Ruiz, Jacques - Star Life.gp3
Removed: Scorpions - Rock You Like A Hurricane (2).gp3
Removed: Scorpions - Rock You Like A Hurricane (3).gp3
Removed: Led Zeppelin - The Wanton Song.gp3
Removed: Aerosmith - Livin' on the Edge.gp3
Removed: H_tk_znapi Csal_d_sok - Target.gp4
Removed: Hermetica - Olvidalo y volvera por mas.gp3
Removed: Springsteen, Bruce - Further On (Up the Road).gp3
Removed: ALMAFUERTE - TC.gp3
Removed: Nahui Ollin - Solo Por Tu Amor.gp4
Removed: Audioslave - Cochise (2).gp3
Removed: Scorpions - Rock You Like a Hurricane.gp3
Removed: Metallica - Whiplash (2).gp3
Removed: Metallica - The Outlaw Torn.gp3
Removed: Pixies - Velouria.gp3
Removed: Guns N' Ro