In [3]:
dadagp_path = '/Users/pedro/Desktop/DadaGP-v1.1/'
all_files_path = '/Users/pedro/Desktop/DadaGP-v1.1/_DadaGP_all_filenames.json'

## Get all .gp files

In [12]:
import json

# Assuming 'all_files_path' contains the path to your JSON file
with open(all_files_path, 'r') as f:
    file_paths = json.load(f)  # Load JSON data into a Python list

# Remove the ".tokens.txt" part from each path
all_gps_paths = [path.replace(".tokens.txt", "") for path in file_paths]

# Print the number of files
print(len(all_gps_paths))

26181


## Get all .gp files with => 4 tracks

In [25]:
import guitarpro as gp
import os
from tqdm import tqdm

gp_files_4plus = []

# Iterate over the list of .gp file paths
for gp_file_path in tqdm(all_gps_paths):
    # Get full path
    file_path = os.path.join(dadagp_path, gp_file_path)
    # Parse them with pyguitarpro
    try:
        # Parse the Guitar Pro file
        gp_file = gp.parse(file_path)

        # Check the number of tracks
        if len(gp_file.tracks) >= 4:
            gp_files_4plus.append(file_path)
            # print('File {} has {} tracks.'.format(file_path, len(gp_file.tracks)))

    except Exception as e:
        # Handle any parsing errors gracefully
        print('Could not parse {}: {}'.format(file_path, e))
    

100%|██████████| 26181/26181 [30:38<00:00, 14.24it/s]  


In [33]:
print('There are {} files with 4 or more tracks, which is {:.2f}% of the dataset.'.format(len(gp_files_4plus), 100*len(gp_files_4plus)/len(all_gps_paths)))

There are 13413 files with 4 or more tracks, which is 51.23% of the dataset.


In [34]:
# Save gp_files_4plus into a .json
with open("gp_paths_4plustracks.json", "w") as final:
	json.dump(gp_files_4plus, final)

## Remove extra tracks from .gp files

In [38]:
# If a track only has 2 distorted guitars, bass, drums -> save the track
# If a track has more than those instruments, check if it has 2 DGs, B, D -> save those in new file

with open('gp_paths_4plustracks.json', 'r') as f:
    gp_files_4plus = json.load(f)  # Load JSON data into a Python list

13413


In [218]:
gp_file = gp.parse(gp_files_4plus[14])
_ = gp_file
filename = gp_files_4plus[14]

# Initialize lists to store required instruments
distorted_guitars = []
bass = None
drums = None

for track in gp_file.tracks:
        print(track.name, track.channel.instrument)
        instrument_type = is_required_instrument(track)
        if instrument_type == 'distorted_guitar' and len(distorted_guitars) < 2:
            print('dg')
            distorted_guitars.append(track)
        elif instrument_type == 'bass' and bass is None:
            print('bass')
            bass = track
        elif instrument_type == 'drums' and drums is None:
            print('drums')
            drums = track

print(distorted_guitars)

if len(distorted_guitars) == 2 and bass and drums:
    required_tracks = distorted_guitars + [bass, drums]
    print(required_tracks)

    _.tracks = []
    for track in required_tracks:
        # print(track)
        _.tracks.append(track)
    gp.write(_, 'a.gp5')
    print("Created new file with 4 tracks: {}".format(output_directory + filename))

Track 1 25
Track 2 32
bass
Track 3 27
Percussion 0
drums
Percussion 0
[]


In [220]:
import os
import shutil
from tqdm import tqdm

# Directory to save filtered files
output_directory = "/Users/pedro/Desktop/DadaGP-v1.1/data-prep-acc-gen/gps_with_4/"

# Make sure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# MIDI Channel numbers for the required instruments
DISTORTED_GUITAR_CHANNELS = [29, 30]  # Distorted/Overdriven guitars
BASS_CHANNEL = [32, 33, 34, 35, 36, 37, 38, 39]  # Finger/Picked Bass
# No need to define drums channel as we'll use isPercussionTrack for this

# Function to check if a track is a distorted guitar, bass, or drums
def is_required_instrument(track):
    instrument = track.channel.instrument
    if track.isPercussionTrack:  # Check if it's a percussion (drum) track
        return 'drums'
    elif instrument in DISTORTED_GUITAR_CHANNELS:
        return 'distorted_guitar'
    elif instrument in BASS_CHANNEL:
        return 'bass'
    
    return None

for file in tqdm(gp_files_4plus):
    try:
        # Parse the Guitar Pro file
        gp_file = gp.parse(file)
        _ = gp_file
        # print(gp_file)
        filename = file.split('/')[-1]
        # print(filename)

        # Initialize lists to store required instruments
        distorted_guitars = []
        bass = None
        drums = None

        # Filter tracks based on MIDI channel and percussion check
        for track in gp_file.tracks:
            # print(track)
            instrument_type = is_required_instrument(track)
            if instrument_type == 'distorted_guitar' and len(distorted_guitars) < 2:
                distorted_guitars.append(track)
            elif instrument_type == 'bass' and bass is None:
                bass = track
            elif instrument_type == 'drums' and drums is None:
                drums = track

        # Check if we have the required tracks (2 distorted guitars, 1 bass, 1 drums)
        if len(distorted_guitars) == 2 and bass and drums:
            required_tracks = distorted_guitars + [bass, drums]
            # print(required_tracks)

            if len(gp_file.tracks) == 4:
                # If the file already has exactly 4 tracks, just copy it
                output_file = os.path.join(output_directory, filename)
                shutil.copy(file, output_file)
                # print("Copied {} to {}.".format(filename, output_file))
            else:
                # _ was a copy of the current .gp file with all tracks
                # we need to clear its tracks
                _.tracks = []
                for track in required_tracks:
                    # print(track)
                    _.tracks.append(track)
                gp.write(_, output_directory + filename)
                # print("Created new file with 4 tracks: {}".format(output_directory + filename))

    except Exception as e:
        print("Could not process {}: {}".format(gp_file, e))



  0%|          | 0/13413 [00:00<?, ?it/s]

100%|██████████| 13413/13413 [35:32<00:00,  6.29it/s] 


## Assert all files have four correct tracks

In [221]:
import os

gps_path = '/Users/pedro/Desktop/DadaGP-v1.1/data-prep-acc-gen/gps_with_4'
files_4tracks = []
for file in os.listdir(gps_path):
    files_4tracks.append(file)

print('There are {} files with 4 tracks, which is {:.2f}% of the dataset.'.format(len(files_4tracks), 100*len(files_4tracks)/len(all_gps_paths)))

There are 7725 files with 4 tracks, which is 29.51% of the dataset.


In [228]:
x = ['dg', 'b', 'dg', 'd']

print(set(x) == set(['dg', 'dg', 'b', 'd']))

True


In [236]:
import os

gps_path = '/Users/pedro/Desktop/DadaGP-v1.1/data-prep-acc-gen/gps_with_4/'

DISTORTED_GUITAR_CHANNELS = [29, 30]  # Distorted/Overdriven guitars
BASS_CHANNEL = [32, 33, 34, 35, 36, 37, 38, 39]  # Finger/Picked Bass

corrupted_list = []

for file in tqdm(os.listdir(gps_path)):
    try:
        # print(file)
        track_list = []
        gp_ = gp.parse(gps_path + file)
        assert len(gp_.tracks) == 4
        for track in gp_.tracks:
            if track.channel.instrument in DISTORTED_GUITAR_CHANNELS:
                track_list.append('dg')
            elif track.isPercussionTrack:
                track_list.append('d')
            elif track.channel.instrument in BASS_CHANNEL:
                track_list.append('b')

        assert set(track_list) == set(['dg', 'dg', 'b', 'd'])

    except Exception as e:
        print("Could not process {}: {}".format(file, e))
        corrupted_list.append(file)



  1%|          | 43/7725 [00:04<09:26, 13.55it/s]

Could not process Black Sabbath - Paranoid (5).gp3: 255 is not a valid NoteType


  1%|          | 63/7725 [00:06<13:26,  9.50it/s]

Could not process Disturbed - Conflict (2).gp3: 


  5%|▍         | 352/7725 [00:44<13:08,  9.35it/s]

Could not process All That Remains - Regret Not.gp4: 


  8%|▊         | 612/7725 [01:18<12:32,  9.46it/s]

Could not process Whitesnake - Fool For Your Loving.gp3: 255 is not a valid NoteType


  8%|▊         | 636/7725 [01:21<13:26,  8.79it/s]

Could not process Megadeth - Promises (3).gp4: 


  9%|▊         | 667/7725 [01:24<12:12,  9.64it/s]

Could not process System Of A Down - Chop Suey!.gp4: 


 10%|█         | 781/7725 [01:38<11:29, 10.07it/s]

Could not process Osbourne, Ozzy - I Don_'t Know (2).gp3: 255 is not a valid NoteType


 11%|█         | 862/7725 [01:49<13:53,  8.23it/s]

Could not process Ruiz, Jacques - Star Life.gp3: 255 is not a valid NoteType


 11%|█         | 868/7725 [01:49<12:32,  9.11it/s]

Could not process Scorpions - Rock You Like A Hurricane (2).gp3: 255 is not a valid NoteType


 13%|█▎        | 1025/7725 [02:08<14:31,  7.68it/s]

Could not process Scorpions - Rock You Like A Hurricane (3).gp3: 255 is not a valid NoteType


 15%|█▍        | 1128/7725 [02:21<08:50, 12.44it/s]

Could not process Led Zeppelin - The Wanton Song.gp3: unpack requires a bytes object of length 4


 20%|██        | 1556/7725 [03:16<10:30,  9.78it/s]

Could not process Aerosmith - Livin' on the Edge.gp3: 255 is not a valid NoteType


 21%|██        | 1584/7725 [03:19<09:19, 10.98it/s]

Could not process H_tk_znapi Csal_d_sok - Target.gp4: 


 25%|██▌       | 1954/7725 [04:06<11:42,  8.21it/s]

Could not process Hermetica - Olvidalo y volvera por mas.gp3: 255 is not a valid NoteType


 26%|██▌       | 1983/7725 [04:09<09:58,  9.60it/s]

Could not process Springsteen, Bruce - Further On (Up the Road).gp3: 255 is not a valid NoteType


 27%|██▋       | 2103/7725 [04:23<06:52, 13.63it/s]

Could not process ALMAFUERTE - TC.gp3: 255 is not a valid NoteType


 30%|███       | 2323/7725 [04:53<09:41,  9.29it/s]

Could not process Nahui Ollin - Solo Por Tu Amor.gp4: 


 32%|███▏      | 2507/7725 [05:17<14:18,  6.08it/s]

Could not process Audioslave - Cochise (2).gp3: 


 36%|███▌      | 2786/7725 [05:53<09:37,  8.55it/s]

Could not process Scorpions - Rock You Like a Hurricane.gp3: 255 is not a valid NoteType


 36%|███▌      | 2800/7725 [05:55<10:26,  7.86it/s]

Could not process Metallica - Whiplash (2).gp3: 255 is not a valid NoteType


 37%|███▋      | 2839/7725 [06:00<08:56,  9.11it/s]

Could not process Metallica - The Outlaw Torn.gp3: 255 is not a valid NoteType


 38%|███▊      | 2906/7725 [06:08<07:54, 10.15it/s]

Could not process Pixies - Velouria.gp3: 255 is not a valid NoteType


 39%|███▊      | 2991/7725 [06:19<09:29,  8.31it/s]

Could not process Guns N' Roses - Sweet Child O_'Mine.gp3: 255 is not a valid NoteType


 42%|████▏     | 3225/7725 [06:47<05:33, 13.51it/s]

Could not process Garbage - I Think I'm Paranoid.gp3: 255 is not a valid BeatStatus


 43%|████▎     | 3357/7725 [07:05<07:02, 10.34it/s]

Could not process Muse - Plug in baby.gp3: 255 is not a valid NoteType


 44%|████▍     | 3395/7725 [07:09<08:10,  8.83it/s]

Could not process Iron Maiden - Hallowed Be Thy Name.gp3: 


 45%|████▍     | 3473/7725 [07:18<09:22,  7.55it/s]

Could not process Metallica - Jump In The Fire.gp3: 255 is not a valid NoteType


 46%|████▌     | 3515/7725 [07:24<06:31, 10.77it/s]

Could not process Mor Ve Otesi - Cambaz.gp3: 255 is not a valid NoteType


 47%|████▋     | 3602/7725 [07:35<07:44,  8.88it/s]

Could not process Guns N' Roses - Knockin_' on Heaven_'s Door (3).gp3: 255 is not a valid NoteType


 47%|████▋     | 3617/7725 [07:38<09:49,  6.97it/s]

Could not process Nada Surf - Icebox.gp3: 255 is not a valid NoteType


 49%|████▉     | 3811/7725 [08:01<07:47,  8.37it/s]

Could not process Metallica - The Outlaw Torn (2).gp3: 255 is not a valid NoteType


 51%|█████     | 3930/7725 [08:16<07:30,  8.43it/s]

Could not process Blur - Coffee & TV.gp3: 255 is not a valid NoteType


 52%|█████▏    | 4022/7725 [08:27<04:36, 13.38it/s]

Could not process Pink Floyd - What do you want from me.gp3: 255 is not a valid NoteType


 54%|█████▍    | 4163/7725 [08:45<05:28, 10.84it/s]

Could not process Pixies - Cecilia Ann.gp3: 255 is not a valid SlapEffect


 57%|█████▋    | 4403/7725 [09:16<07:26,  7.44it/s]

Could not process Fredheim, Rolf - The Scarecrow.gp3: 255 is not a valid NoteType


 57%|█████▋    | 4408/7725 [09:16<05:13, 10.58it/s]

Could not process Guns N' Roses - Mr. Brownstone.gp3: negative shift count


 60%|██████    | 4635/7725 [09:46<06:57,  7.40it/s]

Could not process Funeral For A Friend - The End Of Nothing.gp4: 


 61%|██████    | 4702/7725 [09:54<06:49,  7.37it/s]

Could not process Dream Theater - Lie (3).gp3: 


 61%|██████    | 4720/7725 [09:57<06:36,  7.59it/s]

Could not process Disturbed - Meaning Of Life.gp4: 


 62%|██████▏   | 4820/7725 [10:09<04:11, 11.57it/s]

Could not process Nahui Ollin - Contigo.gp4: 


 63%|██████▎   | 4895/7725 [10:20<05:29,  8.59it/s]

Could not process Mudvayne - Not Falling (3).gp3: 


 64%|██████▍   | 4978/7725 [10:30<03:40, 12.46it/s]

Could not process Fear Factory - Body Hammer.gp3: 255 is not a valid NoteType


 69%|██████▊   | 5300/7725 [11:12<03:09, 12.77it/s]

Could not process ALMAFUERTE - En Este Viaje.gp3: 255 is not a valid NoteType


 69%|██████▉   | 5356/7725 [11:19<03:35, 10.99it/s]

Could not process Twisted Machine - Nobody Give You Face.gp3: 255 is not a valid NoteType


 74%|███████▍  | 5711/7725 [12:02<04:23,  7.63it/s]

Could not process Jym - SnakeByte.gp3: unpack requires a bytes object of length 4


 74%|███████▍  | 5730/7725 [12:05<04:20,  7.65it/s]

Could not process Foo Fighters - Learn To Fly.gp3: 32 is not a valid BeatStatus


 75%|███████▌  | 5794/7725 [12:12<04:00,  8.03it/s]

Could not process Yoon Do-hyun Band - back ha sa tang.gp3: 255 is not a valid NoteType


 77%|███████▋  | 5958/7725 [12:32<02:33, 11.52it/s]

Could not process Rhapsody - Triumph For My Magic Steel.gp3: 255 is not a valid NoteType


 77%|███████▋  | 5977/7725 [12:35<03:31,  8.25it/s]

Could not process ACDC - You Shook Me All Night Long (2).gp3: 255 is not a valid NoteType


 78%|███████▊  | 6028/7725 [12:41<02:39, 10.65it/s]

Could not process Flower - Please.gp3: 255 is not a valid NoteType


 78%|███████▊  | 6030/7725 [12:41<02:38, 10.70it/s]

Could not process Marilyn Manson - The Love Song (2).gp3: 255 is not a valid NoteType


 78%|███████▊  | 6052/7725 [12:44<02:40, 10.45it/s]

Could not process Guns N' Roses - Chinese Democracy (Live In Rio 2001).gp3: 119 is not a valid NoteType


 81%|████████▏ | 6283/7725 [13:14<03:20,  7.18it/s]

Could not process H_tk_znapi Csal_d_sok - Meztelen_l R_szegen.gp4: 


 82%|████████▏ | 6332/7725 [13:20<03:15,  7.11it/s]

Could not process Slipknot - Wait And Bleed (8).gp3: 


 85%|████████▍ | 6546/7725 [13:46<01:59,  9.83it/s]

Could not process Pantera - Fucking Hostile (2).gp3: unpack requires a bytes object of length 4


 85%|████████▌ | 6588/7725 [13:50<01:27, 12.93it/s]

Could not process No Doubt - Hella Good.gp3: 255 is not a valid NoteType


 86%|████████▋ | 6663/7725 [14:00<02:14,  7.89it/s]

Could not process Led Zeppelin - Houses of the holy.gp3: 255 is not a valid NoteType


 88%|████████▊ | 6778/7725 [14:15<01:32, 10.19it/s]

Could not process Creed - Higher (2).gp3: 


 89%|████████▊ | 6851/7725 [14:25<01:43,  8.47it/s]

Could not process System Of A Down - Chop Suey (7).gp3: 


 90%|████████▉ | 6914/7725 [14:33<01:36,  8.42it/s]

Could not process ACDC - Back In Black (4).gp3: 255 is not a valid NoteType


 91%|█████████▏| 7068/7725 [14:53<00:54, 11.96it/s]

Could not process Vai, Steve - Tender Surrender (2).gp3: 255 is not a valid NoteType


 93%|█████████▎| 7157/7725 [15:04<01:15,  7.52it/s]

Could not process Metallica - Outlaw Torn (Outro).gp3: 255 is not a valid NoteType


 95%|█████████▌| 7343/7725 [15:27<00:47,  8.07it/s]

Could not process Guns N' Roses - Bad Apples.gp3: 255 is not a valid NoteType


 95%|█████████▌| 7350/7725 [15:28<00:41,  8.93it/s]

Could not process Three Days Grace - Home (2).gp4: 


100%|██████████| 7725/7725 [16:13<00:00,  7.93it/s]


In [238]:
len(corrupted_list)

64

## Removing corrupted entries