# **dataset annotation**

In [None]:
# Install the JAMS library; `jams.load` is used below to read the .jams annotation files.
!pip install jams
# Pin NumPy to 1.23.5.
# NOTE(review): presumably required for compatibility with jams' dependencies — confirm.
!pip install numpy==1.23.5
# Upgrade jams to the newest release after the NumPy pin is in place.
!pip install --upgrade jams

annotation_firstdraft.csv

In [None]:
import os
import jams
import pandas as pd

In [None]:
def midi_to_fret(midi_note):
    """Map a MIDI pitch to a (string, fret) position in standard guitar tuning.

    Strings are tried in order from string 1 (open pitch 64) to string 6
    (open pitch 40); the first string on which the pitch lands within
    frets 0-24 is returned.

    Args:
        midi_note: MIDI pitch as an int or float. Fractional pitches are
            rounded to the nearest semitone so the fret is always a whole
            number (a fractional fret would otherwise be truncated when the
            CSV is later cast to int).

    Returns:
        ``(string, fret)`` with ``string`` in 1..6 and ``fret`` an int in
        0..24, or ``(None, None)`` when the pitch is not numeric (None, NaN,
        infinity) or cannot be played on any string.
    """
    tuning = (64, 59, 55, 50, 45, 40)
    try:
        semitone = int(round(midi_note))
    except (TypeError, ValueError, OverflowError):
        # None, NaN or infinity: there is no playable position.
        return (None, None)

    for string_number, open_pitch in enumerate(tuning, start=1):
        fret = semitone - open_pitch
        if 0 <= fret <= 24:
            return (string_number, fret)
    return (None, None)

# Folder (relative to the working directory) that receives the converted CSVs.
output_dir = 'annotationcsv'
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
input_folder = '/content/annotation(jams)'

# Convert every .jams file in input_folder into a flat note table and save it
# as <same base name>.csv inside output_dir.
for filename in os.listdir(input_folder):
    if not filename.endswith('.jams'):
        continue

    jam_path = os.path.join(input_folder, filename)
    jam = jams.load(jam_path)
    notes = jam.search(namespace='note_midi')

    data = []
    for annotation in notes:
        for note in annotation.data:
            value = note.value
            if isinstance(value, dict):
                # The observation already carries an explicit position.
                string, fret, midi_pitch = value.get('string'), value.get('fret'), None
            else:
                # Plain pitch value: derive a position from the tuning table.
                midi_pitch = value
                string, fret = midi_to_fret(midi_pitch)

            if string is None or fret is None:
                continue

            data.append({
                'start_time': note.time,
                'end_time': note.time + note.duration,
                'string': string,
                'fret': fret,
                'duration': note.duration,
                'midi_pitch': midi_pitch
            })

    if not data:
        print(f"No note data in: {filename}")
        continue

    df = pd.DataFrame(data)
    csv_path = os.path.join(output_dir, os.path.splitext(filename)[0] + '.csv')
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

In [None]:
import shutil
from google.colab import files

# Zip the annotationcsv folder into annotationcsv.zip in the working directory.
shutil.make_archive(base_name="annotationcsv", format='zip', root_dir="annotationcsv")

check labels

In [None]:
import zipfile
import pandas as pd
import os
from google.colab import files

uploaded = '/content/annotationcsv.zip'
extract_dir = "/content/annotationextracted"
# Unpack the annotation archive ('r' is ZipFile's default mode).
with zipfile.ZipFile(uploaded) as archive:
    archive.extractall(path=extract_dir)

In [None]:
import glob

# Verify that every extracted CSV has exactly the expected header columns.
expected_columns = {'start_time', 'end_time', 'string', 'fret', 'duration', 'midi_pitch'}
csv_files = glob.glob(os.path.join(extract_dir, "**", "*.csv"), recursive=True)

correct_count = 0
incorrect_count = 0

print("checking :\n")

for csv_file in csv_files:
    display_name = os.path.basename(csv_file)
    try:
        df = pd.read_csv(csv_file)
        cols = set(df.columns)

        if cols == expected_columns:
            print(f"{display_name}: True")
            correct_count += 1
        else:
            missing = expected_columns - cols
            extra = cols - expected_columns
            print(f"{display_name}:")
            if missing:
                print(f"   - missing_column: {missing}")
            if extra:
                print(f"   - extra_column: {extra}")
            incorrect_count += 1
    except Exception as e:
        # An unreadable file counts as incorrect rather than aborting the check.
        print(f"{display_name}: errornotfound ({e})")
        incorrect_count += 1

total_files = correct_count + incorrect_count
print(f" - correct: {correct_count}")
# This counter covers files with wrong columns AND unreadable files, so it is
# reported as "incorrect" (it was previously mislabelled "missing").
print(f" - incorrect: {incorrect_count}")
print(f" - total: {total_files}")

map chords

In [None]:
import pandas as pd
import music21
from music21 import pitch
from tqdm import tqdm
import zipfile, os, glob
from google.colab import files

In [None]:
zip_path = '/content/annotationcsv.zip'
extract_dir = "/content/annotationextracted"
os.makedirs(name=extract_dir, exist_ok=True)

# Unpack the note CSVs that the chord-mapping cells read.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_dir)

In [None]:
# Open-string MIDI pitch for each string number, listed from string 6 to string 1.
base_midi = dict(zip(range(6, 0, -1), (40, 45, 50, 55, 59, 64)))

In [None]:
# Width of one chord-analysis window, in the same units as the CSV start_time column.
bin_size = 2.0
# Every CSV anywhere below the extraction directory.
csv_pattern = os.path.join(extract_dir, "**", "*.csv")
csv_files = glob.glob(csv_pattern, recursive=True)

In [None]:
# Destination for the per-file chord tables.
output_dir = "/content/chordoutput"
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
# Suffix appended to the root name for each chord quality handled by the fallback namer.
_QUALITY_SUFFIX = {
    'major': '',
    'minor': 'm',
    'diminished': 'dim',
    'augmented': 'aug',
    'dominant': '7',
}

# Column layout of the frame returned by predict_chords_from_csv.
_CHORD_COLUMNS = ["start_time", "end_time", "chord"]


def _name_chord(note_names):
    """Return a chord label for a list of pitch names, or "Unknown" if none can be derived."""
    try:
        chord_obj = music21.chord.Chord(note_names)
    except Exception:
        # Without a chord object there is nothing for the fallback to inspect.
        return "Unknown"

    try:
        figure = music21.harmony.chordSymbolFromChord(chord_obj).figure
        if "Cannot Be Identified" not in figure:
            return figure
    except Exception:
        # Chord-symbol lookup failed; fall through to root + quality naming.
        pass

    try:
        return chord_obj.root().name + _QUALITY_SUFFIX.get(chord_obj.quality, '?')
    except Exception:
        return "Unknown"


def predict_chords_from_csv(csv_path):
    """Label fixed-width time windows of a note CSV with a chord name.

    Reads the CSV, drops rows missing ``start_time``, ``end_time``, ``string``
    or ``fret``, buckets notes by ``start_time // bin_size`` and names the set
    of pitch names found in each bucket. ``bin_size`` and ``base_midi`` are
    read from module level.

    Args:
        csv_path: Path to a CSV with at least the four columns listed above.

    Returns:
        A DataFrame with columns ``start_time``, ``end_time`` and ``chord``
        (one row per non-empty window; the columns are present even when there
        are no rows), or ``None`` if the file could not be processed. Windows
        with fewer than 3 or more than 6 distinct pitch names are labelled
        "Unknown".
    """
    try:
        notes = (
            pd.read_csv(csv_path)
            .dropna(subset=['start_time', 'end_time', 'string', 'fret'])
            .astype({'start_time': float, 'end_time': float, 'string': int, 'fret': int})
        )
        notes['time_bin'] = (notes['start_time'] // bin_size).astype(int)

        chord_segments = []

        for time_bin, group in notes.groupby('time_bin'):
            # Rows whose string number is not in the tuning table are ignored.
            midi_pitches = [
                int(base_midi[string] + fret)
                for string, fret in zip(group['string'], group['fret'])
                if string in base_midi
            ]

            try:
                # NOTE(review): getEnharmonic() appears to respell every pitch,
                # naturals included, which looks like the origin of roots such as
                # 'C##' / 'F##' in the generated labels — confirm whether plain
                # Pitch(m).name was intended.
                note_names = sorted({pitch.Pitch(m).getEnharmonic().name for m in midi_pitches})
            except Exception:
                note_names = []

            if 3 <= len(note_names) <= 6:
                chord_name = _name_chord(note_names)
            else:
                chord_name = "Unknown"

            chord_segments.append({
                "start_time": time_bin * bin_size,
                "end_time": (time_bin + 1) * bin_size,
                "chord": chord_name
            })

        # Explicit columns keep the header intact even when no window was produced.
        return pd.DataFrame(chord_segments, columns=_CHORD_COLUMNS)
    except Exception as e:
        print(f"Error in {csv_path}: {e}")
        return None

In [None]:
# Run chord prediction on every note CSV and store the result as <name>_chords.csv.
for file in csv_files:
    source_name = os.path.basename(file)
    print(f"Predicting chords in: {source_name}")

    chord_df = predict_chords_from_csv(file)
    if chord_df is None:
        print(f"Failed to process {file}\n")
        continue

    output_path = os.path.join(output_dir, source_name.replace(".csv", "_chords.csv"))
    chord_df.to_csv(output_path, index=False)
    print(f"Saved to: {output_path}\n")

clean

In [None]:
import os
import glob
import pandas as pd
import re

In [None]:
# Raw chord tables are read from input_folder; cleaned copies go to output_folder.
input_folder = "/content/chordoutput"
output_folder = "/content/cleanoutput"

os.makedirs(name=output_folder, exist_ok=True)

In [None]:
def clean_chord_name(raw_chord):
    """Normalise a raw chord label into a compact ``<root><suffix>`` form.

    Processing steps, in order:
      * non-strings and blank / 'unknown' / '?' labels become 'Unknown';
      * Unicode accidentals and dash-style flats on the root ('E-', 'G--')
        are rewritten with '#' / 'b';
      * the bass of a slash chord and any parenthesised text are dropped;
      * characters outside the allow-lists are removed, while the quality
        tokens 'aug' and 'dim' are kept intact;
      * the root, including double sharps / flats, is mapped to a simpler
        enharmonic spelling (e.g. 'C##' -> 'D', 'G#' -> 'Ab').

    Suffix characters outside the final allow-list are discarded, so digits
    other than 7 and 9 and the word 'sus' do not survive (for example
    'Dsus4' collapses to 'D').

    Args:
        raw_chord: The label to clean; any type is accepted.

    Returns:
        The cleaned label, or 'Unknown' when the input is not a usable string
        or does not start with a root letter A-G after cleaning.
    """
    if not isinstance(raw_chord, str) or raw_chord.strip().lower() in ('unknown', '', '?'):
        return 'Unknown'

    chord = raw_chord.replace('♯', '#').replace('♭', 'b')
    # One or two dashes directly after the root letter denote flats; convert
    # them before the character filter below would silently delete them.
    chord = re.sub(
        r'^(\s*[A-G])(-{1,2})',
        lambda m: m.group(1) + 'b' * len(m.group(2)),
        chord,
    )
    chord = chord.split('/')[0]
    chord = re.sub(r'\(.*?\)', '', chord)
    # Keep 'aug' / 'dim' whole; every other disallowed character is removed.
    chord = re.sub(r'(aug|dim)|[^A-Ga-g#bmM\d]', lambda m: m.group(1) or '', chord)
    chord = chord.replace('M', '')
    # 'maj7' / 'maj9' lose their 'j' in the filter above; restore it.
    chord = chord.replace('ma7', 'maj7').replace('ma9', 'maj9')
    chord = chord.replace("eadd", "add")
    chord = re.sub(r"add[A-G#b]+", "", chord)
    chord = re.sub(r"[^A-G#bmaugdinsj79]+", "", chord)

    if re.match(r"[A-G][b#]?ma7", chord):
        chord = chord.replace("ma7", "maj7")

    enharmonic_map = {
        'B#': 'C', 'E#': 'F', 'Cb': 'B', 'Fb': 'E',
        'G#': 'Ab', 'D#': 'Eb', 'A#': 'Bb',
        # Double sharps: two semitones above the natural.
        'C##': 'D', 'D##': 'E', 'E##': 'F#', 'F##': 'G',
        'G##': 'A', 'A##': 'B', 'B##': 'C#',
        # Double flats: two semitones below the natural.
        'Cbb': 'Bb', 'Dbb': 'C', 'Ebb': 'D', 'Fbb': 'Eb',
        'Gbb': 'F', 'Abb': 'G', 'Bbb': 'A',
    }

    # Try double accidentals first so 'C##' is read as one root rather than 'C#' + '#'.
    chord_root_match = re.match(r'^([A-G](?:##|bb|[#b])?)(.*)', chord)
    if not chord_root_match:
        return 'Unknown'

    root, suffix = chord_root_match.groups()
    return enharmonic_map.get(root, root) + suffix

In [None]:
csv_files = glob.glob(os.path.join(input_folder, "*.csv"))
print(f"file : {len(csv_files)} files")

# Add a clean_chord column to every chord table and write it under the same
# file name into output_folder.
for file_path in csv_files:
    output_file = os.path.basename(file_path)
    try:
        df = pd.read_csv(file_path)
        df['clean_chord'] = df['chord'].apply(clean_chord_name)

        output_path = os.path.join(output_folder, output_file)
        df.to_csv(output_path, index=False)
    except Exception as e:
        print(f"error in {file_path}: {e}")
    else:
        print(f"cleaned: {output_file}")

In [None]:
import shutil
# Zip the cleanoutput folder; both paths are relative to the working directory.
shutil.make_archive(base_name="cleanoutput", format='zip', root_dir="cleanoutput")

check real chords

In [None]:
import pandas as pd
import os
import glob

In [None]:
# Cleaned chord tables in; playability reports out.
input_folder = "/content/cleanoutput"
output_folder = "/content/chord_playability_analysis"
os.makedirs(name=output_folder, exist_ok=True)

In [None]:
# Chord labels treated as "playable", assembled by chord family.
common_guitar_chords = (
    # Major and minor triads on every listed root.
    {
        root + quality
        for root in ('A', 'B', 'C', 'D', 'E', 'F', 'G',
                     'Bb', 'Eb', 'Ab', 'F#', 'C#', 'G#', 'D#', 'Db')
        for quality in ('', 'm')
    }
    # Dominant sevenths on the natural roots.
    | {root + '7' for root in 'ABCDEFG'}
    # Minor and major sevenths on selected roots.
    | {root + 'm7' for root in 'ABCDE'}
    | {root + 'maj7' for root in 'CFG'}
    # Suspended chords.
    | {root + sus for root in 'AD' for sus in ('sus2', 'sus4')}
    | {'Unknown'}
)

In [None]:
# Accumulators filled by the per-file loop: chord labels seen across all files,
# plus one summary record per file.
all_playable, all_unplayable = set(), set()
summary = list()

In [None]:
# Cleaned chord tables sitting directly inside input_folder.
csv_pattern = os.path.join(input_folder, "*.csv")
csv_files = glob.glob(csv_pattern)
print(f"Files : {len(csv_files)}")

In [None]:
# Split each file's distinct non-'Unknown' chord labels into playable and
# unplayable sets, and record one summary row per file.
for file in csv_files:
    df = pd.read_csv(file)

    is_known = df['clean_chord'] != 'Unknown'
    valid_chords = df[is_known]['clean_chord'].unique()
    n_valid = len(valid_chords)

    playable = {c for c in valid_chords if c in common_guitar_chords}
    unplayable = {c for c in valid_chords if c not in common_guitar_chords}

    all_playable.update(playable)
    all_unplayable.update(unplayable)

    # Guard against division by zero when a file has no usable chord at all.
    if n_valid > 0:
        percent_playable = round(100 * len(playable) / n_valid, 2)
    else:
        percent_playable = 0

    summary.append({
        'filename': os.path.basename(file),
        'total_segments': len(df),
        'unique_chords': n_valid,
        'playable_chords': len(playable),
        'unplayable_chords': len(unplayable),
        'percent_playable': percent_playable
    })

In [None]:
# Per-file summary table.
summary_df = pd.DataFrame(summary)
summary_df.to_csv(os.path.join(output_folder, "chord_playability_summary.csv"), index=False)

# One single-column CSV per chord set, sorted alphabetically.
for chord_set, column_name, csv_name in (
    (all_playable, "Playable_Chords", "playable_chords.csv"),
    (all_unplayable, "Unplayable_Chords", "unplayable_chords.csv"),
):
    pd.DataFrame(sorted(chord_set), columns=[column_name]).to_csv(
        os.path.join(output_folder, csv_name), index=False)

print("Done! :", output_folder)

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the playability summary (path is relative to the working directory) and preview it.
summary_csv = 'chord_playability_analysis/chord_playability_summary.csv'
df = pd.read_csv(summary_csv)
print(f"totaldata: {len(df)}")
print(df.head())

create spectrogram dataset

In [None]:
# Force-reinstall pinned TensorFlow and NumPy versions.
!pip install tensorflow==2.12 numpy==1.24.3 --force-reinstall
import os
# Send signal 9 to this kernel's own process, terminating it immediately.
# NOTE(review): presumably done so the runtime restarts and picks up the
# reinstalled packages — confirm. Variables from earlier cells are lost here.
os.kill(os.getpid(), 9)

In [None]:
import os
import librosa
import librosa.display
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
import zipfile
from google.colab import files

uploaded = '/content/cleanoutput.zip'
extract_dir = "/content/annotationcsv"
# Unpack the cleaned chord tables ('r' is ZipFile's default mode).
with zipfile.ZipFile(uploaded) as archive:
    archive.extractall(path=extract_dir)

วนทำสี่ไฟล์zipที่มี



*  audio_hex-pickup_original.zip
*  audio_hex-pickup_debleeded.zip
*  audio_mono-mic.zip
*  audio_mono-pickup_mix.zip

In [None]:
!pip install -q gdown
!gdown --id [1k9xL4zA-9QV3z3K2QdZPxP0hn4OoUkBU]

Failed to retrieve file url:

	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses.
	Check FAQ in https://github.com/wkentaro/gdown?tab=readme-ov-file#faq.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?id=[1k9xL4zA-9QV3z3K2QdZPxP0hn4OoUkBU]

but Gdown can't. Please check connections and permissions.


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files under MyDrive can be read by path.
drive.mount('/content/drive')
# Location of the de-bleeded hex-pickup audio archive on Drive.
# NOTE(review): no visible cell reads file_path afterwards — confirm it is still needed.
file_path = '/content/drive/MyDrive/audio_hex-pickup_debleeded.zip'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import zipfile
from google.colab import files

uploaded = '/content/drive/MyDrive/cleanoutput.zip'
extract_dir = "annotation"
# Unpack the cleaned chord tables from Drive into ./annotation.
with zipfile.ZipFile(uploaded) as archive:
    archive.extractall(path=extract_dir)

In [None]:
def rename_files_remove_hex(folder_path, tags=('_mix', '_mic', '_cln', '_hex')):
    """Strip recording-variant tags from every entry name in a folder.

    All tags are removed from a name in one pass and the entry is renamed
    once. Renaming once per tag fails for names that carry several tags
    (e.g. ``x_hex_cln.wav``), because after the first rename the original
    path no longer exists.

    Args:
        folder_path: Directory whose entries are renamed in place.
        tags: Substrings to delete from each name. Defaults to the four tags
            handled originally.

    Returns:
        None. One "Renamed: old → new" line is printed per renamed entry,
        followed by "Rename complete.".
    """
    # os.listdir returns a list, so renaming while iterating is safe.
    for filename in os.listdir(folder_path):
        new_filename = filename
        for tag in tags:
            new_filename = new_filename.replace(tag, '')

        # Nothing to strip, or stripping would leave an empty name.
        if new_filename == filename or not new_filename:
            continue

        old_path = os.path.join(folder_path, filename)
        new_path = os.path.join(folder_path, new_filename)
        os.rename(old_path, new_path)
        print(f"Renamed: {filename} → {new_filename}")
    print("Rename complete.")

In [None]:
# Strip the recording-variant tags from the audio file names.
# NOTE(review): presumably so each .wav base name matches its annotation CSV — confirm.
rename_files_remove_hex("/content/audiotrack")

Renamed: 03_Rock3-117-Bb_solo_hex.wav → 03_Rock3-117-Bb_solo.wav
Renamed: 04_SS1-68-E_solo_hex.wav → 04_SS1-68-E_solo.wav
Renamed: 05_Jazz3-137-Eb_comp_hex.wav → 05_Jazz3-137-Eb_comp.wav
Renamed: 03_Jazz2-110-Bb_comp_hex.wav → 03_Jazz2-110-Bb_comp.wav
Renamed: 04_BN3-119-G_solo_hex.wav → 04_BN3-119-G_solo.wav
Renamed: 00_SS1-68-E_comp_hex.wav → 00_SS1-68-E_comp.wav
Renamed: 02_SS1-68-E_comp_hex.wav → 02_SS1-68-E_comp.wav
Renamed: 02_Funk1-114-Ab_comp_hex.wav → 02_Funk1-114-Ab_comp.wav
Renamed: 01_Rock3-117-Bb_comp_hex.wav → 01_Rock3-117-Bb_comp.wav
Renamed: 01_Funk1-114-Ab_comp_hex.wav → 01_Funk1-114-Ab_comp.wav
Renamed: 03_Funk3-112-C#_solo_hex.wav → 03_Funk3-112-C#_solo.wav
Renamed: 05_BN2-131-B_comp_hex.wav → 05_BN2-131-B_comp.wav
Renamed: 05_Jazz2-110-Bb_comp_hex.wav → 05_Jazz2-110-Bb_comp.wav
Renamed: 03_Funk3-98-A_comp_hex.wav → 03_Funk3-98-A_comp.wav
Renamed: 02_SS2-88-F_comp_hex.wav → 02_SS2-88-F_comp.wav
Renamed: 03_BN3-119-G_comp_hex.wav → 03_BN3-119-G_comp.wav
Renamed: 01_Fu

In [None]:
import os

folder_path = '/content/spectrogram_final2'

# Count files in every sub-directory. A generator is used so that no
# module-level `files` variable is bound: a `for root, dirs, files in ...`
# loop would overwrite the `files` helper imported from google.colab.
count = sum(len(filenames) for _, _, filenames in os.walk(folder_path))

print(f"จำนวนไฟล์ทั้งหมด: {count}")

จำนวนไฟล์ทั้งหมด: 4034


spectrogram

In [None]:
def create_complete_chord_spectrogram_dataset(audio_dir, annotation_dir, output_dir, sr=22050, min_duration=0.8):
    """Render one mel-spectrogram PNG per labelled chord segment.

    For every ``.wav`` in ``audio_dir`` the first ``.csv`` in
    ``annotation_dir`` whose file name contains the audio base name is used as
    its annotation. Each annotation row with a usable ``clean_chord`` label is
    cut from the audio by ``start_time`` / ``end_time`` and saved as
    ``<output_dir>/<chord>/<audio base name>_<row index>.png``.

    Args:
        audio_dir: Directory containing the ``.wav`` files.
        annotation_dir: Directory containing the chord CSVs (columns
            ``start_time``, ``end_time``, ``clean_chord``).
        output_dir: Root directory for the images; created if missing.
        sr: Sample rate passed to ``librosa.load``.
        min_duration: Segments shorter than this many seconds are skipped.

    Returns:
        None. Progress, errors and the sorted set of chord labels encountered
        are printed.
    """
    os.makedirs(output_dir, exist_ok=True)
    audio_files = [f for f in os.listdir(audio_dir) if f.endswith('.wav')]

    # The annotation listing does not change while we iterate, so read it once
    # (and only when there is audio to match against).
    annotation_candidates = (
        [f for f in os.listdir(annotation_dir) if f.endswith('.csv')] if audio_files else []
    )
    min_samples = int(min_duration * sr)

    all_chords_set = set()

    for audio_file in audio_files:
        base_name = os.path.splitext(audio_file)[0]
        wav_path = os.path.join(audio_dir, audio_file)

        matched_file = next((f for f in annotation_candidates if base_name in f), None)

        if not matched_file:
            print(f"Annotation not found for {audio_file}, skipping.")
            continue

        annotation_path = os.path.join(annotation_dir, matched_file)

        try:
            y, _ = librosa.load(wav_path, sr=sr)
            df = pd.read_csv(annotation_path)
        except Exception as e:
            print(f"Error loading {audio_file}: {e}")
            continue

        for i, row in df.iterrows():
            start = row['start_time']
            end = row['end_time']
            chord = row['clean_chord']

            if chord == 'Unknown' or not isinstance(chord, str) or chord.strip() == "":
                continue

            # Recorded before the length check, so the printed label set can
            # include chords for which no image was written.
            all_chords_set.add(chord)

            segment = y[int(start * sr):int(end * sr)]

            if len(segment) < min_samples:
                continue

            try:
                S = librosa.feature.melspectrogram(y=segment, sr=sr, n_mels=128)
                S_dB = librosa.power_to_db(S, ref=np.max)

                chord_folder = os.path.join(output_dir, chord)
                os.makedirs(chord_folder, exist_ok=True)

                fig = plt.figure(figsize=(5, 4))
                try:
                    librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel')
                    plt.axis('off')

                    save_path = os.path.join(chord_folder, f"{base_name}_{i}.png")
                    plt.savefig(save_path, bbox_inches='tight', pad_inches=0, dpi=150)
                finally:
                    # Always release the figure, even when drawing or saving fails,
                    # so a run of bad segments cannot pile up open figures.
                    plt.close(fig)
            except Exception as e:
                print(f"Failed to process segment {i} in {audio_file}: {e}")

    print(f"\nTotal unique chords found: {len(all_chords_set)}")
    print(sorted(all_chords_set))

    print("\nSpectrogram dataset created.")

In [None]:
# Build the spectrogram dataset from the audio tracks and chord CSVs, using the
# function's default sample rate and minimum segment duration.
create_complete_chord_spectrogram_dataset(
    audio_dir= '/content/audiotrack',
    annotation_dir='/content/annotation',
    output_dir='/content/spectrogram_final2'
)


Total unique chords found: 26
['A', 'Am', 'B', 'B9', 'Bb', 'Bmaj7', 'C', 'C##', 'Cm', 'Cm7', 'Cm9', 'D', 'D7', 'Dm', 'Dm7', 'Eb', 'Ebmaj7', 'F', 'F##', 'F##m', 'G', 'G7', 'Gag', 'Gm', 'Gm7', 'Gm9']

Spectrogram dataset created.


fix chords

In [None]:
import shutil
import os
source_folder = '/content/spectrogram_final2/C##'
destination_folder = '/content/spectrogram_final2/D'

# Fold the 'C##' class into 'D' (the same pitch) by moving every regular file
# across; sub-directories are left where they are.
for filename in os.listdir(source_folder):
    source_path = os.path.join(source_folder, filename)
    destination_path = os.path.join(destination_folder, filename)
    if not os.path.isfile(source_path):
        continue
    shutil.move(source_path, destination_path)
print("complete")

complete


In [None]:
import shutil
from google.colab import files

# Zip the spectrogram dataset folder; both paths are relative to the working directory.
shutil.make_archive(base_name="spectrogram_final2", format='zip', root_dir="spectrogram_final2")

'/content/spectrogram_final2.zip'

In [None]:
from google.colab import files
# Hand the zipped spectrogram dataset to the browser as a download.
files.download('spectrogram_final2.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>