In [14]:
# Same header that we use in colab to set up the environment
import os
import random
import csv
from miditok import REMI, Octuple
from miditok.data_augmentation import augment_midi_dataset
from pathlib import Path
import subprocess

try:
  # Optional local setup module; the notebook continues without it.
  import masters_environment
except ImportError:
  # Only a failed import is tolerated here. Catching everything (bare except)
  # would also hide KeyboardInterrupt and genuine errors raised by the module.
  print("No local environment loaded")

No local environment loaded


In [15]:
import pandas as pd

In [52]:
def midi_to_abc(midi_file_path, abc_file_path):
    """
    Convert one MIDI file to ABC notation with the external ``midi2abc`` tool.

    Args:
        midi_file_path: Path of the MIDI file to convert.
        abc_file_path: Path passed to ``midi2abc -o`` as the output file.

    Returns:
        bool: True when ``midi2abc`` exited with status 0, False when it
        exited with a non-zero status (the failure is reported on stderr
        and conversion of other files can continue).

    Raises:
        FileNotFoundError: propagated unchanged from ``subprocess.run`` when
        the ``midi2abc`` executable cannot be found.
    """
    import sys  # local import: only needed for the failure report

    command = ["midi2abc", midi_file_path, "-o", abc_file_path]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        # Best-effort batch conversion: do not abort, but do not hide the failure either.
        print(
            f"midi2abc failed (exit status {e.returncode}) for {midi_file_path}",
            file=sys.stderr,
        )
        return False
    return True

In [53]:
def convert_midi_to_abc(source_folder, target_folder):
    """
    Mirror a tree of MIDI files as ABC files.

    Every ``*.mid`` path found under ``source_folder`` (searched recursively)
    is handed to ``midi_to_abc``. The output path lies below ``target_folder``
    at the same relative location, with the suffix replaced by ``.abc``.
    Output directories are created as needed.
    """
    src_root = Path(source_folder)
    dst_root = Path(target_folder)

    # The destination root is created even when nothing gets converted.
    dst_root.mkdir(parents=True, exist_ok=True)

    for midi_file in src_root.rglob('*.mid'):
        # Same relative location as the source, extension swapped to .abc.
        abc_file = dst_root / midi_file.relative_to(src_root).with_suffix('.abc')
        abc_file.parent.mkdir(parents=True, exist_ok=True)
        midi_to_abc(midi_file, abc_file)

In [54]:
# Koechel (Köchel) catalogue table; its 'K6' and 'Composition' columns are read
# later when file names are matched to titles.
catelog = pd.read_csv("../_csv_data/koechel_catelog.csv")
# Source MIDI files (input to augmentation and tokenization).
mozart_midi_root = '../MusicLLM/data/Mozart'
# Output directory of the data-augmentation step.
mozart_midi_transposed_root = '../MusicLLM/data_transposed/Mozart'
# Output directory of the MIDI -> ABC conversion.
mozart_abc_root = '../MusicLLM/data_abc/Mozart'

In [55]:
# Tokenizer choice: Octuple is active; the REMI alternative is kept commented out.
#tokenizer = REMI()  # using default parameters (constants.py)
tokenizer = Octuple()
# Root of the original MIDI files as a Path object.
data_path = Path(mozart_midi_root)

In [56]:
data_path

PosixPath('../MusicLLM/data/Mozart')

In [57]:
!pwd

/Users/petergreis/Library/CloudStorage/Dropbox/Leeds/Project/_setup_notebooks


In [58]:
# Pitch-only data augmentation of the MIDI files under data_path; the velocity
# and duration offset lists are empty, so those dimensions are not augmented.
# Augmented MIDI files are saved under midi_aug_path.
# NOTE(review): offsets -5..+6 (0 omitted) are presumably semitone transpositions,
# and copy_original_in_new_location=True presumably also copies the untransposed
# originals into out_path — confirm against the miditok documentation.
midi_aug_path = Path(mozart_midi_transposed_root)

augment_midi_dataset(
    data_path,
    pitch_offsets=[-5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6],
    velocity_offsets=[],
    duration_offsets=[],
    out_path=midi_aug_path,
    copy_original_in_new_location=True,
    save_data_aug_report=True
)

Performing data augmentation: 100%|███████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 183.89it/s]


In [59]:
# Tokenize the MIDI files under data_path (the original, non-augmented root)
# with the tokenizer configured above; output goes to ../tokenized_test.
tokenizer.tokenize_midi_dataset(
    data_path,
    Path("../tokenized_test")
)

Tokenizing MIDIs (../tokenized_test): 100%|████████████████████████████████████████████████████████████████████████████████| 162/162 [00:13<00:00, 11.83it/s]


In [60]:
# Convert every *.mid file under the augmented MIDI tree to ABC, mirroring the
# folder layout under mozart_abc_root.
convert_midi_to_abc(mozart_midi_transposed_root, mozart_abc_root)

Error: Time=20160 Track=10 Note terminated when not on - pitch 43
Error: Time=119040 Track=10 Note terminated when not on - pitch 43
Error: Time=256320 Track=10 Note terminated when not on - pitch 43
Error: Time=20160 Track=11 Note terminated when not on - pitch 43
Error: Time=119040 Track=11 Note terminated when not on - pitch 43
Error: Time=256320 Track=11 Note terminated when not on - pitch 43
Error: Time=440836 Track=3 Note terminated when not on - pitch 72
Error: Time=442372 Track=3 Note terminated when not on - pitch 74
Error: Time=4320 Track=1 Note terminated when not on - pitch 0
Error: Time=4440 Track=1 Note terminated when not on - pitch 0
Error: Time=10080 Track=1 Note terminated when not on - pitch 0
Error: Time=11040 Track=1 Note terminated when not on - pitch 0
Error: Time=11040 Track=1 Note terminated when not on - pitch 0
Error: Time=12000 Track=1 Note terminated when not on - pitch 0
Error: Time=20160 Track=1 Note terminated when not on - pitch 0
Error: Time=21120 Trac

MIDI file has no notes!
MIDI file has no notes!
MIDI file has no notes!


Error: Time=46440 Track=10 Note terminated when not on - pitch 38
Error: Time=46560 Track=10 Note terminated when not on - pitch 38
Error: Time=73732 Track=1 Note terminated when not on - pitch 81
Error: Time=168196 Track=1 Note terminated when not on - pitch 74
Error: Time=18240 Track=10 Note terminated when not on - pitch 62
Error: Time=18240 Track=10 Note terminated when not on - pitch 62
Error: Time=18240 Track=10 Note terminated when not on - pitch 62
Error: Time=18240 Track=10 Note terminated when not on - pitch 62
Error: Time=18240 Track=10 Note terminated when not on - pitch 62
Error: Time=18240 Track=10 Note terminated when not on - pitch 62
Error: Time=18240 Track=10 Note terminated when not on - pitch 62
Error: Time=18240 Track=10 Note terminated when not on - pitch 62
Error: Time=18263 Track=10 Note terminated when not on - pitch 62
Error: Time=18263 Track=10 Note terminated when not on - pitch 62
Error: Time=18263 Track=10 Note terminated when not on - pitch 62
Error: Time

MIDI file has no notes!


In [61]:
def get_files_without_hash(directory):
    """Return the names of the entries in ``directory`` that contain no '#' character."""
    kept = []
    for entry in os.listdir(directory):
        if "#" in entry:
            continue
        kept.append(entry)
    return kept

def get_all_files(directory):
    """Return a list with the name of every entry in ``directory``."""
    return list(os.listdir(directory))

def get_abc_data(file_path):
    """
    Read a text file and return its content with backslash line breaks joined.

    Every backslash that is immediately followed by a newline is removed
    together with that newline, so a line split with a trailing backslash
    becomes a single line. Ordinary newlines are left untouched.
    """
    with open(file_path, 'r') as handle:
        raw = handle.read()
    return raw.replace("\\\n", "")

def get_random_query():
    """
    Build a randomly chosen instruction prompt.

    One of twenty fixed Mozart-style composition requests is picked with
    ``random.choice`` and returned wrapped as ``'Human: <request></s>'``.
    """
    prompts = (
        "Craft a musical arrangement echoing the elegance of Mozart.",
        "Develop a piece capturing the essence of Mozart's musical language.",
        "Construct a composition reminiscent of Mozart's classical style.",
        "Formulate a musical creation inspired by the genius of Mozart.",
        "Fashion a piece evoking the spirit of Mozart's compositions.",
        "Produce a composition in homage to Mozart's timeless style.",
        "Invent a musical work reflecting Mozart's signature elegance.",
        "Design a composition modeled after the classical tradition of Mozart.",
        "Conceive a piece that pays tribute to Mozart's musical legacy.",
        "Shape a composition in the vein of Mozart's classical masterpieces.",
        "Devise a musical arrangement embodying Mozartian grace and charm.",
        "Create a piece inspired by the classical structures favored by Mozart.",
        "Construct a composition resonating with the melodic brilliance of Mozart.",
        "Develop a musical work in the esteemed tradition of Mozart's compositions.",
        "Formulate a composition infused with the harmonic richness of Mozart's style.",
        "Craft a piece that captures the sophistication and refinement of Mozart's music.",
        "Produce a composition that echoes the grace and poise characteristic of Mozart.",
        "Design a musical arrangement paying homage to Mozart's classical genius.",
        "Invent a piece inspired by the timeless melodies of Mozart.",
        "Shape a composition that reflects the enduring legacy of Mozart's musical craftsmanship.",
    )

    return f"Human: {random.choice(prompts)}</s>"

In [62]:
# Sanity check: show one randomly chosen prompt in its final 'Human: ...</s>' form.
print(get_random_query())

Human: Devise a musical arrangement embodying Mozartian grace and charm.</s>


In [68]:
# Select the ABC files to include: only names without a '#'.
# NOTE(review): '#' presumably marks the transposed copies produced by the
# augmentation step — confirm against the file names on disk.
mozart_files = get_files_without_hash(mozart_abc_root)
# Alternative: include every file in the ABC directory.
#mozart_files = get_all_files(mozart_abc_root)

In [69]:
# Counters and accumulators filled by the dataset-building loop in the next cell.
# Re-run this cell before re-running that loop, otherwise counts and rows accumulate.
has_title = 0   # files whose catalogue lookup yielded a non-empty title
no_title = 0    # files without a catalogue match or with an empty title
missing = []    # lookup keys that had no catalogue match
column_names = ['instruction', 'input', 'output', 'src']

# Starts empty; despite its name, rows are appended to it by the loop below.
empty_df = pd.DataFrame(columns=column_names)
empty_df

Unnamed: 0,instruction,input,output,src


In [70]:
# Build one training record per ABC file in mozart_files and append the records
# to empty_df. has_title / no_title / missing are updated as a side effect; they
# are initialised in an earlier cell, so re-running only this cell accumulates.
created = 0
records = []  # rows are collected here and turned into a DataFrame once, after the loop

for n in mozart_files:
    # Catalogue lookup key: the file name up to its first '.'
    title = ''
    k = n.split('.')[0]
    matching_rows = catelog[catelog['K6'] == k]

    if len(matching_rows) > 0: # we have a title
        title = str(matching_rows.iloc[0]['Composition']).strip()
        if len(title) > 0:
            title = title + "</s>"
            has_title += 1
        else:
            no_title += 1
    else:
        title = ''
        no_title += 1
        missing.append(k)

    # Read from the configured ABC root instead of a second hard-coded copy of the path.
    data = get_abc_data(os.path.join(mozart_abc_root, n))

    # Record layout: instruction ("Human: ...</s>"), input, output ("Assistant: ...</s>"), src
    if len(data.strip()) > 0:
        new_instruction = get_random_query()
        # NOTE(review): the title looked up above is only counted, never stored;
        # 'input' is always empty. Confirm whether the title was meant to go here.
        new_input = ''
        new_output = "Assistant: " + data  + "</s>"
        # new_src = 'distilled'
        new_src = "https://www.kunstderfuge.com/mozart.htm"

        records.append([new_instruction, new_input, new_output, new_src])
        created += 1
    else:
        # Whitespace-only ABC file: nothing usable to emit.
        print("Skipping file: ", n)

# Single concat instead of one per file: avoids re-copying the growing frame on every iteration.
if records:
    empty_df = pd.concat(
        [empty_df, pd.DataFrame(records, columns=column_names)],
        ignore_index=True,
    )

print("Match: ", has_title, " no match: ", no_title)
print("Files created: ", created)

Skipping file:  581a_3_clarinet_quintet_(c)bakels.abc
Skipping file:  581a_4_clarinet_quintet_(c)bakels.abc
Skipping file:  581a_2_clarinet_quintet_(c)bakels.abc
Skipping file:  81a_1_clarinet_quintet_5(c)bakels.abc
Match:  11  no match:  150
Files created:  157


In [71]:
# Spot-check the first generated record.
empty_df.iloc[0]

instruction    Human: Shape a composition that reflects the e...
input                                                           
output         Assistant: X: 1\nT: from ../MusicLLM/data_tran...
src                      https://www.kunstderfuge.com/mozart.htm
Name: 0, dtype: object

In [72]:
# Write the dataset to CSV in the current working directory, quoting every
# field and omitting the index column. The commented-out line is the variant
# that writes to 'mozart_source_transposed.csv' instead.
#empty_df.to_csv('mozart_source_transposed.csv', mode='w', quoting=csv.QUOTE_ALL, index=False)
empty_df.to_csv('mozart_source.csv', quoting=csv.QUOTE_ALL, index=False) 

In [44]:
!pwd


/Users/petergreis/Library/CloudStorage/Dropbox/Leeds/Project/_setup_notebooks
