In [3]:

# Import necessary libraries
import numpy as np

import os

# Configurations (modify as needed)
class Config:
    """Tunable settings read by the train/validation split cells."""
    # Target share of all tunes to hold out for validation; the split cell
    # rounds the resulting count to a whole number of batches.
    validation_fraction = 0.2  # Fraction of data used for validation
    # Number of tunes per batch; the validation-set size is a multiple of this.
    batch_size = 32  # Batch size

config = Config()

# Define data path and fail immediately if it is missing
data_path = 'data/data_v2'
assert os.path.exists(data_path), "Data path does not exist!"

In [None]:

# Load the raw corpus text from the configured path
with open(data_path, 'r') as f:
    data = f.read()

# Collect every distinct whitespace-separated token, plus the two boundary
# markers that get wrapped around each tune when it is encoded
tokens_set = set(data.split())
start_symbol, end_symbol = '<s>', '</s>'
tokens_set.update({start_symbol, end_symbol})

# Create vocabulary mappings.
# Sorting makes the token -> index assignment reproducible across kernel
# restarts; iterating a set of strings directly depends on hash randomisation.
idx2token = sorted(tokens_set)
vocab_size = len(idx2token)
print('Vocabulary size:', vocab_size)
# `izip` was never imported (and does not exist in Python 3), so the previous
# line raised NameError; enumerate produces the same (token, index) pairing.
token2idx = {token: idx for idx, token in enumerate(idx2token)}


In [None]:

# Encode every tune (tunes are separated by a blank line in the corpus)
def _encode_tune(tune_text):
    """Map one tune's tokens, wrapped in the boundary markers, to vocabulary indices."""
    framed = [start_symbol, *tune_text.split(), end_symbol]
    return [token2idx[tok] for tok in framed]


tunes = list(map(_encode_tune, data.split('\n\n')))
del data  # Free up memory

# Longest tunes first
tunes.sort(key=len, reverse=True)
ntunes = len(tunes)
print('Number of tunes:', ntunes)


In [None]:
# Show the first tune as a list of vocabulary indices; after the
# longest-first sort in the tokenising cell this is the longest tune.
display(tunes[0])

In [None]:

# Calculate tune lengths
tune_lens = np.array([len(t) for t in tunes])
max_len = max(tune_lens)
print('Max tune length:', max_len)

# Size of the validation set: the configured fraction of all tunes, rounded to
# the nearest whole number of batches (never fewer than one batch)
nvalid_tunes = ntunes * config.validation_fraction
nvalid_tunes = config.batch_size * max(
    1, int(round(nvalid_tunes / float(config.batch_size)))
)
print('Number of validation tunes:', nvalid_tunes)

# Draw the validation indices without replacement, with a fixed seed so the
# split is reproducible.
# NOTE: rng.choice raises ValueError if nvalid_tunes exceeds ntunes, which
# happens when the corpus holds fewer tunes than one batch.
rng = np.random.RandomState(42)
valid_idxs = rng.choice(np.arange(ntunes), int(nvalid_tunes), replace=False)

# Create validation and training datasets.
# A set gives constant-time membership checks; `i not in valid_idxs` against
# the NumPy array rescanned the whole array for every single tune.
valid_idx_set = set(valid_idxs.tolist())
valid_tunes = [tunes[i] for i in valid_idxs]
train_tunes = [tune for i, tune in enumerate(tunes) if i not in valid_idx_set]

print('Training tunes:', len(train_tunes))
print('Validation tunes:', len(valid_tunes))


In [None]:
# Switch to the Dutch folk tune corpus and fail immediately if it is missing
data_path = 'DutchFolkTunes/dataset.txt'
assert os.path.exists(data_path), "Data path does not exist!"


# Load data from the specified path
with open(data_path, 'r') as f:
    data = f.read()

# Collect every distinct whitespace-separated token, plus the two boundary
# markers wrapped around each tune below
tokens_set = set(data.split())
start_symbol, end_symbol = '<s>', '</s>'
tokens_set.update({start_symbol, end_symbol})

# Create vocabulary mappings.
# Sorting makes the token -> index assignment reproducible across kernel
# restarts; iterating a set of strings directly depends on hash randomisation.
idx2token = sorted(tokens_set)
vocab_size = len(idx2token)
print('Vocabulary size:', vocab_size)
# `izip` was never imported (and does not exist in Python 3), so the previous
# line raised NameError; enumerate produces the same (token, index) pairing.
token2idx = {token: idx for idx, token in enumerate(idx2token)}

# Split the corpus into tunes (separated by a blank line) and encode each one
# as vocabulary indices framed by the start/end markers
tunes = data.split('\n\n')
del data  # Free up memory

tunes = [
    [token2idx[c] for c in [start_symbol] + t.split() + [end_symbol]]
    for t in tunes
]

# Sort tunes by length (longest first)
tunes.sort(key=len, reverse=True)
ntunes = len(tunes)
print('Number of tunes:', ntunes)


In [None]:
# Calculate tune lengths
tune_lens = np.array([len(t) for t in tunes])
max_len = max(tune_lens)
print('Max tune length:', max_len)

# Size of the validation set: the configured fraction of all tunes, rounded to
# the nearest whole number of batches (never fewer than one batch)
nvalid_tunes = ntunes * config.validation_fraction
nvalid_tunes = config.batch_size * max(
    1, int(round(nvalid_tunes / float(config.batch_size)))
)
print('Number of validation tunes:', nvalid_tunes)

# Draw the validation indices without replacement, with a fixed seed so the
# split is reproducible.
# NOTE: rng.choice raises ValueError if nvalid_tunes exceeds ntunes, which
# happens when the corpus holds fewer tunes than one batch.
rng = np.random.RandomState(42)
valid_idxs = rng.choice(np.arange(ntunes), int(nvalid_tunes), replace=False)

# Create validation and training datasets.
# A set gives constant-time membership checks; `i not in valid_idxs` against
# the NumPy array rescanned the whole array for every single tune.
valid_idx_set = set(valid_idxs.tolist())
valid_tunes = [tunes[i] for i in valid_idxs]
train_tunes = [tune for i, tune in enumerate(tunes) if i not in valid_idx_set]

print('Training tunes:', len(train_tunes))
print('Validation tunes:', len(valid_tunes))


In [None]:

# Visualization imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Set up plot styles
sns.set(style="whitegrid")

# One row per tune: its length in tokens and its position in `tune_lens`
tune_stats = pd.DataFrame({
    'Tune Length': tune_lens,
    'Tune Index': range(len(tune_lens))
})

# Display basic statistics in a DataFrame
display(tune_stats.describe())

# Histogram of tune lengths.
# `sns.distplot` is deprecated and missing from current seaborn releases;
# `histplot` is its replacement and plots counts by default, which is what the
# "Frequency" axis label below promises (distplot drew a density instead).
plt.figure(figsize=(10, 6))
sns.histplot(tune_stats['Tune Length'], kde=True, bins=30, color='skyblue')
plt.title("Distribution of Tune Lengths", fontsize=16)
plt.xlabel("Tune Length", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.show()

# Box plot showing the spread and outliers of tune lengths
plt.figure(figsize=(8, 6))
sns.boxplot(x=tune_stats['Tune Length'], color='lightgreen')
plt.title("Spread of Tune Lengths", fontsize=16)
plt.xlabel("Tune Length", fontsize=12)
plt.show()

# Token frequency analysis
# Number of most-frequent tokens to tabulate and plot.  The code always took
# 30 while the comments and chart title claimed 10; the labels now follow
# this single constant.
TOP_N_TOKENS = 30

# Count every token id in one vectorised pass instead of incrementing a
# pandas Series element once per token, which is very slow on a full corpus.
# minlength keeps a zero entry for vocabulary items that never occur.
token_counts = np.bincount(
    np.concatenate([np.asarray(tune, dtype=np.int64) for tune in tunes]),
    minlength=len(idx2token),
)
token_freq = pd.Series(token_counts, index=idx2token)

# Display the most frequent tokens in a DataFrame
top_tokens = token_freq.sort_values(ascending=False).head(TOP_N_TOKENS)
display(top_tokens.to_frame(name='Frequency'))

# Bar chart of the most frequent tokens
plt.figure(figsize=(10, 6))
top_tokens.plot(kind='bar', color='orange')
plt.title(f"Top {TOP_N_TOKENS} Tokens by Frequency", fontsize=16)
plt.xlabel("Token", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.xticks(rotation=45)
plt.show()


In [None]:
# Preview the first ten rows of the tune-length table
display(tune_stats.head(10))

# The output of this cell shows what the table looks like


In [None]:
# Import necessary libraries
import pickle
import pandas as pd

# Specify the path to your .pkl file
pkl_file_path = 'metadata/folkrnn_v2.pkl'

# Load the pickle file.
# SECURITY: pickle.load can execute arbitrary code -- only open files you trust.
# The raw-byte dump of this file further down starts with b"\x80\x02" and uses
# `U` (short byte-string) opcodes, i.e. it is a protocol-2 pickle written by
# Python 2.  encoding='latin1' lets Python 3 decode any such byte string
# (and is what NumPy arrays pickled by Python 2 require); pure-ASCII strings
# decode exactly as before.
with open(pkl_file_path, 'rb') as file:
    data = pickle.load(file, encoding='latin1')

display(data)


In [19]:
import re

# Path to the input text file.  Forward slashes are used because "\d" inside
# a normal string literal is an invalid escape sequence and the backslash
# form only resolves on Windows.
input_file = "DutchFolkTunes/dataset.txt"  # Replace with your file path

# Read the content of the input file
with open(input_file, "r") as file:
    input_string = file.read()


# Regular expression to find patterns like [M ... ], [K ... ], or [L ... ]
pattern = r"(\[M.*?\])|(\[K.*?\])|(\[L.*?\])"


def _isolate_header(match):
    """Return the replacement text for one matched bracketed field.

    Exactly one of the three groups is set per match:
      * [M ...]  -> one newline before the field
      * [K ...]  -> one newline before and one after the field
      * [L ...]  -> two newlines before the field
    """
    m_field, k_field, l_field = match.groups()
    if k_field:
        return '\n' + k_field + '\n'
    if l_field:
        return '\n\n' + l_field
    return '\n' + m_field


# A first re.sub pass used to run here, but its result was overwritten by the
# substitution below before being used, so only this pass is kept.
modified_string = re.sub(pattern, _isolate_header, input_string)


# Path to the output text file
output_file = "DutchFolkTunes/output.txt"  # Replace with your desired output file path

# Write the modified content to the output file
with open(output_file, "w") as file:
    file.write(modified_string)



In [3]:
# Import required libraries
import re

# Function to process lines
def process_lines(input_file, output_file):
    """Truncate each line at its first run of two or more digits.

    Every line of ``input_file`` is cut just before the first match of
    ``\\d{2,}`` (lines without such a run are kept whole), stripped of
    surrounding whitespace, and written to ``output_file`` followed by a
    newline.  The input is read completely before the output is opened.
    """
    digit_run = re.compile(r'\d{2,}')

    with open(input_file, 'r') as source:
        raw_lines = source.readlines()

    trimmed = []
    for raw in raw_lines:
        hit = digit_run.search(raw)
        kept = raw if hit is None else raw[:hit.start()]
        trimmed.append(kept.strip())

    with open(output_file, 'w') as sink:
        sink.writelines(text + '\n' for text in trimmed)

# Example usage.  Forward slashes replace the backslash paths, whose "\o" and
# "\F" are invalid escape sequences in a normal string literal and which only
# resolve on Windows.
input_filename = "DutchFolkTunes/output.txt"   # Replace with your input file name
output_filename = "DutchFolkTunes/Finaloutput.txt" # Replace with your desired output file name
process_lines(input_filename, output_filename)


In [None]:
import os

# Define the input and output file paths.  Forward slashes avoid the invalid
# "\o" escape sequence and work on every platform.
input_file = "DutchFolkTunes/output.txt"  # Replace with your input text file
output_file = "DutchFolkTunes/testoutput.txt"  # Replace with your desired output path

# Ensure the directory exists
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Read every line of the input file
with open(input_file, "r") as file:
    lines = file.readlines()


def _is_truncated_m_line(line):
    """True for a line that starts with "[M" but does not end with "]"."""
    return line.startswith("[M") and not line.rstrip().endswith("]")


# Keep everything except the truncated [M lines
filtered_lines = [line for line in lines if not _is_truncated_m_line(line)]

# Write the filtered lines to the output file
with open(output_file, "w") as file:
    file.writelines(filtered_lines)

In [1]:
import os

# Define the input and output file paths (forward slashes work on every platform)
input_file = "DutchFolkTunes/testoutput.txt"  # Replace with your input text file
output_file = "DutchFolkTunes/finalv2.txt"  # Replace with your desired output path

# Open the input file and process the lines
with open(input_file, "r") as file:
    lines = file.readlines()

# Create a list to store valid samples
valid_samples = []

# Expected structure of one 5-line sample.  These checks look for the
# bracketed form, so they must run on the ORIGINAL lines.
expected_format = [
    lambda line: line.startswith("[L") and line.endswith("]\n"),  # First line
    lambda line: line.startswith("[M") and line.endswith("]\n"),  # Second line
    lambda line: line.startswith("[K") and line.endswith("]\n"),  # Third line
    lambda line: True,  # Fourth line (any string)
    lambda line: line.strip() == "",  # Fifth line (empty line)
]


def _strip_brackets(line):
    """Remove every '[' and ']' from a line."""
    return line.replace("[", "").replace("]", "")


# Process the lines in groups of 5.
# Brackets used to be stripped BEFORE validation, so the "[L" / "]\n" checks
# could never succeed and the output file was always empty.  Validate the raw
# lines first and strip the brackets only from samples that are kept.
sample = []
for line in lines:
    sample.append(line)
    if len(sample) == len(expected_format):  # Check a complete sample
        if all(check_fn(raw) for check_fn, raw in zip(expected_format, sample)):
            valid_samples.extend(_strip_brackets(raw) for raw in sample)
        sample = []  # Reset for the next sample

# Write the valid samples to the output file
with open(output_file, "w") as file:
    file.writelines(valid_samples)

In [1]:
import os

# Define the input and output file paths (forward slashes work on every platform)
input_file = "DutchFolkTunes/OriginalDataset.txt"  # Replace with your input text file
output_file = "DutchFolkTunes/OriginalDatasetNoBrackets.txt"  # Replace with your desired output text file

# Open the input file and read the lines
with open(input_file, "r") as file:
    lines = file.readlines()

# Remove all '[' and ']' from every line.
# The lines used to be buffered in groups of five without any validation; the
# only effect of that buffering was that a trailing group of fewer than five
# lines (e.g. a last tune without a closing blank line) was silently dropped.
valid_samples = [line.replace("[", "").replace("]", "") for line in lines]

# Write the bracket-free lines to the output file
with open(output_file, "w") as file:
    file.writelines(valid_samples)

In [23]:
# Print the first 100 raw bytes of each metadata pickle, one after the other
for pkl_path in ('.\\metadata\\config5--20250109-102849.pkl',
                 '.\\metadata\\folkrnn_v2.pkl'):
    with open(pkl_path, 'rb') as f:
        raw_data = f.read()
        print(raw_data[:100])

b"\x80\x02}q\x01(U\ttoken2idxq\x02}q\x03(U\x02d'q\x04K\x00U\x05M:3/8q\x05K^U\x02=eq\x06K\x01U\x02=dq\x07K\x02U\x02=gq\x08K\x03U\x02=fq\tK\x04U\x02=aq\r\nK\x05U\x02=cq\x0bK\x06U\x02=bq\x0cK\x07U"
b"\x80\x02}q\x00(U\ttoken2idxq\x01}q\x02(U\x02d'q\x03K\x00U\x03=A,q\x04K\x01U\x03^c'q\x05K\x02U\x02=eq\x06K\x03U\x02=dq\x07K\x04U\x02=gq\x08K\x05U\x02=fq\tK\x06U\x02=aq\nK\x07U\x02=cq\x0bK\x08U\x02="


In [2]:
# Import necessary libraries
import re

# Define a function to clean the text file
def clean_text_file(input_file, output_file):
    """Copy ``input_file`` to ``output_file``, dropping misplaced ``[K:`` lines.

    Rules, applied line by line against the original file:
      * A line starting with ``[K:`` is kept only when the line directly
        before it starts with ``[M:``; the line that follows it (if there is
        one) is then kept as well.
      * A ``[K:`` line without a preceding ``[M:`` line is dropped.
      * Any other line is kept, unless it is a non-blank line directly after
        a ``[K:`` line (that line is either already emitted together with its
        ``[K:`` line, or belongs to a dropped one).

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the text file to write.
    """
    with open(input_file, 'r') as file:
        lines = file.readlines()

    cleaned_lines = []
    for i, line in enumerate(lines):
        # Empty string for the first line: it matches neither prefix below
        previous = lines[i - 1] if i > 0 else ''

        if line.startswith('[K:'):
            if previous.startswith('[M:'):
                cleaned_lines.append(line)  # Keep the [K line
                # Guard the look-ahead: a [K line at the very end of the file
                # used to raise IndexError here.
                if i + 1 < len(lines):
                    cleaned_lines.append(lines[i + 1])  # Keep the following line
        elif not (previous.startswith('[K:') and line.strip()):
            cleaned_lines.append(line)

    # Write the cleaned content to the output file
    with open(output_file, 'w') as file:
        file.writelines(cleaned_lines)

# Source file and destination file for the [K-line clean-up (edit as needed)
input_file, output_file = (
    './DutchFolkTunes/noExtraM.txt',
    './DutchFolkTunes/noExtraK.txt',
)

# Run the clean-up
clean_text_file(input_file, output_file)



In [3]:
# Import necessary libraries
import re

# Define a function to validate and clean the text file
def validate_and_clean_text_file(input_file, output_file):
    """Copy ``input_file`` to ``output_file`` without unterminated ``L`` lines.

    A line is dropped when it starts with ``L`` and, after stripping
    surrounding whitespace, does not end with ``]``.  Only that line itself
    is removed; the lines after it are evaluated independently.  All other
    lines are written unchanged and in their original order.
    """
    with open(input_file, 'r') as source:
        source_lines = source.readlines()

    def _should_drop(text):
        return text.startswith('L') and not text.strip().endswith(']')

    surviving = [text for text in source_lines if not _should_drop(text)]

    # Write the cleaned content to the output file
    with open(output_file, 'w') as sink:
        sink.writelines(surviving)

# Specify the input and output file paths
input_file, output_file = (
    './DutchFolkTunes/OriginalDatasetNoBrackets.txt',  # file to read
    './DutchFolkTunes/OriginalDatasetNoL.txt',  # file to write
)
# Run the clean-up on the paths above
validate_and_clean_text_file(input_file, output_file)

In [5]:
def validate_c_lines(input_file, output_file):
    """Copy ``input_file`` to ``output_file``, dropping orphaned ``c`` lines.

    A line starting with ``c`` is kept only when the line directly before it
    in the input starts with ``[K``; a ``c`` line at the very top of the file
    is always dropped.  Every other line is copied unchanged.
    """
    with open(input_file, 'r') as source:
        source_lines = source.readlines()

    kept = []
    previous = None  # previous line of the INPUT, whether or not it was kept
    for current in source_lines:
        follows_k = previous is not None and previous.startswith('[K')
        if follows_k or not current.startswith('c'):
            kept.append(current)
        previous = current

    # Write the cleaned content to the output file
    with open(output_file, 'w') as sink:
        sink.writelines(kept)

# Source file and destination file for the 'c'-line check (edit as needed)
input_file, output_file = (
    './DutchFolkTunes/filteredv2.txt',
    './DutchFolkTunes/filteredv3.txt',
)
# Run the check
validate_c_lines(input_file, output_file)

In [11]:
def extract_first_20000_lines(input_file, output_file, max_lines=20000):
    """Copy the leading lines of ``input_file`` into ``output_file``.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the text file to write (overwritten).
        max_lines: Maximum number of lines to copy.  Defaults to 20000, the
            previously hard-coded limit; ``None`` copies the whole file.

    Raises:
        ValueError: If ``max_lines`` is negative (raised by ``islice``).
    """
    from itertools import islice

    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        # islice stops reading as soon as the limit is reached
        outfile.writelines(islice(infile, max_lines))

# Full corpus to read and the smaller copy to write (edit as needed)
input_file, output_file = './data/data_v2', './data/data_v2Small'

# Write the truncated copy
extract_first_20000_lines(input_file, output_file)