In [1]:
import pandas as pd

In [2]:
# Load the chord progressions: one progression per row in a 'chords' column,
# stored either with a header row or as a bare, headerless single column.
CSV_PATH = './cutonomicon.csv'

try:
    data = pd.read_csv(CSV_PATH)
except pd.errors.ParserError:
    # Could not be parsed with an inferred header; retry as headerless below.
    # Other errors (e.g. a missing file) are allowed to propagate unchanged.
    data = None

# A headerless file does not make read_csv raise -- the first row is silently
# consumed as the column name -- so check for the expected column explicitly.
if data is None or 'chords' not in data.columns:
    data = pd.read_csv(CSV_PATH, header=None, names=['chords'])

# Normalise whitespace: collapse runs to a single space and trim both ends.
data['chords'] = data['chords'].str.replace(r'\s+', ' ', regex=True).str.strip()

def create_windowed_sequences(chord_sequence, window_size=4):
    """Slide a fixed-size window over a whitespace-separated chord string.

    Each window becomes one record whose first chord is stored under
    'input_chord' and whose remaining chords are stored under 'chord_2',
    'chord_3', ... up to 'chord_<window_size>'.

    Parameters
    ----------
    chord_sequence : str or missing value
        Chords separated by whitespace. A missing value (anything for which
        ``pd.isna`` is true, e.g. NaN or None) yields no records.
    window_size : int, default 4
        Number of consecutive chords per record. Must be at least 1.

    Returns
    -------
    list of dict
        One dict per window position, in order of appearance. Empty when the
        sequence is missing or holds fewer than ``window_size`` chords.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if pd.isna(chord_sequence):  # Handle NaN values
        return []

    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    chords = chord_sequence.split()

    # Derive the record keys from window_size instead of hard-coding four of
    # them, so the parameter actually controls the width of each record.
    keys = ['input_chord'] + [f'chord_{pos}' for pos in range(2, window_size + 1)]

    # range() is empty when there are fewer chords than window_size, so short
    # sequences naturally produce no records.
    return [
        dict(zip(keys, chords[start:start + window_size]))
        for start in range(len(chords) - window_size + 1)
    ]

# Flatten the windows from every progression into one list of records.
# Iterating the column directly avoids iterrows(), which constructs a Series
# object for every row just to read a single field.
all_sequences = []
for chord_sequence in data['chords']:
    all_sequences.extend(create_windowed_sequences(chord_sequence))

if all_sequences:  # Only create if we have data
    # Build the frame once from the full record list rather than growing it.
    result_df = pd.DataFrame(all_sequences)
    print(f"Generated {len(result_df)} training samples")
    print(result_df.head())
else:
    print("No valid sequences found - check your input data")

Generated 39528498 training samples
  input_chord chord_2 chord_3 chord_4
0        Cmaj    Fmaj    Cmaj      E7
1        Fmaj    Cmaj      E7    Amin
2        Cmaj      E7    Amin    Cmaj
3          E7    Amin    Cmaj    Fmaj
4        Amin    Cmaj    Fmaj    Cmaj


In [3]:

# Persist every generated sequence to a gzip-compressed Parquet file
output_path = 'all_chord_sequences.parquet.gzip'
result_df.to_parquet(output_path, compression='gzip')