# 1. Environment setup

In [2]:
import json
import os
import numpy as np
import csv

# Folder that is scanned for .ds files during preprocessing
folder_path = './data'
output_path = 'output.csv' # aggregated, non-normalized dataset (CSV)
norm_output_path = 'normalized_output.csv' # normalized and encoded dataset (CSV)


# Paths used to convert the Opencpop transcription .txt file into a .ds file for training
txt_file_path = './data/transcriptions.txt' # Opencpop transcription file (input)
ds_file_path = './data/opencpop.ds' # converted Opencpop data (output)

# 2. Preprocessing
Process the source files and combine them into a single CSV file for training.

### 2.1 Convert the Opencpop transcriptions into a .ds file

In [3]:
# Parse a '|'-delimited transcription .txt file into a list of segment dicts
def process_txt_file(file_path):
    """Read a UTF-8 text file and turn each usable line into a segment dict.

    Every line is split on '|'. Blank or whitespace-only lines, and lines
    with fewer than five fields, are skipped. For the remaining lines the
    fields at index 2, 3 and 4 are stripped and stored under the keys
    'ph_seq', 'note_seq' and 'ph_dur' respectively.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of dicts, one per accepted line, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_lines = handle.read().split('\n')

    entries = []
    for raw_line in raw_lines:
        # Ignore empty / whitespace-only lines
        if not raw_line.strip():
            continue

        fields = raw_line.split('|')
        # Lines without at least five fields cannot supply all three values
        if len(fields) < 5:
            continue

        # NOTE(review): the published Opencpop layout is reportedly
        # id|text|phonemes|notes|note durations|phoneme durations|slur, in
        # which case index 4 holds note durations rather than phoneme
        # durations -- confirm which column is intended for 'ph_dur'.
        ph_seq, note_seq, ph_dur = (field.strip() for field in fields[2:5])
        entries.append({
            'ph_seq': ph_seq,
            'note_seq': note_seq,
            'ph_dur': ph_dur,
        })

    return entries

# Parse the transcription file into a list of segment dicts
ds_data = process_txt_file(txt_file_path)

# Serialize the segments as indented JSON (the .ds format), keeping
# non-ASCII characters unescaped, and write them out as UTF-8
with open(ds_file_path, 'w', encoding='utf-8') as ds_file:
    ds_file.write(json.dumps(ds_data, ensure_ascii=False, indent=4))

print(f"Data has been converted and written to {ds_file_path}")

Data has been converted and written to ./data/opencpop.ds


### 2.2 Process all .ds files into a dataset file

In [4]:
def read_ds_file(file_path):
    """Load a .ds file (JSON) and return the parsed content.

    The file is decoded explicitly as UTF-8. The .ds file produced earlier in
    this notebook is written as UTF-8 with non-ASCII characters left
    unescaped, so relying on the platform's default codec (which is
    locale-dependent, e.g. on Windows) could mis-decode it or raise
    UnicodeDecodeError.

    Args:
        file_path: Path of the .ds file to read.

    Returns:
        The object decoded from the file's JSON content.

    Raises:
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Flatten the segments of one .ds file into CSV-ready rows with derived statistics
def process_data(data):
    """Turn .ds segments into flat row dicts, adding counts and totals.

    For each segment the original fields ('offset', 'ph_seq', 'ph_dur',
    'note_seq', 'f0_seq', 'f0_timestep') are copied, defaulting to '' when a
    key is missing, and these derived values are appended:

    * 'sum_ph_dur'     - sum of the whitespace-separated floats in 'ph_dur'
    * 'count_*'        - number of whitespace-separated items in 'ph_dur',
                         'ph_seq', 'note_seq' and 'f0_seq'
    * 'total_f0_time'  - float('f0_timestep', default 0) * count of 'f0_seq'

    Args:
        data: Iterable of segment dicts whose sequence fields are strings.

    Returns:
        A list with one row dict per segment, in input order.
    """
    copied_keys = ('offset', 'ph_seq', 'ph_dur', 'note_seq', 'f0_seq', 'f0_timestep')
    rows = []

    for record in data:
        # Total phoneme duration
        durations = [float(item) for item in record.get('ph_dur', '').split()]
        duration_total = sum(durations)

        # Number of items in each whitespace-separated sequence
        n_ph_dur = len(record.get('ph_dur', '').split())
        n_ph_seq = len(record.get('ph_seq', '').split())
        n_note_seq = len(record.get('note_seq', '').split())
        n_f0_seq = len(record.get('f0_seq', '').split())

        # Time covered by the f0 curve: timestep times number of f0 samples
        timestep = float(record.get('f0_timestep', 0))
        f0_time = timestep * n_f0_seq

        # Copy the raw fields first so they lead the row's key order
        row = {key: record.get(key, '') for key in copied_keys}
        row['sum_ph_dur'] = duration_total
        row['count_ph_dur'] = n_ph_dur
        row['count_ph_seq'] = n_ph_seq
        row['count_note_seq'] = n_note_seq
        row['count_f0_seq'] = n_f0_seq
        row['total_f0_time'] = f0_time
        rows.append(row)

    return rows


# Collect the flattened segments from every .ds file in the data folder
all_data = []

# Iterate over the .ds files in sorted order: os.listdir returns entries in
# arbitrary order, and sorting keeps the CSV row order reproducible
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.ds'):
        continue
    file_path = os.path.join(folder_path, filename)
    data = read_ds_file(file_path)
    all_data.extend(process_data(data))
    print(f'{filename} processed')

# Fail early with a clear message; otherwise all_data[0] below would raise an
# opaque IndexError after the output file had already been truncated
if not all_data:
    raise ValueError(f"No segments found in any .ds file under {folder_path}")

# Write the aggregated data to a CSV file; the header is taken from the first row
with open(output_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=list(all_data[0].keys()))
    writer.writeheader()
    writer.writerows(all_data)

print(f"Data has been written to {output_path}")

00_sampleshort_origin.ds processed
00_我多想说再见啊.ds processed
01_逍遥仙.ds processed
02_一半一半.ds processed
04_仙瑶.ds processed
06_不谓侠.ds processed
opencpop.ds processed
samples_左手指月改.ds processed
samples_能解答一切的答案.ds processed
samples_这么可爱真是抱歉.ds processed
仙瑶.ds processed
Data has been written to output.csv


# 3. Normalizing

In [7]:
# Install scikit-learn if it is not already installed
%pip install scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/4e/ba/ce9bd1cd4953336a0e213b29cb80bb11816f2a93de8c99f88ef0b446ad0c/scikit_learn-1.3.2-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.3.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.5.0 from https://files.pythonhosted.org/packages/43/d0/f3cd75b62e1b90f48dbf091261b2fc7ceec14a700e308c50f6a69c83d337/scipy-1.11.4-cp311-cp311-win_amd64.whl.metadata
  Downloading scipy-1.11.4-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.4 kB ? eta -:--:--
     --------------------------------- ------ 51.2/60.4 kB 1.3 MB/s eta 0:00:01
     ---------------------------------------- 60.4/60.4 kB 1.6 MB/s eta 0:00:00
Collecting joblib>=1.1.1 (from scikit-learn)
  Obtaining dependency information for joblib>=1.1.1 from https://files.pythonh


[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: C:\Users\tfgmo\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [19]:
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler

# Read the aggregated (non-normalized) CSV and create the scaler used below
df = pd.read_csv(output_path)
scaler = MinMaxScaler()

# Split the whitespace-separated sequence strings into lists of tokens
for column in ('ph_seq', 'note_seq'):
    df[column] = df[column].apply(lambda text: text.split())

# Durations are split the same way and converted to floats
df['ph_dur'] = df['ph_dur'].apply(lambda text: list(map(float, text.split())))
# df['f0_seq'] = df['f0_seq'].apply(lambda x: [float(i) for i in x.split()] if isinstance(x, str) else x)

# Fit the scaler ONCE on every duration in the dataset. Calling fit_transform
# per row would refit the shared scaler each time, so every row would be
# scaled by its own min/max and the scaler pickled later in this cell would
# only describe the last row (making inverse_transform wrong for the others).
all_durations = np.fromiter(
    (duration for durations in df['ph_dur'] for duration in durations),
    dtype=float,
)
scaler.fit(all_durations.reshape(-1, 1))

# Function to scale an individual list with the already-fitted scaler
def scale_list(lst):
    """Scale one list of durations with the dataset-wide fitted scaler.

    Args:
        lst: Sequence of numeric durations.

    Returns:
        A plain list of scaled floats with the same length as ``lst``; an
        empty input yields an empty list.
    """
    # Reshape to the (n_samples, 1) column layout the scaler expects
    reshaped = np.asarray(lst, dtype=float).reshape(-1, 1)
    if reshaped.size == 0:
        # The scaler rejects arrays with zero samples
        return []
    # transform (not fit_transform) so the fitted parameters stay untouched
    scaled = scaler.transform(reshaped).flatten()
    return scaled.tolist()

# Transform every row with the shared, dataset-wide scaling
df['ph_dur'] = [scale_list(x) for x in df['ph_dur']]

# Get unique tokens and create a mapping to integers for ph_seq.
# The tokens are sorted before numbering: iteration order of a set of strings
# can differ between interpreter runs, which would otherwise give every run a
# different token -> id mapping. IDs start at 1.
unique_ph_tokens = set(token for seq in df['ph_seq'] for token in seq)
ph_token_to_int = {token: i for i, token in enumerate(sorted(unique_ph_tokens), start=1)}

# Load the pre-existing note token-to-int mapping; read it explicitly as UTF-8
# (the same way the decoding cell reads it) instead of the locale default
with open('note_token_to_int.json', 'r', encoding='utf-8') as file:
    note_token_to_int = json.load(file)

# Save the ph_seq token-to-int mapping to a JSON file
with open('ph_token_to_int.json', 'w', encoding='utf-8') as f:
    json.dump(ph_token_to_int, f, ensure_ascii=False, indent=4)

# Integer encode the sequences; a token missing from a mapping raises KeyError
df['ph_seq_encoded'] = df['ph_seq'].apply(lambda seq: [ph_token_to_int[token] for token in seq])
df['note_seq_encoded'] = df['note_seq'].apply(lambda seq: [note_token_to_int[token] for token in seq])

# Longest list found in any of the three per-row sequence columns;
# used as the common target length when padding
max_length = max(
    df[column].apply(len).max()
    for column in ('ph_seq_encoded', 'note_seq_encoded', 'ph_dur')
)

# Right-pad a sequence to max_length using the sequence's own mean as filler
def pad_sequence(seq):
    """Return ``seq`` as a list, extended to ``max_length`` items.

    The filler value is ``np.mean(seq)``. Sequences that already have
    ``max_length`` items (or more) are returned unchanged as a new list.

    NOTE(review): this is also applied to the integer-encoded token columns,
    where the mean is generally not a valid token id -- confirm that this is
    the intended padding value for those columns.
    """
    filler = np.mean(seq)
    padded = list(seq)
    padded.extend([filler] * (max_length - len(seq)))
    return padded

# Pad every per-row list column to the common length
for column in ('ph_seq_encoded', 'note_seq_encoded', 'ph_dur'):
    df[column] = df[column].apply(pad_sequence)
df['ph_dur'] = pd.array(df['ph_dur'])

# Remove the raw and helper columns that are not written to the normalized file
columns_to_drop = [
    'ph_seq', 'note_seq', 'offset', 'f0_timestep', 'f0_seq',
    'count_ph_dur', 'count_ph_seq', 'count_note_seq', 'count_f0_seq',
    'total_f0_time', 'sum_ph_dur',
]
df = df.drop(columns=columns_to_drop)

df.to_csv(norm_output_path, index=False)

# Persist the scaler object with pickle so it can be reloaded later
# (pickle files should only ever be loaded from trusted sources)
with open('ph_dur_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# 4. Decoding
Decode the results after generation.

In [None]:
# Read back the phoneme token-to-int mapping (UTF-8 JSON)
with open('ph_token_to_int.json', 'r', encoding='utf-8') as ph_map_file:
    ph_token_to_int = json.loads(ph_map_file.read())

# Read back the note token-to-int mapping (UTF-8 JSON)
with open('note_token_to_int.json', 'r', encoding='utf-8') as note_map_file:
    note_token_to_int = json.loads(note_map_file.read())