# DataSet Creation

## encoder Test

In [1]:
import zipfile
import os
import json
import librosa
import numpy as np

In [2]:
from audiodiffusion.audio_encoder import AudioEncoder

audio_encoder = AudioEncoder.from_pretrained("teticio/audio-encoder")

In [3]:
!ls

1701171760.ogg                       [1m[36mbeatmaps[m[m
1701776382.mc                        [1m[36mbeatmaps_unzip[m[m
1701849827.mc                        c5234ac9383bb69ebe8bd619a5fed4b9.mcz
1701864148.mc                        tmp.wav
1_dataset_creation.ipynb


In [4]:
import soundfile

x , sr = librosa.load("1701171760.ogg", sr=20000)
soundfile.write("tmp.wav", x, sr)

In [83]:
def get_audio_features(audio_file, bpm, position, offset):
    x , sr = librosa.load(audio_file, sr=20000)
    one_beat = 60 / bpm
    beat = position * one_beat + offset/1000
    
    start = beat - one_beat / 8
    end = beat + one_beat / 8
    
    if start < 0:
        start = 0
    
    start_index = int(sr * start)
    end_index = int(sr * end)
    
    
    soundfile.write("tmp.wav", x[start_index:end_index], sr)
    
    
    return audio_encoder.encode(["tmp.wav"]).numpy()[0].tolist()

In [84]:
get_audio_features("1701171760.ogg", 200, 100.5, 150)

[22.779273986816406,
 113.05550384521484,
 -96.74163055419922,
 96.7263412475586,
 -106.27220916748047,
 11.222976684570312,
 -1.8523142337799072,
 46.24934768676758,
 -108.57376861572266,
 10.665033340454102,
 25.597043991088867,
 80.19082641601562,
 -17.686424255371094,
 159.30618286132812,
 53.294593811035156,
 126.19649505615234,
 48.52601623535156,
 24.32500648498535,
 -70.05785369873047,
 -78.19749450683594,
 -3.8685147762298584,
 12.262463569641113,
 -6.947540283203125,
 167.14097595214844,
 -142.86976623535156,
 286.0278015136719,
 -136.9066162109375,
 154.72434997558594,
 171.17295837402344,
 73.41738891601562,
 34.36893081665039,
 10.558130264282227,
 11.147315979003906,
 111.96158599853516,
 3.2329182624816895,
 33.30107498168945,
 288.6305847167969,
 -123.21646118164062,
 -22.744016647338867,
 157.8203582763672,
 50.84153366088867,
 -81.56290435791016,
 64.32252502441406,
 -52.34855270385742,
 6.465668678283691,
 21.01951026916504,
 110.18569946289062,
 -143.1083984375,
 19

## beatmaps filter

In [7]:
mcz_files = os.listdir("beatmaps")

In [8]:
count = 0
for mcz_file in mcz_files:
    if ".mcz" not in mcz_file:
        continue
    print(count, mcz_file)
    zFile = zipfile.ZipFile("beatmaps/" + mcz_file, "r")
    audio_file = ""
    mc_file = ""
    mc_data = {}
    for fileM in zFile.namelist():
        zFile.extract(fileM, './beatmaps_unzip')
    count += 1

0 miles away.mcz
1 fb98dd5bc1967db702ece05a84168d10.mcz
2 c5234ac9383bb69ebe8bd619a5fed4b9.mcz
3 初无改.mcz
4 Lose Control.mcz
5 4c9ddc941b482828c06751c6130fb394.mcz
6 NO.4 究极158秒.mcz
7 Lyrith -迷宮リリス-.mcz
8 Liv1ng 1n The F4st Lan3.mcz
9 efb3f1c6f7ab736db4f01e136b3f074a.mcz
10 d86c199af5abdaa72d2b0ae7a2043395.mcz
11 Cosmos Part.1.mcz
12 1de08fd5a808e09089530d115bb80ee9.mcz
13 Oshama Scramble! .mcz
14 ad1a2cf5b7c7da7039403faf80193bb0.mcz
15 7cd0467ce242192a6dd46b231f717799.mcz
16 望影の方舟Six.mcz
17 d7674e4e71b81539b2febfdd00dc5ba1.mcz
18 巴别塔.mcz
19 Ops：Limone.mcz
20 Loli Bomb (Speed up&Cut ver.).mcz
21 49cb1143d185f28e11e26ed546b03152.mcz
22 Cross the Edge.mcz
23 79eed254a0a09e48e436597d557910b6.mcz
24 The Multiverse(ft.oa).mcz


In [18]:
def check_dance3_beatmap(json_data):
    # 解析JSON数据
    data = json.loads(json_data)
    
    # 获取note列表
    notes = data['note']
    # 初始化最大column值
    max_column = -1
    
    # 遍历note，找到最大的column值
    for note in notes:
        if 'column' in note:
            if note['column'] > max_column:
                max_column = note['column']
    # 判断最大column值是否为6
    if max_column == 5:
        return True
    else:
        return False

In [19]:
mc_list = []
for fname in os.listdir("beatmaps_unzip"):
    if fname.endswith(".mc"):
        print(fname)
        with open(f"beatmaps_unzip/{fname}") as f:
            if check_dance3_beatmap(f.read()):
                mc_list.append(f"beatmaps_unzip/{fname}")

1701864148.mc
1652750857.mc
1696635166.mc
1695401434.mc
1663213760.mc
1664454762.mc
1652754796.mc
1688283329.mc
Σ╕¡τ║º.mc
1697989502.mc
1692538137.mc
1701776382.mc
1659541622.mc
1701849827.mc
1696637398.mc
1697869040.mc
1695401180.mc
Θ½ÿτ║º.mc
σê¥τ║º.mc
1703703071.mc
1688309494.mc
1700696948.mc
1669440136.mc
1688308799.mc
1669621668.mc
1694879415.mc
1669944775.mc
1696483497.mc
1648042355.mc
1698898550.mc
1669439231.mc
1694955584.mc
1692883511.mc
1703913735.mc


In [21]:
for fname in os.listdir("beatmaps_unzip/0"):
    if fname.endswith(".mc"):
        print(fname)
        with open(f"beatmaps_unzip/0/{fname}") as f:
            if check_dance3_beatmap(f.read()):
                mc_list.append(f"beatmaps_unzip/0/{fname}")

1644412671.mc
1587962568.mc
1689340595.mc
1659720056.mc
1588339363.mc
1689746687.mc
1658244850.mc
Mujinku-Vacuum Track#ADD8E6- (Dance Cube Hard Lv.18).mc
1587884900.mc
1667803736.mc


In [22]:
mc_list

['beatmaps_unzip/1701864148.mc',
 'beatmaps_unzip/1652750857.mc',
 'beatmaps_unzip/1696635166.mc',
 'beatmaps_unzip/1695401434.mc',
 'beatmaps_unzip/1663213760.mc',
 'beatmaps_unzip/1664454762.mc',
 'beatmaps_unzip/1652754796.mc',
 'beatmaps_unzip/1688283329.mc',
 'beatmaps_unzip/Σ╕¡τ║º.mc',
 'beatmaps_unzip/1697989502.mc',
 'beatmaps_unzip/1692538137.mc',
 'beatmaps_unzip/1701776382.mc',
 'beatmaps_unzip/1659541622.mc',
 'beatmaps_unzip/1701849827.mc',
 'beatmaps_unzip/1696637398.mc',
 'beatmaps_unzip/1697869040.mc',
 'beatmaps_unzip/1695401180.mc',
 'beatmaps_unzip/Θ½ÿτ║º.mc',
 'beatmaps_unzip/σê¥τ║º.mc',
 'beatmaps_unzip/1703703071.mc',
 'beatmaps_unzip/1688309494.mc',
 'beatmaps_unzip/1700696948.mc',
 'beatmaps_unzip/1688308799.mc',
 'beatmaps_unzip/1669621668.mc',
 'beatmaps_unzip/1694879415.mc',
 'beatmaps_unzip/1669944775.mc',
 'beatmaps_unzip/1696483497.mc',
 'beatmaps_unzip/1648042355.mc',
 'beatmaps_unzip/1698898550.mc',
 'beatmaps_unzip/1669439231.mc',
 'beatmaps_unzip/16949

In [23]:
len(mc_list)

43

In [52]:
from math import gcd


def get_mc_time_features(fpath):
    # 读取谱面数据
    with open(fpath, 'r') as f:
        chart_data = json.load(f)

    # 获取note信息
    notes = chart_data['note']

    # 定义一个字典来存储每个时间点的特征数组
    time_features = {}

    # 遍历每个note
    for note in notes:
        beat = note['beat']
        numerator = beat[1]  # 分数拍分子
        denominator = beat[2]  # 分数拍分母

        if numerator == 0:
            denominator = 1
        else:
            _gcd = gcd(numerator, denominator)
            numerator = int(numerator / _gcd)
            denominator = int(denominator / _gcd)

        # 将时间戳化简为最小单位
        time = (beat[0], numerator, denominator)
        if "column" in note:
            column = note['column']
            # 初始化这个时间点的特征数组
            if time not in time_features:
                time_features[time] = [0] * 6

            # 根据note类型设置特征值
            if 'endbeat' in note:
                time_features[time][column-1] = 2  # 长按键
            else:
                time_features[time][column-1] = 1  # 普通按键
    return [it for it in sorted(time_features.items())]

In [56]:
# 测试下
test_path = 'beatmaps_unzip/0/1667803736.mc'
time_features = get_mc_time_features(test_path)

# 输出每个时间点的特征数组
for time, features in time_features:
    print(f"时间点: {time}, 特征数组: {features}")

时间点: (2, 0, 1), 特征数组: [0, 0, 1, 0, 0, 1]
时间点: (3, 0, 1), 特征数组: [0, 0, 1, 0, 0, 1]
时间点: (4, 0, 1), 特征数组: [0, 0, 1, 0, 0, 1]
时间点: (5, 0, 1), 特征数组: [0, 0, 1, 0, 0, 1]
时间点: (6, 0, 1), 特征数组: [1, 0, 1, 0, 0, 0]
时间点: (7, 0, 1), 特征数组: [0, 0, 0, 1, 0, 1]
时间点: (8, 0, 1), 特征数组: [1, 0, 1, 0, 0, 0]
时间点: (9, 0, 1), 特征数组: [0, 0, 0, 1, 0, 1]
时间点: (10, 0, 1), 特征数组: [0, 1, 0, 0, 1, 0]
时间点: (11, 0, 1), 特征数组: [0, 1, 0, 0, 1, 0]
时间点: (12, 0, 1), 特征数组: [0, 1, 0, 0, 1, 0]
时间点: (13, 0, 1), 特征数组: [0, 1, 0, 0, 1, 0]
时间点: (14, 0, 1), 特征数组: [1, 0, 0, 0, 1, 0]
时间点: (15, 0, 1), 特征数组: [0, 1, 0, 1, 0, 0]
时间点: (16, 0, 1), 特征数组: [1, 0, 0, 0, 1, 0]
时间点: (17, 0, 1), 特征数组: [0, 1, 0, 1, 0, 0]
时间点: (18, 0, 1), 特征数组: [1, 0, 1, 0, 0, 0]
时间点: (19, 0, 1), 特征数组: [0, 1, 0, 0, 1, 0]
时间点: (20, 0, 1), 特征数组: [0, 0, 0, 1, 0, 1]
时间点: (21, 0, 1), 特征数组: [0, 1, 0, 0, 1, 0]
时间点: (22, 0, 1), 特征数组: [1, 0, 1, 0, 0, 0]
时间点: (23, 0, 1), 特征数组: [0, 1, 0, 0, 1, 0]
时间点: (24, 0, 1), 特征数组: [0, 0, 0, 1, 0, 1]
时间点: (25, 0, 1), 特征数组: [1, 0, 0, 0, 0, 0]


In [57]:
len(time_features)

776

In [64]:
import random

def split_array_with_context(array, min_length=50, max_length=80, context_length=20):
    result = []
    idx = 0

    while idx < len(array):
        # 随机选择子数组长度
        subarray_length = random.randint(min_length, max_length)
        
        # 确保不超出数组范围
        remaining_length = len(array) - idx
        if remaining_length < subarray_length:
            subarray_length = remaining_length

        # 计算上文的起始索引
        context_start = max(0, idx - context_length)

        # 切分子数组并添加上文信息
        subarray = array[context_start:idx] + array[idx:idx + subarray_length]
        
        # 添加到结果列表
        result.append(subarray)

        # 更新索引
        idx += subarray_length

    return result

# 示例数组
array = list(range(len(time_features)))

# 切分数组
split_arrays = split_array_with_context(array)

# 打印结果
for idx, subarray in enumerate(split_arrays):
    print(f"Subarray {idx+1}: Length={len(subarray)}, Values={subarray}")

Subarray 1: Length=64, Values=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]
Subarray 2: Length=100, Values=[44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143]
Subarray 3: Length=88, Values=[124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 1

### 编写生成最后数据的函数

In [67]:
test_path = 'beatmaps_unzip/0/1667803736.mc'

with open(test_path, "r") as f:
    json_data = json.loads(f.read())

In [68]:
json_data

{'meta': {'$ver': 0,
  'creator': '',
  'background': '-4179486dc2bb3a51.jpg',
  'version': 'Dance Cube Eazy Lv.11',
  'id': 0,
  'mode': 0,
  'time': 1667806217,
  'song': {'title': 'Loli Bomb (Speed up&Cut ver.)',
   'artist': 'Slax',
   'id': 28374,
   'titleorg': 'Loli Bomb (Speed up&Cut ver.)',
   'artistorg': 'Slax'},
  'mode_ext': {'column': 6, 'bar_begin': 0}},
 'time': [{'beat': [0, 0, 1], 'bpm': 214.5}],
 'effect': [],
 'note': [{'beat': [2, 0, 4], 'column': 0},
  {'beat': [2, 0, 4], 'column': 3},
  {'beat': [3, 0, 4], 'column': 0},
  {'beat': [3, 0, 4], 'column': 3},
  {'beat': [4, 0, 4], 'column': 0},
  {'beat': [4, 0, 4], 'column': 3},
  {'beat': [5, 0, 4], 'column': 0},
  {'beat': [5, 0, 4], 'column': 3},
  {'beat': [6, 0, 4], 'column': 1},
  {'beat': [6, 0, 4], 'column': 3},
  {'beat': [7, 0, 4], 'column': 0},
  {'beat': [7, 0, 4], 'column': 4},
  {'beat': [8, 0, 4], 'column': 1},
  {'beat': [8, 0, 4], 'column': 3},
  {'beat': [9, 0, 4], 'column': 0},
  {'beat': [9, 0, 4

In [69]:
bpm = json_data["time"][0]["bpm"]

bpm

214.5

In [72]:
import os

dir_ = os.path.dirname(test_path)
print(dir_)

beatmaps_unzip/0


In [76]:
audio_name = json_data["note"][-1]["sound"]
audio_path = os.path.join(dir_, audio_name)
audio_path

'beatmaps_unzip/0/1644412671.ogg'

In [78]:
json_data["note"][-1]

{'beat': [0, 0, 1],
 'sound': '1644412671.ogg',
 'vol': 100,
 'offset': 262,
 'type': 1}

In [79]:
json_data["note"][-1]["beat"] == [0, 0, 1]

True

In [108]:
def get_one_mc_data(fpath):
    """ 根据一个谱面生成数据 """
    
    ## 1. 获取基本信息bpm，audio_path，offset
    with open(fpath, "r") as f:
        json_data = json.loads(f.read())
        
    if json_data["note"][-1]["beat"] != [0, 0, 1]:
        raise Exception("谱面解析失败")
        
    if len(json_data["time"]) != 1:
        raise Exception("谱面解析失败")
    
    bpm = json_data["time"][0]["bpm"]
    audio_name = json_data["note"][-1]["sound"]
    dir_ = os.path.dirname(fpath)
    audio_path = os.path.join(dir_, audio_name)
                     
    offset = 0
    if "offset" in json_data["note"][-1]:
        offset = json_data["note"][-1]["offset"]
    
    
    ## 2. 获取时序特征
    time_features = get_mc_time_features(fpath)
    
    
    ## 3. 分割数组
    arr = list(range(len(time_features)))
    split_arrays = split_array_with_context(arr)
    
    ## 4. 提取音频特征
    audio_feature_map = {}
    for i in arr:
        time_info = time_features[i][0]
        position = time_info[0] + time_info[1] / time_info[2]
        audio_feature_map[i] = get_audio_features(audio_path, bpm, position, offset)
    
    ## 5. 生成结果数据
    inputs = []
    features = []
    
    for subarray in split_arrays:
        input_ = []
        feature_ = []
        for idx in subarray:
            input_.append(np.array(audio_feature_map[idx]))
            feature_.append(np.array(time_features[idx][1]))
    
        inputs.append(input_)
        features.append(feature_)
        
    return inputs, features

In [88]:
inputs, features = get_one_mc_data(test_path)

In [89]:
inputs

[[[22.77056121826172,
   114.47603607177734,
   -97.2273178100586,
   96.42029571533203,
   -107.72013854980469,
   11.17009162902832,
   -2.0630874633789062,
   46.32745361328125,
   -109.4720458984375,
   12.109869003295898,
   26.100624084472656,
   80.23246002197266,
   -17.691774368286133,
   160.4676971435547,
   55.06764221191406,
   126.42029571533203,
   48.686744689941406,
   25.26984214782715,
   -70.95433044433594,
   -78.5037841796875,
   -5.446882247924805,
   9.929266929626465,
   -6.988659858703613,
   168.6952667236328,
   -143.38986206054688,
   288.96270751953125,
   -137.8250274658203,
   155.2602081298828,
   175.41326904296875,
   74.62257385253906,
   34.84693908691406,
   9.674856185913086,
   12.038393020629883,
   114.78364562988281,
   3.6609086990356445,
   32.33864974975586,
   291.4024658203125,
   -123.70960998535156,
   -22.28220558166504,
   156.79298400878906,
   51.71268844604492,
   -81.38935852050781,
   64.76298522949219,
   -53.176204681396484,
  

In [104]:
features

[[[0, 0, 1, 0, 0, 1],
  [0, 0, 1, 0, 0, 1],
  [0, 0, 1, 0, 0, 1],
  [0, 0, 1, 0, 0, 1],
  [1, 0, 1, 0, 0, 0],
  [0, 0, 0, 1, 0, 1],
  [1, 0, 1, 0, 0, 0],
  [0, 0, 0, 1, 0, 1],
  [0, 1, 0, 0, 1, 0],
  [0, 1, 0, 0, 1, 0],
  [0, 1, 0, 0, 1, 0],
  [0, 1, 0, 0, 1, 0],
  [1, 0, 0, 0, 1, 0],
  [0, 1, 0, 1, 0, 0],
  [1, 0, 0, 0, 1, 0],
  [0, 1, 0, 1, 0, 0],
  [1, 0, 1, 0, 0, 0],
  [0, 1, 0, 0, 1, 0],
  [0, 0, 0, 1, 0, 1],
  [0, 1, 0, 0, 1, 0],
  [1, 0, 1, 0, 0, 0],
  [0, 1, 0, 0, 1, 0],
  [0, 0, 0, 1, 0, 1],
  [1, 0, 0, 0, 0, 0],
  [0, 1, 1, 0, 0, 0],
  [0, 1, 1, 0, 0, 0],
  [1, 0, 0, 1, 0, 0],
  [0, 0, 0, 0, 1, 1],
  [0, 2, 2, 0, 0, 0],
  [1, 0, 0, 1, 0, 0],
  [1, 0, 0, 1, 0, 0],
  [0, 0, 1, 0, 0, 1],
  [0, 0, 0, 0, 1, 1],
  [0, 0, 1, 0, 0, 1],
  [0, 0, 0, 0, 1, 1],
  [0, 1, 0, 1, 0, 0],
  [0, 0, 1, 1, 0, 0],
  [0, 1, 0, 1, 0, 0],
  [0, 0, 1, 1, 0, 0],
  [1, 0, 0, 0, 1, 0],
  [1, 1, 0, 0, 0, 0],
  [1, 0, 0, 0, 1, 0],
  [1, 1, 0, 0, 0, 0],
  [0, 1, 0, 1, 0, 0],
  [1, 0, 1, 0, 0, 0],
  [0, 0, 0

### 生成全部数据

In [128]:
X = []
y = []

# 这里其实还可以在优化下，因为特征提取的比较慢，有重复的歌曲
for idx, mc_file in enumerate(mc_list):
    print(idx, mc_file)
    try:
        inputs, features = get_one_mc_data(mc_file)
        X.extend(inputs)
        y.extend(features)
    except Exception as e:
        print("An exception occurred:", e)

0 beatmaps_unzip/1701864148.mc
1 beatmaps_unzip/1652750857.mc
2 beatmaps_unzip/1696635166.mc
3 beatmaps_unzip/1695401434.mc
4 beatmaps_unzip/1663213760.mc
5 beatmaps_unzip/1664454762.mc
6 beatmaps_unzip/1652754796.mc
7 beatmaps_unzip/1688283329.mc
8 beatmaps_unzip/Σ╕¡τ║º.mc
An exception occurred: [Errno 2] No such file or directory: 'beatmaps_unzip/（剪辑）Lose Control.ogg'
9 beatmaps_unzip/1697989502.mc
10 beatmaps_unzip/1692538137.mc
11 beatmaps_unzip/1701776382.mc
12 beatmaps_unzip/1659541622.mc
13 beatmaps_unzip/1701849827.mc
14 beatmaps_unzip/1696637398.mc
15 beatmaps_unzip/1697869040.mc
16 beatmaps_unzip/1695401180.mc
17 beatmaps_unzip/Θ½ÿτ║º.mc
An exception occurred: [Errno 2] No such file or directory: 'beatmaps_unzip/（剪辑）Lose Control.ogg'
18 beatmaps_unzip/σê¥τ║º.mc
An exception occurred: [Errno 2] No such file or directory: 'beatmaps_unzip/（剪辑）Lose Control.ogg'
19 beatmaps_unzip/1703703071.mc
20 beatmaps_unzip/1688309494.mc
21 beatmaps_unzip/1700696948.mc
22 beatmaps_unzip/168830

In [129]:
len(X)

415

In [130]:
len(y)

415

In [131]:
import numpy as np

np.save("X.npy", X)

In [132]:
np.save("y.npy", y)