In [34]:
import pandas as pd

In [35]:
import os
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Path of this notebook's working folder. The get_*_file helpers below take its
# parent directory and look for the POP909 dataset folder there.
# NOTE(review): despite the name this is an absolute, machine-specific Windows
# path; it has to be edited before the notebook can run on another machine.
relative_path = r'C:\Users\Hsieh\Documents\nccucs\specialTopic\special_topic\src\data_process\transition__chord_matrix'


def get_chord_file(relative_path) -> list:
    """
    Build the list of ``chord_midi.txt`` paths, one per POP909 song folder.

    The ``POP909`` folder is looked up inside the parent directory of
    ``relative_path``. Only entries whose name consists solely of digits are
    treated as song folders; everything else is ignored.

    :param relative_path: a path whose parent directory contains ``POP909``
    :return: list of chord file paths, sorted by song folder name
    :raises OSError: if the ``POP909`` folder cannot be listed
    """

    pop909_dir = os.path.join(Path(relative_path).parent, 'POP909')

    # Filter into a new list instead of calling remove() while iterating:
    # removing during iteration skips the entry that follows each removal, so
    # adjacent non-numeric names used to slip through.
    # Sorting makes the order deterministic (os.listdir order is arbitrary),
    # which matters because callers derive song numbers from list positions.
    song_dirs = sorted(
        name for name in os.listdir(pop909_dir) if name.isdigit())

    return [os.path.join(pop909_dir, name, 'chord_midi.txt')
            for name in song_dirs]


def get_beat_file(relative_path) -> list:
    """
    Build the list of ``beat_midi.txt`` paths, one per POP909 song folder.

    The ``POP909`` folder is looked up inside the parent directory of
    ``relative_path``. Only entries whose name consists solely of digits are
    treated as song folders; everything else is ignored.

    :param relative_path: a path whose parent directory contains ``POP909``
    :return: list of beat file paths, sorted by song folder name
    :raises OSError: if the ``POP909`` folder cannot be listed
    """

    pop909_dir = os.path.join(Path(relative_path).parent, 'POP909')

    # Filter into a new list instead of calling remove() while iterating:
    # removing during iteration skips the entry that follows each removal, so
    # adjacent non-numeric names used to slip through.
    # Sorting makes the order deterministic (os.listdir order is arbitrary),
    # which matters because callers derive song numbers from list positions.
    song_dirs = sorted(
        name for name in os.listdir(pop909_dir) if name.isdigit())

    return [os.path.join(pop909_dir, name, 'beat_midi.txt')
            for name in song_dirs]


def get_midi_file(relative_path) -> list:
    """
    Build the list of ``<song>.mid`` paths, one per POP909 song folder.

    The ``POP909`` folder is looked up inside the parent directory of
    ``relative_path``. Only entries whose name consists solely of digits are
    treated as song folders; the MIDI file is expected to be named after its
    folder (e.g. ``POP909/001/001.mid``).

    :param relative_path: a path whose parent directory contains ``POP909``
    :return: list of MIDI file paths, sorted by song folder name
    :raises OSError: if the ``POP909`` folder cannot be listed
    """

    pop909_dir = os.path.join(Path(relative_path).parent, 'POP909')

    # Filter into a new list instead of calling remove() while iterating:
    # removing during iteration skips the entry that follows each removal, so
    # adjacent non-numeric names used to slip through.
    # Sorting makes the order deterministic (os.listdir order is arbitrary),
    # which matters because callers derive song numbers from list positions.
    song_dirs = sorted(
        name for name in os.listdir(pop909_dir) if name.isdigit())

    return [os.path.join(pop909_dir, name, f"{name}.mid")
            for name in song_dirs]

In [36]:
def data_preprocess(chord_df) -> pd.DataFrame:
    """
    Simplify the chord labels of a chord DataFrame in place.

    Each value in the ``chord`` column is cut at the first ``/`` and then at
    the first ``(``, so only the text before those characters is kept.

    :param chord_df: DataFrame with a string ``chord`` column (modified in place)
    :return: the same ``chord_df`` object with the shortened labels
    """

    def _strip_suffixes(label):
        # keep what precedes the first '/', then what precedes the first '('
        before_slash = label.split('/')[0]
        return before_slash.split('(')[0]

    chord_df['chord'] = chord_df['chord'].map(_strip_suffixes)
    return chord_df


def merge_all_df(file_list) -> pd.DataFrame:
    """
    Read every chord file, preprocess it and merge the results into one frame.

    Each file is read as a tab-separated table with the columns
    ``start_time``, ``end_time`` and ``chord``, passed through
    ``data_preprocess`` and tagged with ``song_num`` (1-based position of the
    file in ``file_list``). The merged frame is also written to
    ``csv_file/all_chord.csv``.

    :param file_list: the chord file list
    :return: all_df, the concatenation of all per-song chord frames
             (an empty DataFrame when ``file_list`` is empty)
    """

    song_frames = []

    for index, file in enumerate(file_list):

        chord_df = pd.read_csv(file, sep='\t', header=None, names=[
                               'start_time', 'end_time', 'chord'])

        chord_df = data_preprocess(chord_df)
        chord_df['song_num'] = index + 1

        song_frames.append(chord_df)

    # Concatenate once: growing a DataFrame with concat inside the loop copies
    # all previously merged rows on every iteration.
    all_df = pd.concat(song_frames) if song_frames else pd.DataFrame()

    # Make sure the output folder exists so a fresh checkout does not fail here.
    os.makedirs("csv_file", exist_ok=True)
    all_df.to_csv(r"csv_file/all_chord.csv", index=True, header=True)
    return all_df


# Collect every chord annotation file and merge them into one DataFrame.
# `all_chord` is read as a module-level global by process_row further down.
all_file = get_chord_file(relative_path)
all_chord = merge_all_df(all_file)

In [37]:
# read all beat file

def read_all_beat_file(all_file):
    """
    Read every beat file and turn its downbeats into a list of bars.

    Each file is read as a space-separated table without header and its last
    column is dropped. In the remaining table the first column is used as the
    beat time and the last column as the downbeat flag (a value of 1 marks a
    downbeat). Consecutive downbeat times form the bars; the final bar runs
    from the last downbeat to the time of the last row in the file.

    :param all_file: iterable of beat file paths
    :return: one entry per file, each a list of ``[start, end]`` bars.
             A file without any downbeat yields an empty list, so the result
             stays aligned with ``all_file``.
    """

    all_beat = []
    for file in all_file:

        # read the txt to pd
        df = pd.read_csv(file, sep=" ", header=None)

        # remove the last column
        df = df.iloc[:, :-1]

        beat_times = df.iloc[:, 0].tolist()
        downbeat_flags = df.iloc[:, -1].tolist()

        # rows whose (new) last column is 1 are downbeats
        down_beat_times = [time for time, flag
                           in zip(beat_times, downbeat_flags) if flag == 1]

        # every pair of consecutive downbeats is one bar
        bars = [[bar_start, bar_end] for bar_start, bar_end
                in zip(down_beat_times, down_beat_times[1:])]

        # The closing bar starts at the last downbeat. Using the downbeat list
        # (rather than bars[-1]) also covers files with exactly one downbeat,
        # which previously raised IndexError on the empty `bars` list.
        if down_beat_times:
            bars.append([down_beat_times[-1], beat_times[-1]])

        all_beat.append(bars)

    return all_beat

In [38]:
import pretty_midi
import pandas as pd

# The 12 pitch-class names (sharps only), indexed by `midi_note % 12` in
# midi_note_to_pitch and used as the axis order of the 12-d melody vectors.
pitch_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']


def process_row(row, song_num):
    """
    Pick the chord whose time span is closest to the bar described by ``row``.

    The distance of a chord to the bar is
    ``|start_time - bar_start| + |end_time - bar_end|``; the first chord with
    the smallest distance wins. Chords are taken from the module-level
    ``all_chord`` DataFrame, restricted to ``song_num``.

    :param row: mapping/Series with a ``Timestamp`` entry holding
                ``[bar_start, bar_end]``
    :param song_num: song number used to select rows of ``all_chord``
    :return: the chord label of the best matching chord row
    :raises ValueError: if ``all_chord`` has no rows for ``song_num``
    """

    start = row['Timestamp'][0]
    end = row['Timestamp'][1]

    # Boolean indexing already returns a new frame, so no extra copy is needed.
    song_chords = all_chord[all_chord['song_num'] == song_num]
    if song_chords.empty:
        raise ValueError(f"no chord rows found for song_num={song_num}")

    # Vectorised distance: avoids a Python-level lambda call per chord row,
    # which dominated the run time because this function runs once per bar.
    gap = ((song_chords['start_time'] - start).abs()
           + (song_chords['end_time'] - end).abs())

    # find the min gap
    min_gap = gap.min()

    # find the chord (first row reaching the minimum)
    chord = song_chords.loc[(gap == min_gap).to_numpy(), 'chord'].values[0]
    return chord


def midi_note_to_pitch(midi_note) -> str:
    """
    Convert a MIDI note number to its pitch-class name.

    Only the pitch class is returned (no octave), looked up in the
    module-level ``pitch_names`` list by ``midi_note % 12``.

    param midi_note: int
    return: str
    """

    # Equal temperament: every 12 semitones the pitch class repeats.
    # (The octave was computed here before but never used, so it is dropped.)
    return pitch_names[midi_note % 12]


def split_midi_to_measure(midi_file: str, time_section: list) -> list:
    """
    Group the notes of a MIDI file's first instrument by time section.

    A note belongs to a section when its start time lies in the half-open
    interval ``[section[0], section[1])``. Each note is reported as the pitch
    name returned by ``midi_note_to_pitch``.

    Args:
        midi_file (str): the midi file name
        time_section (list): sequence of ``[start, end]`` pairs
    Returns:
        one list of pitch names per entry of ``time_section``
    """

    song = pretty_midi.PrettyMIDI(midi_file)

    # One inner list per section; the first instrument's notes are scanned
    # again for every section.
    return [
        [
            midi_note_to_pitch(note.pitch)
            for note in song.instruments[0].notes
            if section[0] <= note.start < section[1]
        ]
        for section in time_section
    ]


def convert_bar_notes_to_12d_vector(bars_notes: list) -> list:
    """
    Count, per bar, how often each of the 12 pitch names occurs.

    Vector positions follow the module-level ``pitch_names`` order:
    C, C#, D, D#, E, F, F#, G, G#, A, A#, B

    param bars_notes: list of bars, each a list of pitch names
    return: list of 12-element count lists, one per bar
    """
    vectors = []

    for bar in bars_notes:
        # a slot stays 0 when the pitch never occurs in this bar
        counts = [0] * 12

        for pitch in bar:
            # an unknown name is printed first; the index() lookup below then
            # raises ValueError for it
            if pitch not in pitch_names:
                print(pitch)
            slot = pitch_names.index(pitch)
            counts[slot] += 1

        vectors.append(counts)

    return vectors


def data_preprocess_pipeline(file, bar, index):
    """
    Build the per-bar dataset of one song.

    The MIDI file is split into bars, each bar becomes a 12-d pitch count
    vector, and every bar is labelled with a chord via ``process_row``.

    :param file: path of the song's MIDI file
    :param bar: list of ``[start, end]`` bars of that song
    :param index: 0-based position of the song; ``song_num`` is ``index + 1``
    :return: DataFrame with the columns Melody, Timestamp, song_num and chord
    """

    song_num = index + 1
    melody_vectors = convert_bar_notes_to_12d_vector(
        split_midi_to_measure(file, bar))

    columns = {
        'Melody': melody_vectors,
        'Timestamp': bar,
        'song_num': song_num,
    }
    bar_df = pd.DataFrame(columns)

    # label each bar (row) with its closest chord
    bar_df['chord'] = bar_df.apply(process_row, axis=1, args=(song_num,))

    return bar_df

In [39]:

# Bars of every song (from the beat files) and the matching MIDI file paths.
# Both lists are later indexed with the same position, so they must be in the
# same song order.
all_beat = read_all_beat_file(get_beat_file(relative_path))
all_midi = get_midi_file(relative_path)


def merge_all_df_dataset(all_midi, all_beat):
    """
    Run ``data_preprocess_pipeline`` on every song and merge the results.

    Progress (song counter and percentage) is printed for each song.

    :param all_midi: list of MIDI file paths
    :param all_beat: list of bar lists; ``all_beat[i]`` belongs to ``all_midi[i]``
    :return: concatenation of all per-song DataFrames
             (an empty DataFrame when ``all_midi`` is empty)
    """
    total = len(all_midi)
    song_frames = []

    for i, midi_path in enumerate(all_midi):

        print(f"processing {i+1} song")

        # Print %
        print(f"{round((i+1)/total*100,2)}%")

        song_frames.append(data_preprocess_pipeline(midi_path, all_beat[i], i))

    # Concatenate once: growing a DataFrame with concat inside the loop copies
    # all previously merged rows on every iteration.
    if not song_frames:
        return pd.DataFrame()
    return pd.concat(song_frames)


# Build the full bar-level dataset, save it to CSV and display it.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, i.e.
# the saved run of this cell did not finish.
merge_df = merge_all_df_dataset(all_midi, all_beat)
merge_df.to_csv(r"csv_file/merge_df_dataset.csv", index=True, header=True)
merge_df

KeyboardInterrupt: 

In [63]:
# Read the saved dataset CSV back into a DataFrame.

# Drop rows whose chord is 'N' and rows whose melody vector is all zeros.
# The Melody column is compared as text because the CSV stores each vector as
# its string representation.


df = pd.read_csv(r"csv_file/merge_df_dataset.csv", index_col=0)

df = df[df['chord'] != 'N']
df = df[df["Melody"] != '[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]']



def adree_not_in_top_24(chord):
    """
    Reduce a chord label to a plain ``maj`` / ``min`` quality.

    Steps, applied in this order:
      1. every digit is removed (e.g. ``C:maj7`` -> ``C:maj``);
      2. quality names are rewritten: ``sus`` -> ``maj``, ``dim`` -> ``min``,
         ``minmaj`` -> ``min``, ``hmin`` -> ``min``, ``aug`` -> ``maj``;
      3. a label that ends with ``:`` (quality was digits only, e.g. ``G:7``)
         gets ``maj`` appended.

    :param chord: chord label string
    :return: the simplified chord label; an empty string is returned unchanged
    """

    # strip all digits in one pass
    chord = chord.translate(str.maketrans("", "", "0123456789"))

    # Order matters: "dim" -> "min" runs before "hmin" -> "min", so a
    # half-diminished "hdim" first becomes "hmin" and then "min".
    quality_rewrites = (
        ("sus", "maj"),
        ("dim", "min"),
        ("minmaj", "min"),
        ("hmin", "min"),
        ("aug", "maj"),
    )
    for old, new in quality_rewrites:
        chord = chord.replace(old, new)

    # endswith() is safe on an empty string, unlike chord[-1]
    if chord.endswith(":"):
        chord = chord + "maj"

    return chord

# Normalise every chord label with adree_not_in_top_24.

df['chord'] = df['chord'].apply(adree_not_in_top_24)

# Save the result to CSV.
# NOTE(review): this overwrites the same file that was read at the top of this
# cell, so the unfiltered dataset is no longer available on disk afterwards.
df.to_csv(r"csv_file/merge_df_dataset.csv", index=True, header=True)


