# Import libraries

In [1]:
import json
import os
import random
from time import perf_counter, time

import numpy as np
import pandas as pd
from tqdm import tqdm

from predict import Wav2Vec2Aligner

In [None]:
# Define some necessary variables

# Input data and result locations.
# Modify these variables if you want the model to read and write data to different paths.
DATA_PATH = '../data'
RESULT_PATH = '../result'
SONGS_PATH = os.path.join(DATA_PATH, 'songs')
LYRICS_PATH = os.path.join(DATA_PATH, 'lyrics')
GROUNDTRUTH_LYRICS_PATH = os.path.join(DATA_PATH, 'groundtruth')
TIME_SUBMISSION_PATH = os.path.join(RESULT_PATH, 'time_submission.csv')
JUPYTER_SUBMISSION_PATH = os.path.join(RESULT_PATH, 'jupyter_submission.zip')

# Extension (without the dot) of the per-song lyric files under LYRICS_PATH.
LYRICS_FILE_EXTENSION = 'json'
# Identifier handed to Wav2Vec2Aligner when the model is loaded.
MODEL_PATH = 'not-tanh/wav2vec2-large-xlsr-53-vietnamese'

# Ensure the result directory exists.
# exist_ok=True makes this idempotent and removes the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(RESULT_PATH, exist_ok=True)

# Load model

In [3]:
# Build the aligner from the checkpoint named by MODEL_PATH.
# NOTE(review): cuda=False presumably keeps inference on the CPU — confirm in predict.py.
model = Wav2Vec2Aligner(MODEL_PATH, cuda=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Blank Token id [PAD]/<pad> 106


# Read test cases and run predictions

**Note**: The model in this notebook is evaluated on a random subset of the training set (100 samples, or 10%) provided by the Zalo AI Challenge host, because (1) the training set is large and (2) inference takes a considerable amount of time per sample.

In [4]:
# Read test cases
test_cases = []
num_test_cases = 100
sample_seed = 42  # fixed seed so a re-run evaluates the same subset of songs

# os.listdir returns entries in arbitrary order; sort first so the seeded draw
# picks the same files on every machine and every run.
song_files = sorted(os.listdir(SONGS_PATH))
# random.sample raises ValueError when k exceeds the population size, so cap k
# at the number of available songs.
sampled_songs = random.Random(sample_seed).sample(
    song_files, min(num_test_cases, len(song_files))
)

for song_file in sampled_songs:
    # splitext drops only the final extension, so a dotted name such as
    # 'a.b.wav' keeps its full stem 'a.b' instead of being cut down to 'a'.
    filename = os.path.splitext(song_file)[0]
    lyrics_file = os.path.join(LYRICS_PATH, f'{filename}.{LYRICS_FILE_EXTENSION}')

    # Keep the file open only while parsing it.
    with open(lyrics_file, 'r', encoding='utf-8') as f:
        label = json.load(f)

    # Flatten the sentences into one word list while remembering how many
    # words each sentence contributed.
    lyric = []
    num_words_per_sentence = []
    for sentence in label:
        words = [word['d'] for word in sentence['l']]
        lyric.extend(words)
        num_words_per_sentence.append(len(words))

    test_cases.append(
        {
            "sent": lyric,
            "num_words": num_words_per_sentence,
            "wav_path": os.path.join(SONGS_PATH, song_file),
            "out_path": os.path.join(RESULT_PATH, filename + '.json')
        }
    )

In [5]:
# Run prediction
prediction_times = []
for item in tqdm(test_cases):
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; time() follows the wall clock and can jump if the system
    # clock is adjusted mid-run.
    started = perf_counter()
    model.align_single_sample(item)
    elapsed_seconds = perf_counter() - started

    file_name = os.path.basename(item['wav_path'])
    # Report the duration in whole milliseconds (truncated, as before).
    prediction_time = int(elapsed_seconds * 1000)

    prediction_times.append((file_name, prediction_time))

time_submission_df = pd.DataFrame(data=prediction_times, columns=['fname', 'time'])
    

  0%|          | 0/100 [00:00<?, ?it/s]

INFO:tensorflow:Apply unet for vocals_spectrogram
Instructions for updating:
Colocations handled automatically by placer.
INFO:tensorflow:Apply unet for accompaniment_spectrogram
INFO:tensorflow:Restoring parameters from pretrained_models\2stems\model


100%|██████████| 100/100 [24:42<00:00, 14.83s/it] 


In [6]:
# Show the per-file timings gathered by the prediction loop; 'time' holds whole milliseconds.
time_submission_df

Unnamed: 0,fname,time
0,38313731345f3333.wav,12780
1,38313239375f3635.wav,12236
2,3132313739305f3131.wav,16472
3,38313332385f3539.wav,7469
4,38313136385f313434.wav,4716
...,...,...
95,37333439365f3238.wav,9975
96,37333531325f3233.wav,10252
97,38313338385f313338.wav,6671
98,39323838395f3137.wav,18077


# Evaluate and save data

In [7]:
# Define some helper functions

_ALIGNMENT_COLUMNS = ['d', 's', 'e', 'key', 'sentences']


def _alignment_json_to_df(filepath):
    """Flatten an alignment JSON file into one DataFrame row per word.

    The file is read as a JSON list of sentences; each sentence has an ``'l'``
    list of words, and each word has ``'d'``, ``'s'`` and ``'e'`` entries.

    Args:
        filepath: Path to the UTF-8 encoded JSON file.

    Returns:
        DataFrame with columns ``d``, ``s``, ``e``, ``key`` and ``sentences``.
        ``key`` is the word's ``d`` value followed by its running index across
        the whole file, so repeated words still get distinct keys.
        ``sentences`` is the zero-based index of the containing sentence.
        An empty list yields an empty DataFrame with the same columns.
    """
    # The context manager closes the handle even when json.load raises.
    with open(filepath, encoding="utf8") as f:
        label = json.load(f)

    rows = []
    word_index = 0  # counts words across all sentences, not per sentence
    for sen_index, sen in enumerate(label):
        for word in sen['l']:
            d = word['d']
            rows.append([d, word['s'], word['e'], d + str(word_index), sen_index])
            word_index += 1
    return pd.DataFrame(rows, columns=_ALIGNMENT_COLUMNS)


def predict_to_df(filepath):
    """Load a predicted alignment file as a per-word DataFrame.

    See ``_alignment_json_to_df`` for the expected layout and returned columns.
    """
    return _alignment_json_to_df(filepath)


def label_to_df(filepath):
    """Load a ground-truth alignment file as a per-word DataFrame.

    See ``_alignment_json_to_df`` for the expected layout and returned columns.
    """
    return _alignment_json_to_df(filepath)

In [8]:
def IoU(label_dir, predict_dir, filename):
    """Score one song's predicted word timings against its ground truth.

    Args:
        label_dir: Directory holding the ground-truth ``<filename>.json``.
        predict_dir: Directory holding the predicted ``<filename>.json``.
        filename: File stem shared by both JSON files (no extension).

    Returns:
        Tuple ``(iou_by_sen, final_iou)``: the mean word IoU per ground-truth
        sentence (a Series indexed by sentence number) and the mean of those
        per-sentence values.
    """
    json_name = filename + '.json'
    # Prediction is read first, then the label.
    predicted = predict_to_df(os.path.join(predict_dir, json_name))
    truth = label_to_df(os.path.join(label_dir, json_name))

    # Left join keeps every ground-truth word; after the merge the `_x`
    # columns come from the label and the `_y` columns from the prediction.
    # Words with no matching prediction get 0 in every `_y` column.
    merged = pd.merge(left=truth, right=predicted, on='key', how='left').fillna(0)

    start_gap = np.abs(merged['s_x'] - merged['s_y'])
    end_gap = np.abs(merged['e_x'] - merged['e_y'])

    merged['s_min'] = np.minimum(merged['s_x'], merged['s_y'])
    merged['e_max'] = np.maximum(merged['e_x'], merged['e_y'])
    merged['union'] = merged['e_max'] - merged['s_min']
    # Span minus both boundary gaps is the overlap; it goes negative for
    # disjoint intervals, which count as zero overlap.
    merged['inter'] = (merged['union'] - start_gap - end_gap).clip(lower=0)
    # The +1 in the denominator is the original guard against an error on
    # zero-length unions (presumably division by zero).
    merged['iou'] = np.round(merged['inter'] / (merged['union'] + 1), 2)

    # Average per sentence first, then across sentences.
    iou_by_sen = merged.groupby(['sentences_x'])['iou'].mean()
    return iou_by_sen, iou_by_sen.mean()

In [16]:
# Evaluation against groundtruth lyrics

iou_segments = []
for test_case in test_cases:
    # Take the id from the test case's own out_path, so the prediction file
    # IoU opens in RESULT_PATH is exactly that path. Stripping '.wav' from the
    # song name breaks for other audio extensions and for names that contain
    # '.wav' more than once.
    file_id = os.path.splitext(os.path.basename(test_case['out_path']))[0]
    iou_sentence, iou_segment = IoU(GROUNDTRUTH_LYRICS_PATH, RESULT_PATH, file_id)
    iou_segments.append(iou_segment)

print(f'IoU: {np.mean(iou_segments)}')

IoU: 0.45202038221738716


In [15]:
# Save prediction time
# index=False leaves the DataFrame's row index out of the CSV.
time_submission_df.to_csv(TIME_SUBMISSION_PATH, index=False)