In [135]:
import numpy as np
import os
import librosa
from parselmouth.praat import call
import parselmouth
import math
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def read_audio(path):
    '''
    Load one audio file and hand back only its samples.

    Input:

    path- location of the audio file, forwarded unchanged to librosa.load

    Return: The sample array produced by librosa.load(path, sr=None).
    The sampling rate that librosa returns alongside it is discarded.
    '''
    samples, _ = librosa.load(path, sr=None)
    return samples

In [3]:
def load_metadata(path_csv):
    '''
    Read the metadata CSV and expose its rows as a plain array.

    Input:

    path_csv- path of the CSV file

    Return: The DataFrame's .values, i.e. one row per CSV record and one
    column per CSV field (the header line is consumed by pandas).
    '''
    return pd.read_csv(path_csv).values

In [4]:
def extract_pitch_from_array(audio_arr, time_step, sr=44100):
    '''
    Run Praat's pitch tracker (through parselmouth) on raw audio samples.

    Input:

    audio_arr- array of raw audio
    time_step- time step to extract the pitch
    sr- sampling rate of audio_arr in Hz. Defaults to 44100, the value this
        function previously hard-coded, so existing two-argument calls behave
        exactly as before. Pass the real rate when the audio was not recorded
        at 44.1 kHz, otherwise every pitch value is scaled by the wrong factor.

    Return: An array of pitch values (the 'frequency' field of the pitch
    object's selected_array), one value per analysis frame
    '''
    audio = parselmouth.Sound(audio_arr, sr)
    pitch_obj = audio.to_pitch(time_step)
    return pitch_obj.selected_array['frequency']

In [5]:
def extract_teacher_pitch(t_audio, t_scale, t_bpm, s_scale, s_bpm, time_step):
    '''
    Extract the teacher's pitch track after matching the teacher recording to
    the student's tempo and scale.

    Input:

    t_audio- teacher audio. It is handed to parselmouth.Sound (directly or via
             change_tempo / change_scale_from_array); the caller in this file
             passes the raw sample array.
    t_scale- Scale of teacher (must be an entry of Notes when it differs from s_scale)
    s_scale- Scale of student
    t_bpm- BPM of teacher
    s_bpm- BPM of student
    time_step- time step of pitch value


    Return: Array of pitch values of the teacher audio, re-timed and/or
    transposed when the teacher and student settings differ
    '''
    sr = 44100  # rate assumed for the untouched input; replaced by what a transform reports
    audio = t_audio
    transformed = False

    # Tempo is adjusted before scale, the same order the original code used
    # when both settings differ.
    if t_bpm != s_bpm:
        audio, sr = change_tempo(audio, t_bpm, s_bpm)
        transformed = True
    if t_scale != s_scale:
        audio, sr = change_scale_from_array(audio, sr, t_scale, s_scale)
        transformed = True

    if not transformed:
        return extract_pitch_from_array(audio, time_step)

    # The transforms return Sound.values, which is indexed with [0] to get a
    # single row of samples (presumably the first channel - confirm for stereo input).
    # The pitch extractor is called with exactly (samples, time_step): the former
    # call also squeezed the sampling rate in as a positional argument, which
    # collided with the time_step keyword and raised TypeError.
    return extract_pitch_from_array(audio[0], time_step)

In [6]:
def change_tempo(audio_file_path, actual_bpm, desired_bpm):
    '''
    Re-time a recording so that material recorded at actual_bpm plays at
    desired_bpm, using Praat's overlap-add resynthesis.

    Input:

    audio_file_path- whatever parselmouth.Sound accepts for the recording
                     (the parameter name suggests a wav path)
    actual_bpm- Actual BPM in which the audio file is recorded
    desired_bpm- Desired BPM in which the audio file needs to be changed

    Return: The values of the re-timed sound and its sampling frequency
    '''
    # Duration scale: both BPM values are truncated to int before dividing.
    stretch_ratio = int(actual_bpm) / int(desired_bpm)

    source = parselmouth.Sound(audio_file_path)
    manip = call(source, "To Manipulation", 0.01, 75, 600)

    # A single duration point at t=0 applies the ratio to the whole sound.
    tier = call(manip, "Extract duration tier")
    call(tier, "Add point", 0, stretch_ratio)
    call([tier, manip], "Replace duration tier")

    stretched = call(manip, "Get resynthesis (overlap-add)")
    return stretched.values, stretched.sampling_frequency

In [7]:
def change_scale_from_array(y, sr, actual_scale, desired_scale):
    '''
    Transpose raw audio from one scale to another by multiplying its pitch
    tier with the frequency ratio between the two scale notes.

    Input:

    y- Array of raw audio
    sr- Sampling rate of raw audio
    actual_scale- Actual scale of the raw audio (an entry of Notes)
    desired_scale- Desired scale in which audio needs to be changed (an entry of Notes)

    Return: The values of the transposed sound and its sampling frequency
    '''
    ratio = get_scale_factor(Notes, actual_scale, desired_scale)

    source = parselmouth.Sound(y, sr)
    manip = call(source, "To Manipulation", 0.01, 75, 600)

    # Scale every pitch point over the full duration of the sound.
    tier = call(manip, "Extract pitch tier")
    call(tier, "Multiply frequencies", source.xmin, source.xmax, ratio)
    call([tier, manip], "Replace pitch tier")

    transposed = call(manip, "Get resynthesis (overlap-add)")
    return transposed.values, transposed.sampling_frequency

In [8]:
# Chromatic note names from C0 up to B8 (12 pitch classes x 9 octaves) in
# ascending order, so the index distance between two entries is their
# distance in semitones.
Notes = [pitch_class + str(octave)
         for octave in range(9)
         for pitch_class in ('C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B')]
def get_scale_factor(Notes, actual_scale, desired_scale):
    '''
    Frequency ratio that moves a recording from one scale note to another.

    Input:

    Notes- List of notes available, ordered in semitone steps
    actual_scale- Actual scale of the raw audio
    desired_scale- Desired scale in which audio needs to be changed

    Return: 2 raised to (semitone distance / 12); above 1 when the desired
    note comes later in Notes, below 1 when it comes earlier.
    list.index raises ValueError if either note is missing from Notes.
    '''
    start, target = (Notes.index(name) for name in (actual_scale, desired_scale))
    return 2**((target - start)*(1/12))


In [9]:
def find_pitch_array(t_audio, s_audio, t_scale, t_bpm, s_scale, s_bpm, time_step):
    '''
    Extract the pitch tracks of a teacher/student pair.

    The teacher goes through extract_teacher_pitch (which may adapt tempo and
    scale to the student's); the student audio is analysed as it is.

    Return: (teacher pitch array, student pitch array)
    '''
    teacher_pitch = extract_teacher_pitch(t_audio, t_scale, t_bpm, s_scale, s_bpm, time_step)
    student_pitch = extract_pitch_from_array(s_audio, time_step)
    return teacher_pitch, student_pitch

In [10]:
def convert_to_log_for_comparing(arr, tonic):
    '''
    Map pitch values onto a log2 scale.

    Input:

    arr- list of pitch values
    tonic- Tonic of the singer

    Return: A list with one entry per input value: 0 where the pitch is 0,
    otherwise log2(pitch / d) where d is the fractional part of log2(tonic).
    '''
    # NOTE(review): the divisor is the fractional part of log2(tonic), not the
    # tonic itself, and it becomes 0 when the tonic is an exact power of two.
    # This may not be the intended normalisation - confirm before changing,
    # since downstream features depend on the current values.
    divisor = math.modf(np.log2(tonic))[0]
    return [0 if value == 0 else np.log2(value/divisor) for value in arr]

In [11]:
def seperate_mantissa(arr):
    '''
    Keep only the fractional part of every value.

    Input:

    arr- list of log values

    Return: A list holding, for each input value, the fractional part
    reported by math.modf (it carries the sign of the input); the integer
    part is dropped.
    '''
    return [math.modf(value)[0] for value in arr]

In [12]:
def extract_frequency_features(metadata,teacher_audio_files,student_audio_files):
    '''
    Build the pitch-based feature sequence for every teacher/student pair.

    Input:

    metadata- rows describing each pair; the columns read here are
              [2] student BPM, [3] teacher BPM, [4] student scale (also used
              as the tonic note) and [5] teacher scale
    teacher_audio_files- teacher sample arrays, aligned with metadata rows
    student_audio_files- student sample arrays, aligned with metadata rows

    Return: Two object arrays (teacher, student) with one processed pitch
    sequence per metadata row
    '''
    time_step = 0.01
    teacher_tracks, student_tracks = [], []
    for idx, row in enumerate(metadata):
        tonic = librosa.note_to_hz(row[4])
        raw_teacher, raw_student = find_pitch_array(
            teacher_audio_files[idx], student_audio_files[idx],
            row[5], row[3], row[4], row[2], time_step)
        # Same post-processing for both voices: log scale, then fractional part.
        for raw_pitch, bucket in ((raw_teacher, teacher_tracks), (raw_student, student_tracks)):
            folded = seperate_mantissa(convert_to_log_for_comparing(raw_pitch, tonic))
            bucket.append(np.array(folded, dtype="object"))
    return np.array(teacher_tracks,dtype="object"),np.array(student_tracks,dtype="object")

In [13]:
def padding(t_arr, s_arr):
    '''
    Zero-pad the shorter of two sequences at its end.

    Input:

    t_arr- list of teacher pitch values
    s_arr- list of student pitch values


    Return: (teacher, student) of equal length; the shorter one is returned
    as a new array with trailing zeros, the other one is returned untouched
    '''
    shortfall = len(s_arr) - len(t_arr)
    if shortfall > 0:
        t_arr = np.append(t_arr, np.zeros(shortfall))
    elif shortfall < 0:
        s_arr = np.append(s_arr, np.zeros(-shortfall))
    return t_arr, s_arr

In [14]:
def pad_audio_files(t_audio,s_audio):
    '''
    Load a teacher and a student recording and make them equally long.

    Input:

    t_audio- path of the teacher recording
    s_audio- path of the student recording

    Return: (teacher samples, student samples); the shorter signal is
    extended with trailing zeros. Both files are loaded with sr=None and
    the sampling rates returned by librosa are discarded.
    '''
    teacher_audio, _ = librosa.load(t_audio, sr=None)
    student_audio, _ = librosa.load(s_audio, sr=None)

    n_teacher, n_student = len(teacher_audio), len(student_audio)
    silence = np.zeros((abs(n_teacher - n_student),))

    if n_teacher < n_student:
        return np.append(teacher_audio, silence), student_audio
    if n_student < n_teacher:
        return teacher_audio, np.append(student_audio, silence)
    return teacher_audio, student_audio

In [15]:
def load_audio_files(s_root,t_root,metadata):
    '''
    Load every teacher/student recording pair listed in the metadata.

    Input:

    s_root- prefix prepended to the student file name (metadata column 0)
    t_root- prefix prepended to the teacher file name (metadata column 1)
    metadata- rows describing the pairs

    Return: (list of teacher sample arrays, list of student sample arrays),
    each pair already length-matched by pad_audio_files
    '''
    pairs = [pad_audio_files(t_root+row[1], s_root+row[0]) for row in metadata]
    teacher_audio_files = [teacher for teacher, _ in pairs]
    student_audio_files = [student for _, student in pairs]
    return teacher_audio_files, student_audio_files

In [16]:
def replaceZeroes(data):
    '''
    Replace exact zeros by a tiny positive value so a logarithm can be taken.

    Input:

    data- sequence of numbers

    Return: A new object-dtype array in which every 0 is replaced by the
    smallest non-zero entry times 1e-8. The input itself is not modified.
    np.min raises ValueError when there is no non-zero entry at all.
    '''
    values = np.array(data, dtype=object)
    smallest = np.min(values[np.nonzero(values)])
    values[values == 0] = smallest*0.00000001
    return values

In [17]:
def compute_log_energy(hop_length, frame_length, x):
    '''
    Natural-log energy of a signal, frame by frame.

    Input:

    hop_length- number of samples between the starts of consecutive frames
    frame_length- number of samples per frame (the last frames may be shorter)
    x- array of audio samples

    Return: Array with log(sum of squared samples) for each frame; frames
    whose energy is exactly 0 are floored by replaceZeroes before the log
    '''
    frame_energy = []
    for start in range(0, len(x), hop_length):
        frame = x[start:start+frame_length]
        frame_energy.append(sum(abs(frame**2)))
    floored = replaceZeroes(frame_energy)
    return np.array(np.log(floored.astype(float)))

In [18]:
def extract_feature_2(hop_length, frame_length,data):
    '''
    Compute the frame-wise log energy of every signal in data.

    Input:

    hop_length- hop size in samples, forwarded to compute_log_energy
    frame_length- frame size in samples, forwarded to compute_log_energy
    data- iterable of sample arrays

    Return: Object array holding one log-energy sequence per signal
    '''
    return np.array(
        [compute_log_energy(hop_length, frame_length, signal) for signal in data],
        dtype=object)

In [19]:
def read_text_file(filename):
    '''
    Read a text file into a list of lines.

    Input:

    filename- path of the text file

    Return: One string per line, with trailing whitespace (including the
    newline) stripped
    '''
    with open(filename) as handle:
        return [raw_line.rstrip() for raw_line in handle]

In [20]:
def process_raw_text(lst):
    '''
    Split label lines on tabs and keep only the A / F entries.

    Input:

    lst- list of raw text lines

    Return: The tab-separated fields of every line whose last field is
    exactly "A" or "F"; all other lines are dropped
    '''
    fields_per_line = (line.split('\t') for line in lst)
    return [fields for fields in fields_per_line if fields[-1] in ("A", "F")]

In [21]:
def convert_time_to_index(labels, frames_per_second=100):
    '''
    Convert label boundaries from seconds to frame indices.

    Input:

    labels- one list per recording; each entry is [start, end, kind] with
            start/end given in seconds (strings or numbers)
    frames_per_second- number of feature frames per second. The default of
                       100 is the factor this function always used.

    Return: Same nesting as labels, with start/end replaced by integer frame
    indices (rounded up) and kind passed through unchanged
    '''
    def to_frame(seconds):
        # Binary floating point turns e.g. 1.1 * 100 into 110.00000000000001,
        # which a bare ceil() would push to frame 111. Rounding to 6 decimals
        # first removes that noise while still rounding real fractions up.
        return math.ceil(round(float(seconds) * frames_per_second, 6))

    return [
        [[to_frame(segment[0]), to_frame(segment[1]), segment[2]] for segment in recording]
        for recording in labels
    ]

In [22]:
def create_framewise_labels(idx,audio_length):
    '''
    Expand [start, end, kind] segments into two per-frame 0/1 tracks.

    Input:

    idx- list of segments; element 0 is the first frame, element 1 the frame
         after the last one, and the last element is the mistake kind
    audio_length- number of frames in the tracks

    Return: (mistake_F track, mistake_A track) as float arrays of length
    audio_length holding 1 inside the segments of that kind and 0 elsewhere.
    Segments of any other kind are ignored; a slice that runs past
    audio_length is clipped by numpy.
    '''
    # Both tracks always exist, so recordings that contain only one kind of
    # mistake (or none) are handled the same way as recordings with both.
    audio_files = {
        'mistake_F': np.zeros((audio_length,)),
        'mistake_A': np.zeros((audio_length,)),
    }
    for segment in idx:
        track = audio_files.get(f'mistake_{segment[-1]}')
        if track is not None:
            track[segment[0]:segment[1]] = 1
    return audio_files['mistake_F'],audio_files['mistake_A']

In [23]:
def create_framewise_labels_2(l,audio_length):
    '''
    Expand [start, end, kind] segments into two per-frame 0/1 lists.

    Input:

    l- list of segments: [first frame, frame after the last one, kind]
    audio_length- number of frames in each output list

    Return: (frequency labels, amplitude labels): lists of length
    audio_length with 1 inside 'F' segments / 'A' segments respectively.
    Segment ends beyond audio_length are clipped; other kinds are ignored.
    '''
    frequency_track = [0]*audio_length
    amplitude_track = [0]*audio_length
    track_for_kind = {'F': frequency_track, 'A': amplitude_track}

    for segment in l:
        track = track_for_kind.get(segment[2])
        if track is None:
            continue
        stop = min(segment[1], audio_length)
        for frame in range(segment[0], stop):
            track[frame] = 1
    return frequency_track, amplitude_track

In [24]:
def convert_labels_framewise(labels,teacher_feature_1):
    '''
    Build per-frame label lists for every recording.

    Input:

    labels- one list of [start, end, kind] segments per recording
    teacher_feature_1- per-recording feature sequences; only their lengths
                       are used, as the number of frames of each label list

    Return: (frequency labels, amplitude labels), each a list with one
    per-frame label list per recording
    '''
    per_recording = [
        create_framewise_labels_2(labels[k], len(teacher_feature_1[k]))
        for k in range(len(labels))
    ]
    frequency_labels = [freq for freq, _ in per_recording]
    amplitude_labels = [amp for _, amp in per_recording]
    return frequency_labels, amplitude_labels

In [174]:
def pad_features_and_labels(teacher_features_1,student_features_1,teacher_features_2,student_features_2,frequency_labels,amplitude_labels):
    '''
    Bring the two feature families of one recording to a common length.

    The lengths of teacher_features_1 and teacher_features_2 are compared:
    - feature 1 shorter: both feature-1 sequences and both label sequences
      get trailing zeros;
    - feature 2 shorter: both feature-2 sequences get trailing zeros;
    - equal: nothing changes.
    The same number of zeros is appended to every padded sequence; the
    lengths of the student sequences and labels are not checked.

    Return: The six inputs in the original order, padded where needed
    '''
    n_first = len(teacher_features_1)
    n_second = len(teacher_features_2)
    filler = np.zeros((abs(n_first - n_second),))

    if n_first < n_second:
        teacher_features_1, student_features_1, frequency_labels, amplitude_labels = [
            np.append(sequence, filler)
            for sequence in (teacher_features_1, student_features_1, frequency_labels, amplitude_labels)
        ]
    elif n_second < n_first:
        teacher_features_2, student_features_2 = [
            np.append(sequence, filler)
            for sequence in (teacher_features_2, student_features_2)
        ]
    return (teacher_features_1, student_features_1,
            teacher_features_2, student_features_2,
            frequency_labels, amplitude_labels)
    

In [160]:
def padd_all(teacher_features_1,student_features_1,teacher_features_2,student_features_2,frequency_labels,amplitude_labels):
    '''
    Apply pad_features_and_labels to every recording.

    All six inputs are indexed per recording; the number of recordings is
    taken from teacher_features_1.

    Return: Six lists (teacher 1, student 1, teacher 2, student 2, frequency
    labels, amplitude labels) with the padded sequences of each recording
    '''
    padded = ([], [], [], [], [], [])
    for k in range(len(teacher_features_1)):
        result = pad_features_and_labels(
            teacher_features_1[k], student_features_1[k],
            teacher_features_2[k], student_features_2[k],
            frequency_labels[k], amplitude_labels[k])
        for bucket, sequence in zip(padded, result):
            bucket.append(sequence)
    return padded

In [161]:
def check_nan(arr):
    '''
    Debug helper: print "Hello" once for every NaN found in a nested sequence.

    Input:

    arr- sequence of sequences of numbers

    Return: None
    '''
    for sequence in arr:
        for value in sequence:
            if np.isnan(value):
                print("Hello")

In [186]:
def read_labels(metadata,root):
    '''
    Read the label file of every recording listed in the metadata.

    Input:

    metadata- rows whose column 0 is the student file name
    root- directory prefix that contains the "labels_2/" folder

    Return: One list per recording with the A / F label entries kept by
    process_raw_text. The number of label files that contained no lines at
    all is printed.
    '''
    labels = []
    empty_files = 0
    for row in metadata:
        raw_lines = read_text_file(root+"labels_2/"+row[0]+".txt")
        labels.append(process_raw_text(raw_lines))
        if not raw_lines:
            empty_files += 1
    print(empty_files)
    return labels

In [187]:
# Identifier of the teacher whose data set is processed; it is spliced into
# every "Data/<teacher_id>/..." path built in the cells that follow.
teacher_id="002"

In [188]:
# Locations inside the per-teacher data directory.
root = f"Data/{teacher_id}/"
path_csv = f"Data/{teacher_id}/metadata.csv"
s_root = f"Data/{teacher_id}/student/"
t_root = f"Data/{teacher_id}/teacher/"

In [189]:
# Rows of the metadata CSV as a plain array (one row per teacher/student pair).
metadata=load_metadata(path_csv)

In [190]:
# Load every teacher/student recording pair; within a pair the shorter signal
# is zero-padded so both have the same number of samples.
teacher_audio_files,student_audio_files=load_audio_files(s_root,t_root,metadata)

In [191]:
# Feature 1: pitch-based sequences (log scale, fractional part) per recording.
teacher_feature_1,student_feature_1=extract_frequency_features(metadata,teacher_audio_files,student_audio_files)

In [192]:
# Feature 2: frame-wise log energy of the raw samples.
# hop_length is the step between frame starts and frame_length the frame size,
# both in samples (see compute_log_energy).
# NOTE(review): 441 samples is 0.01 s at the 44100 Hz assumed by the pitch
# extractor; the hop of 442 is one sample longer than the frame - confirm this
# is intentional, since it makes the energy frames drift against the pitch frames.
hop_length=442
frame_length=441
teacher_feature_2=extract_feature_2(hop_length, frame_length,teacher_audio_files)
student_feature_2=extract_feature_2(hop_length, frame_length,student_audio_files)

In [193]:
# Read the A / F label segments of every recording (prints the number of empty
# label files), convert their times from seconds to frame indices, then expand
# them into per-frame 0/1 lists as long as the teacher's feature-1 sequence.
labels=read_labels(metadata,root)
labels=convert_time_to_index(labels)
frequency_labels,amplitude_labels=convert_labels_framewise(labels,teacher_feature_1)

0


In [194]:
# Per recording, zero-pad whichever feature family is shorter (the labels are
# padded together with feature 1).
teacher_features_1_padded,student_features_1_padded,teacher_features_2_padded,student_features_2_padded,frequency_labels_padded,amplitude_labels_padded=padd_all(teacher_feature_1,student_feature_1,teacher_feature_2,student_feature_2,frequency_labels,amplitude_labels)

In [202]:
# Persist the padded features (t_* teacher, s_* student) and labels (c_*) as
# object arrays; *_1_ files belong to the pitch feature / frequency labels,
# *_2_ files to the energy feature / amplitude labels.
features_dir="Data/"+teacher_id+"/features/"
# np.save does not create missing directories, so make sure the target exists
# instead of failing only after all features have been computed.
os.makedirs(features_dir,exist_ok=True)
np.save(features_dir+"t_1_"+teacher_id+".npy",np.array(teacher_features_1_padded,dtype=object))
np.save(features_dir+"s_1_"+teacher_id+".npy",np.array(student_features_1_padded,dtype=object))
np.save(features_dir+"c_1_"+teacher_id+".npy",np.array(frequency_labels_padded,dtype=object))
np.save(features_dir+"t_2_"+teacher_id+".npy",np.array(teacher_features_2_padded,dtype=object))
np.save(features_dir+"s_2_"+teacher_id+".npy",np.array(student_features_2_padded,dtype=object))
np.save(features_dir+"c_2_"+teacher_id+".npy",np.array(amplitude_labels_padded,dtype=object))