# Multi-layer Search for Embed Optimization

In [1]:
from code_book_embed import *
ms.use('seaborn-muted')
%matplotlib inline
import pickle
import scipy

# Layer 1: Waveform Grid Search

In [None]:
# try every possible combination of pairs of waveforms on man and woman speech samples
def waveform_optimize(waveform_list):
    results = {'conv':{}, 'bit': {}}
    results_ave_per_source = {'conv':{}, 'bit': {}}
    
    #paths_to_source = ["audio_samples/man2_orig.wav", "audio_samples/woman2_orig.wav"]
    base_path = "/audio_samples/Harvard_Sentences/"
    paths_to_source = [os.getcwd() + base_path + filename for filename in os.listdir(os.getcwd() + base_path)]
    
    for p in paths_to_source:
        print "Currently processing: ", p
        for w1 in waveform_list:
            for w2 in waveform_list:
                if w1 == w2:
                    continue
                else:
                    E2 = Embed(p, [w1, w2], [0,1], [0,1,0,1,0])

                    # Fix the truncation and energy values
                    E2.truncate(0.4, idx_list=[0,1])
                    E2.energy(0.3, idx_list=[0])
                    E2.energy(0.3, idx_list=[1])
                    E2.pitch_shift(-15, idx_list=[1])
                    E2.pitch_shift(-15, idx_list=[0])

                    embed2, num_total_digits = E2.get_embedded_audio(plot=False)
                    d_embed2, sr = compress_and_decompress(embed2, "compression_samples/", plot=False)

                    # get the timeseries of the the original waveforms and recover
                    wf = E2.get_data_timeseries()
                    R2 = Recover(d_embed2, wf, [0,1], [0,1,0,1,0], num_total_digits)
                    final_sequence2 = R2.get_bit_sequence(thres=0.85, plot=False)
                    bit_acc = R2.get_recovery_estimate(final_sequence2, conv=False)
                    conv_acc = R2.get_recovery_estimate(final_sequence2, conv=True)
                    
                    # results metrics per speech sample
                    metadata = str(p) + ':' + str(w1) + ':' + str(w2)
                    results['conv'][metadata] = conv_acc
                    results['bit'][metadata] = bit_acc
                    
                    # results metrics average between speech samples
                    metadata = str(w1) + ':' + str(w2)
                    try:
                        results_ave_per_source['conv'][metadata] += (float(conv_acc) / len(paths_to_source))
                        results_ave_per_source['bit'][metadata] += (float(bit_acc) / len(paths_to_source))
                    except KeyError:
                        results_ave_per_source['conv'][metadata] = (float(conv_acc) / len(paths_to_source))
                        results_ave_per_source['bit'][metadata] = (float(bit_acc) / len(paths_to_source))
                    
    return results, results_ave_per_source
    

In [None]:
waveform_list = ["speech_samples/pronunciation_en_zero2.mp3", "speech_samples/pronunciation_en_one.mp3", 
                 "speech_samples/pronunciation_en_five.mp3", "speech_samples/pronunciation_en_seven.mp3",
                 "speech_samples/pronunciation_en_nine.mp3"]

results_dict, results_ave_dict = waveform_optimize(waveform_list)

In [None]:
pickle.dump( results_dict, open( "results_waveform.pkl", "wb" ))
pickle.dump( results_ave_dict, open( "results_ave_waveform.pkl", "wb" ))

In [None]:
print "Accuracy per source"
sorted_res = sorted([(key, value) for key, value in results_dict.iteritems()], key = lambda x: x[1])
for ele in sorted_res[-5:]:
    print ele, "\n"
    
print "Average accuracy across sources"
sorted_res_ave = sorted([(key, value) for key, value in results_ave_dict.iteritems()], key = lambda x: x[1])
for ele in sorted_res_ave[-5:]:
    print ele, "\n"

# Layer 2: Length, Pitch, Energy - Simplex Optimization

In [2]:
# fixed waveform - for now
def system_accuracy(p, W, pf0, pf1, ef0, ef1, lf):    
    W = [w1, w2]
    E2 = Embed(p, [w1, w2], [0,1], [0,1,0,1,0])

    # Fix the truncation and energy values
    E2.truncate(lf, idx_list=[0,1])
    E2.energy(ef0, idx_list=[0])
    E2.energy(ef1, idx_list=[1])
    E2.pitch_shift(pf0, idx_list=[0])
    E2.pitch_shift(pf1, idx_list=[1])

    embed2, num_total_digits = E2.get_embedded_audio(plot=False)
    d_embed2, sr = compress_and_decompress(embed2, "compression_samples/", plot=False)

    # get the timeseries of the the original waveforms and recover
    wf = E2.get_data_timeseries()
    R2 = Recover(d_embed2, wf, [0,1], [0,1,0,1,0], num_total_digits)
    final_sequence2 = R2.get_bit_sequence(thres=0.85, plot=False)
    acc = R2.get_recovery_estimate(final_sequence2, dump=False, conv=False)
    
    return acc

# make p and [w1, w2] global variables
def objective(input):
    [pf0, pf1, ef0, ef1, lf] = input
    # USING GLOBAL VALUES
    #p_weight = 0.3
    #e_weight = 0.5
    #l_weight = 0.1
    #sys_weight = 0.1
    # negative because we are trying to maximize
    pf0_step = pf0 * -15.0  # multiply by lower bound to feed in as step
    pf1_step = pf1 * -15.0 

    #f = system_accuracy(p, [w1, w2], pf0_step, pf1_step, ef0, ef1, lf) - p_weight*(pf0 + pf1) - e_weight*(ef0 + ef1) - l_weight*lf
    
    if not(pf0 > 0 and pf0 < 1.0) or not(pf1 > 0 and pf1 < 1.0) or not(ef0 > 0.1 and ef0 < 0.5) or    not(ef1 > 0.1 and ef1 < 0.5) or not(lf > 0.1 and lf < 1.0):
        #print "------Out of Bounds------"
        #print "pf0", pf0
        #print "pf1", pf1
        #print "pf0 step", pf0_step
        #print "pf1 step", pf1_step
        #print "ef0", ef0
        #print "ef1", ef1
        #print "lf", lf
        #print "-------------------------"
        a_really_high_number = 10000000
        return a_really_high_number
    
    else:
        f1 = sys_weight*(system_accuracy(p, [w1, w2], pf0_step, pf1_step, ef0, ef1, lf))
        f2 = p_weight*(pf0 + pf1)
        f3 = e_weight*(ef0 + ef1)
        f4 = l_weight*(lf)
        f = f1 + f2 - f3 - f4
        
        #print "----Correct----"
        #print "f: ", -1.0 * f
        #print "data accuracy: ", f1
        #print "pitch weight: ", f2
        #print "energy weight: ", f3
        #print "pf0 step", pf0_step
        #print "pf1 step", pf1_step
        #print "---------------"
        return -1.0 * f


In [3]:
#lower and upper bound for variables pitch factor, energy factor, length factor
bounds=[ [0,1.0], [0,1.0],[0.1,0.5], [0.1,0.5], [0.1,1.0]]

#construct the bounds in the form of constraints
cons = []
for factor in range(len(bounds)):
    lower, upper = bounds[factor]
    l = {'type': 'ineq',
         'fun': lambda x, lb=lower, i=factor: x[i] - lb}
    u = {'type': 'ineq',
         'fun': lambda x, ub=upper, i=factor: ub - x[i]}
    cons.append(l)
    cons.append(u)
    
print "constraints: ", cons
    
initial_val = [0.3, 0.3, 0.3, 0.3, 0.3]

# sample cover speech and code book waveforms
#p = "audio_samples/Harvard_Sentences/OSR_us_000_0032_8k.wav"
p_list = ["audio_samples/woman2_orig.wav", "audio_samples/man2_orig.wav"]
w1 = "speech_samples/pronunciation_en_zero2.mp3"
w2 = "speech_samples/pronunciation_en_one.mp3"

params_dict = {}
# acc, pitch, energyl length
weights_list = [[0.7,0.1, 0.1, 0.1], [0.5,0.1, 0.3, 0.1], [0.3,0.2, 0.4, 0.1], [0.3,0.3, 0.3, 0.1], [0.1,0.4, 0.1, 0.4], [0.9,0.0, 0.1, 0.0]]
#weights_list = [[0.9, 0.0, 0.1, 0.0]]

for i, p in enumerate(p_list):
    for sys_weight, p_weight, e_weight, l_weight in weights_list:
    
        #opt = scipy.optimize.minimize(objective, initial_val, constraints=cons, tol=None, method="COBYLA", options={'disp': True, 'rhobeg': 0.1})
        opt = scipy.optimize.minimize(objective, initial_val, constraints=cons, tol=None, method="Powell", options={'disp': True, 'xatol': 0.05, 'fatol': 0.1})
        params_dict[(sys_weight, p_weight, e_weight, l_weight)] = opt
        print opt
    
    pickle.dump( params_dict, open( "powell_results_params_" + str(i) + ".pkl", "wb" ))

constraints:  [{'fun': <function <lambda> at 0x7fc9a1f94578>, 'type': 'ineq'}, {'fun': <function <lambda> at 0x7fc9a1f942a8>, 'type': 'ineq'}, {'fun': <function <lambda> at 0x7fc9a1f947d0>, 'type': 'ineq'}, {'fun': <function <lambda> at 0x7fc9a1f94758>, 'type': 'ineq'}, {'fun': <function <lambda> at 0x7fc9a1f94848>, 'type': 'ineq'}, {'fun': <function <lambda> at 0x7fc9a1f948c0>, 'type': 'ineq'}, {'fun': <function <lambda> at 0x7fc9a1f94938>, 'type': 'ineq'}, {'fun': <function <lambda> at 0x7fc9a1f949b0>, 'type': 'ineq'}, {'fun': <function <lambda> at 0x7fc9a1f94a28>, 'type': 'ineq'}, {'fun': <function <lambda> at 0x7fc9a1f94aa0>, 'type': 'ineq'}]




Optimization terminated successfully.
         Current function value: -0.778743
         Iterations: 6
         Function evaluations: 886
   direc: array([[ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.]])
     fun: -0.77874331237473837
 message: 'Optimization terminated successfully.'
    nfev: 886
     nit: 6
  status: 0
 success: True
       x: array([ 0.99981352,  0.74077554,  0.11850119,  0.2748595 ,  0.55979525])
Optimization terminated successfully.
         Current function value: -0.371660
         Iterations: 4
         Function evaluations: 911
   direc: array([[ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.]])
     fun: -0.3716595998782587
 message: 'Optimization terminated successfully.'
    nfev: 911
     nit: 4
  status: 0
 success: True
       x: 

# Check accuracy on dataset with optimized parameters!

In [16]:
def dataset_test(waveform_list):
    results = {'conv':{}, 'bit': {}}
    
    # male - 1, female - 0
    results_params = pickle.load(open("powell_results_params_1.pkl", "rb"))
    
    
    paths_to_source = ["audio_samples/man2_orig.wav"]#, "audio_samples/man2_orig.wav"]
    #base_path = "/audio_samples/Harvard_Sentences_Female/"
    #paths_to_source = [os.getcwd() + base_path + filename for filename in os.listdir(os.getcwd() + base_path)]
    
    w1 = waveform_list[0]
    w2 = waveform_list[1]
    

    for p in paths_to_source:
        print "Currently processing: ", p
        
        for ws in results_params.keys():
            [p0, p1, e0, e1, l] = results_params[ws]['x']
            print p0, p1, e0, e1, l

            E2 = Embed(p, [w1, w2], [0,1], [0,1,0,1,0])

            # Fix the truncation and energy values
            E2.truncate(l, idx_list=[0,1])
            E2.energy(e0, idx_list=[0])
            E2.energy(e1, idx_list=[1])
            E2.pitch_shift(p0 * -15.0, idx_list=[0])
            E2.pitch_shift(p1 * -15.0, idx_list=[1])

            embed2, num_total_digits = E2.get_embedded_audio(plot=False)
            d_embed2, sr = compress_and_decompress(embed2, "compression_samples/", plot=False)

            # get the timeseries of the the original waveforms and recover
            wf = E2.get_data_timeseries()
            R2 = Recover(d_embed2, wf, [0,1], [0,1,0,1,0], num_total_digits)
            final_sequence2 = R2.get_bit_sequence(thres=0.85, plot=False)
            bit_acc = R2.get_recovery_estimate(final_sequence2, conv=False)
            conv_acc = R2.get_recovery_estimate(final_sequence2, conv=True)

            # results metrics per speech sample
            metadata = str(p)
            try:
                results['conv'][ws] += conv_acc / float(len(paths_to_source))
                results['bit'][ws] += bit_acc / float(len(paths_to_source))
            except:
                results['conv'][ws] = conv_acc / float(len(paths_to_source))
                results['bit'][ws] = bit_acc / float(len(paths_to_source))
                
                print ws, bit_acc
            
                    
    return results

In [17]:
results = dataset_test(["speech_samples/pronunciation_en_zero2.mp3", "speech_samples/pronunciation_en_one.mp3"])
#pickle.dump( results, open( "accuracy_test_female_powell_hvd.pkl", "wb" ))

Currently processing:  audio_samples/man2_orig.wav
0.8633943806 0.980249918027 0.119660113008 0.102438185375 0.299667874398
(0.3, 0.2, 0.4, 0.1) 0.736842105263
0.999742877221 0.30676300709 0.147685489557 0.15976302806 0.791666666679
(0.7, 0.1, 0.1, 0.1) 1.0
0.953008014149 0.935913547622 0.114335324624 0.181963481569 0.209791666674
(0.3, 0.3, 0.3, 0.1) 0.685185185185
0.987193785974 0.999000752599 0.135797954903 0.145951755354 0.104930555558
(0.1, 0.4, 0.1, 0.4) 0.642201834862
0.0543956962298 0.145038095087 0.1097668447 0.178089935057 0.915444725528
(0.9, 0.0, 0.1, 0.0) 1.0
0.963480103516 0.211761391271 0.115687126386 0.218707628659 0.490942028989
(0.5, 0.1, 0.3, 0.1) 0.95652173913
