In [2]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

# Taken from the EDA notebook
from os import listdir
from os.path import isfile, join
def get_filenames(mypath):
    """Return the names of the regular files located directly inside `mypath`.

    Only bare entry names are returned (not joined with `mypath`), in the
    order `listdir` yields them. Sub-directories are skipped.
    """
    names = []
    for entry in listdir(mypath):
        # `isfile` needs the full path; the bare name alone would be resolved
        # against the current working directory.
        if isfile(join(mypath, entry)):
            names.append(entry)
    return names

In [None]:
# Build one row per 0.96 second embedding frame of every balanced-training clip.
train_dir = "../data/audioset_v1_embeddings/bal_train/"
filenames = [train_dir + name for name in get_filenames(train_dir)]
train_dataset = tf.data.TFRecordDataset(filenames)

# Make Pandas DataFrame out of the training set records.
col = ['video_id', 'time_stamp'] + [str(k) for k in range(0,128)]

# Rows are collected in a plain list and converted once at the end; stacking a
# NumPy array inside the loop copies the whole array on every iteration and
# coerces every value to a string.
rows = []
for raw_record in train_dataset:
    example = tf.train.SequenceExample()
    example.ParseFromString(raw_record.numpy())
    # The context values are the same for every frame of a record, so read them once.
    vID = example.context.feature['video_id'].bytes_list.value[0].decode('utf-8')
    clip_start = example.context.feature['start_time_seconds'].float_list.value[0]
    frames = example.feature_lists.feature_list['audio_embedding'].feature
    for i, frame in enumerate(frames):
        # Use frame i's own embedding. Indexing with [0] here would give every
        # time step of the clip the first frame's embedding.
        # Iterating over the bytes object yields one integer per byte.
        embedding = list(frame.bytes_list.value[0])
        rows.append([vID, clip_start + 0.96*i] + embedding)

print('Done making the array. Converting to Pandas DF and saving.')

trainFeatures = pd.DataFrame(rows, columns = col)

trainFeatures.to_csv('trainFeatures.csv')
trainFeatures.head()

In [None]:
# Finally, make a Pandas DF for the target: whether or not speech is present in each 0.96 second chunk.
SEGMENT_SECONDS = 0.96  # Same step that was used to build `time_stamp` in trainFeatures.csv.

trainTargets = pd.read_csv('trainFeatures.csv', header = 0, index_col = 0).iloc[:,0:2]
trainTargets['speech_present'] = False # By default.
print(trainTargets.head())

# Load the time-resolved event annotations.
trainEvents = pd.read_csv("../data/audioset_train_strong.tsv", sep="\t")
print(trainEvents.head())

# Have to make the labels match the feature set: segment_id is the video id
# followed by "_" and trailing digits. Split on the LAST underscore so that a
# video id which itself ends in "_" or in digits is not truncated.
id_parts = trainEvents['segment_id'].str.extract(r'^(.*)_(\d+)$')
trainEvents['segment_id'] = id_parts[0].fillna(trainEvents['segment_id'])
print(trainEvents.head())

# NOTE(review): the numeric suffix is presumably the clip start in milliseconds.
# In the sample output the features start at time_stamp 30.00 while the events
# of "..._30000" run from 0.000 to 10.000, i.e. event times look relative to the
# clip start. They are shifted onto the absolute time base of `time_stamp` here;
# confirm against the dataset documentation.
clip_offset = pd.to_numeric(id_parts[1]).fillna(0) / 1000.0

# Respectively, speech, male speech, female speech, child speech, conversation, and narration.
speech_events = set(['/m/09x0r', '/m/05zppz','/m/02zsn','/m/0ytgt','/m/01h8n0','/m/02qldy'])

# Keep only events whose label is a speech label; any other event overlapping
# a segment must not count as speech.
speech_rows = trainEvents['label'].isin(speech_events)
speech_intervals = pd.DataFrame({
    'video_id': trainEvents.loc[speech_rows, 'segment_id'],
    'start': trainEvents.loc[speech_rows, 'start_time_seconds'] + clip_offset[speech_rows],
    'end': trainEvents.loc[speech_rows, 'end_time_seconds'] + clip_offset[speech_rows],
})

# One lookup table per video id, so the events file never has to be scanned
# row by row (and cannot be indexed past its end).
intervals_by_video = {
    video_id: group[['start', 'end']].to_numpy()
    for video_id, group in speech_intervals.groupby('video_id')
}
annotated_videos = set(trainEvents['segment_id'])

# Apparently the .96 second annotations were only applied to 5% of the overall dataset.
# There are elements of the balanced dataset that are missing; collect their ids.
no_match = []

segment_starts = trainTargets['time_stamp'].to_numpy(dtype=float)
speech_present = np.zeros(len(trainTargets), dtype=bool)
for video_id, positions in trainTargets.groupby('video_id', sort=False).indices.items():
    if video_id not in annotated_videos:
        print(f'{video_id} has no match.')
        no_match.append(video_id)
        continue
    intervals = intervals_by_video.get(video_id)
    if intervals is None:
        continue  # The clip is annotated but contains no speech event.
    # A segment [t, t + 0.96] overlaps an event [start, end] exactly when
    # t <= end and start <= t + 0.96 (boundaries inclusive).
    seg_start = segment_starts[positions][:, None]
    overlaps = (seg_start <= intervals[:, 1]) & (intervals[:, 0] <= seg_start + SEGMENT_SECONDS)
    speech_present[positions] = overlaps.any(axis=1)

trainTargets['speech_present'] = speech_present
trainTargets.to_csv('trainTargets.csv')

# The ids are strings, so savetxt needs an explicit string format.
no_match = np.array(no_match, dtype=str)
np.savetxt('missing_segments.csv', no_match, fmt='%s', delimiter=',')
# When done, we can drop the labels and time indices, since the orders are the same.

      video_id  time_stamp  speech_present
0  --cB2ZVjpnA       30.00           False
1  --cB2ZVjpnA       30.96           False
2  --cB2ZVjpnA       31.92           False
3  --cB2ZVjpnA       32.88           False
4  --cB2ZVjpnA       33.84           False
          segment_id  start_time_seconds  end_time_seconds       label
0  b0RFKhbpFJA_30000               0.000            10.000  /m/03m9d0z
1  b0RFKhbpFJA_30000               4.753             5.720   /m/05zppz
2  b0RFKhbpFJA_30000               0.000            10.000  /m/07pjwq1
3  b0RFKhbpFJA_30000               6.899             7.010  /m/07qjznt
4  b0RFKhbpFJA_30000               8.534             9.156  /t/dd00092
    segment_id  start_time_seconds  end_time_seconds       label
0  b0RFKhbpFJA               0.000            10.000  /m/03m9d0z
1  b0RFKhbpFJA               4.753             5.720   /m/05zppz
2  b0RFKhbpFJA               0.000            10.000  /m/07pjwq1
3  b0RFKhbpFJA               6.899             7.010  /m

00mE-lhe_R8 has no match.
01hjVJN9xCg has no match.
01k1v-NgjWs has no match.
01xpKyI0rXA has no match.
01bTS8O2Xnc has no match.
01q8wKX4XEQ has no match.
01B907_Gyys has no match.
0150dZu3Na8 has no match.
01PzcPKT3_E has no match.
0298WjE3_tk has no match.
02Qntw26enM has no match.
021B2ozAjNg has no match.
02z_7DUC_74 has no match.
02tl_ek8f9I has no match.
02rLonMSO9k has no match.
02aq42RDm2Q has no match.
02pl9PaQwmI has no match.
02QXbqf9bbw has no match.
02JEvDAcq7U has no match.
02R_w3cr1i4 has no match.
02n2I1JHq4U has no match.
02q5C77_xeQ has no match.
032FsMMKWqM has no match.
03kFLTZguBs has no match.
03NLMEMi8-I has no match.
03EmZCCkf-E has no match.
03lAXcmqsMU has no match.
03frQGyrgQ4 has no match.
03z0rpIkm5g has no match.
04SFLoexHDo has no match.
046ynbCwM90 has no match.
04RDcHpgzTM has no match.
04JKX_vlePE has no match.
05H3CBB2oiI has no match.
058Nv060Fz4 has no match.
05OJDYeHLMc has no match.
05wEdbchyjM has no match.
06Brdf83RZE has no match.
06J19koRu_4 

194N_h5CGzA has no match.
19Pp9QEw17U has no match.
19DRe0cdeXQ has no match.
19tPF3TY3g0 has no match.
19-GI2LzOtc has no match.
192AgE8E-cw has no match.
1aiMcQqOIO4 has no match.
1aLTEOXlRzU has no match.
1a6-xAO5Dik has no match.
1a6ziKisD4Q has no match.
1aVhpF7Sm8o has no match.
1aNc3AkgJNU has no match.
1aulP-srFAI has no match.
1alQZjvWr38 has no match.
1aavDifdIAk has no match.
1aEh44p0EqY has no match.
1bsXgn_25VQ has no match.
1bOukvpECmU has no match.
1bet-VrOUlM has no match.
1blEjTegTzY has no match.
1caCiNiUErA has no match.
1c5rSQsqmBE has no match.
1c7NBk1x3y0 has no match.
1cFHCB6bR8I has no match.
1cS0oGvV5PY has no match.
1c8yp4_Fji8 has no match.
1cjduhFyTkA has no match.
1cew7dAbDh0 has no match.
1DvlbESnFjo has no match.
1DiSaLjlU80 has no match.
1DH7AhyL0KU has no match.
1DxLwZhTj0A has no match.
1eQSkR2w0R8 has no match.
1eW-T6vtO7k has no match.
1e3_ucOz0Ik has no match.
1ei2Ha64te4 has no match.
1ecvyHKH9TI has no match.
1et9yT2BYD0 has no match.
1ejb5TtkhY8 

2IWV8hlUydk has no match.
2I9ToRyfL5k has no match.
2juN7xQerfw has no match.
2jraQyyG0qY has no match.
2jW4P4mGxo8 has no match.
2j8hxco6pBk has no match.
2jFE-MKRmPw has no match.
2je_d-3I0OY has no match.
2jIxHKpgBys has no match.
2K61zdPjAi8 has no match.
2KgX6IvEVrc has no match.
2K5j6V0vsvM has no match.
2KkNk9Ao7G4 has no match.
2KwwQHit0-Q has no match.
2KSST-I4Fj8 has no match.
2KK9MldzvzE has no match.
2K8O1KQQw54 has no match.
2lBZ6yPW9WU has no match.
2lXyUsQZ5Bo has no match.
2mOwAL8_vog has no match.
2mXZfaHadmU has no match.
2mEH2FUWUTg has no match.
2mbrbHlsBgk has no match.
2mF3jZwPXSc has no match.
2mJbGx5D-zA has no match.
2mBef9KrCpk has no match.
2mAbc1JnXXY has no match.
2nd27vVGg8k has no match.
2nADZL8olrg has no match.
2n13-18I_nM has no match.
2neeSM0lvbs has no match.
2nsUnuIkzDE has no match.
2nS_rk1118M has no match.
2nbyIEmQAaU has no match.
2Og-RS5LXno has no match.
2O22edWULCE has no match.
2O4TajdF_Cc has no match.
2OROEX0O-kw has no match.
2OEm9kmVgIc 

In [None]:
# Do the same for the eval DS...
# List the eval directory itself. Listing bal_train and prefixing those names
# with the eval path would point at files that do not belong to the eval set.
eval_dir = "../data/audioset_v1_embeddings/eval/"
filenames = [eval_dir + name for name in get_filenames(eval_dir)]
eval_dataset = tf.data.TFRecordDataset(filenames)


In [None]:
# Adapted from my old personal project and resized for this task.
# Each training row is one 128-value embedding (columns '0'..'127' of
# trainFeatures), so the input is a flat vector of length 128. The image
# layers of the old model (Conv2D / MaxPooling2D / Flatten) expect a
# height x width x channels tensor and do not apply to such a vector, and
# Input() cannot be built without a shape.
# The target is a single boolean per row (speech_present), so the network ends
# in one sigmoid unit instead of the 37 outputs of the old project.

model = Sequential([
    layers.Input(shape=(128,)),
    layers.Dense(200, activation='relu'),
    layers.Dense(160, activation='relu'),
    layers.Dense(120, activation='relu'),
    layers.Dense(80, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.summary()

In [None]:
# trainFeatures keeps video_id / time_stamp in its first two columns and
# trainTargets keeps them next to speech_present. Only the embedding columns
# and the label column are numeric model inputs, so select them explicitly.
train_x = trainFeatures.iloc[:, 2:].to_numpy(dtype='float32')
train_y = trainTargets['speech_present'].to_numpy(dtype='float32')

model.compile(loss = 'mean_squared_error', optimizer = 'adam', metrics = ['mean_squared_error'])
model.fit(train_x, train_y, epochs = 100, batch_size = 4)

# NOTE(review): `save_to_disk` is not defined in the visible cells. Keep an
# existing value if an earlier cell set one, otherwise default to not saving.
save_to_disk = globals().get('save_to_disk', False)
if save_to_disk:
    # Saving model to JSON and weights to H5.
    # Write through explicit paths instead of os.chdir: changing the kernel's
    # working directory would break every relative path on a later re-run.
    models_dir = os.path.join('..', 'models')
    os.makedirs(models_dir, exist_ok=True)
    model_json = model.to_json()
    with open(os.path.join(models_dir, "model.json"), "w") as json_file:
        json_file.write(model_json)
    model.save_weights(os.path.join(models_dir, "model.h5"))
    print("Saved model to disk")

# NOTE(review): ValFeatures / ValTargets are not created in the visible cells
# (the eval cell only builds `eval_dataset`); they must be built the same way
# as train_x / train_y before this line can run.
loss  = model.evaluate(ValFeatures, ValTargets)
print('Loss on Validation Set: ', loss)