In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

# Taken from the EDA notebook
from os import listdir
from os.path import isfile, join
def get_filenames(mypath):
    return [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [2]:
filenames = ["../data/audioset_v1_embeddings/bal_train/" + i for i in get_filenames("../data/audioset_v1_embeddings/bal_train/")]
train_dataset = tf.data.TFRecordDataset(filenames)

# Make Pandas DataFrame out of the training set records.
col = ['video_id', 'time_stamp'] + [str(k) for k in range(0,128)]

placeholder_array = np.array(col)

for raw_record in train_dataset:
    example = tf.train.SequenceExample()
    example.ParseFromString(raw_record.numpy())
    for i in range(0,len(example.feature_lists.feature_list['audio_embedding'].feature)):
        vID = example.context.feature['video_id'].bytes_list.value[0]
        time = example.context.feature['start_time_seconds'].float_list.value[0] + 0.96*i
        placeholder_array = np.vstack((placeholder_array, np.array([vID, time] + list(example.feature_lists.feature_list['audio_embedding'].feature[i].bytes_list.value[0]))))

print('Done making the array. Converting to Pandas DF and saving.')

trainFeatures = pd.DataFrame(np.delete(placeholder_array, 0, 0), columns = col)

trainFeatures.to_csv('trainFeatures.csv')
trainFeatures.head()

Done making the array. Converting to Pandas DF and saving.


Unnamed: 0,video_id,time_stamp,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,--cB2ZVjpnA,30.0,95,122,147,14,101,48,157,125,...,185,0,186,0,63,255,255,2,190,0
1,--cB2ZVjpnA,30.96,123,109,137,33,145,126,43,90,...,56,157,92,139,0,26,84,123,140,248
2,--cB2ZVjpnA,31.92,52,135,152,7,62,112,121,83,...,0,0,129,35,238,255,255,0,255,0
3,--cB2ZVjpnA,32.88,53,167,104,0,50,147,174,62,...,136,0,0,202,90,66,255,241,255,0
4,--cB2ZVjpnA,33.84,26,220,90,17,0,98,66,34,...,255,0,178,0,22,255,255,255,180,0


In [39]:
# Finally, make a Pandas DF for the target: whether or not speech is present in each 0.96 second chunk.
trainTargets = pd.read_csv('trainFeatures.csv', header = 0, index_col = 0).iloc[:,0:2]
trainTargets['speech_present'] = False # By default.
print(trainTargets.head())

#trainTargets = trainFeatures[:,:'time_stamp']
#del TrainFeatures # For RAM's sake. We've already saved this to a CSV.

# Make a Pandas DataFrame of all instances of speech present.
trainEvents = pd.read_csv("../data/audioset_train_strong.tsv", sep="\t")
print(trainEvents.head())

''' Have to make the labels match the feature set,
    and these labels have a "_" followed by trailing digits.'''
trainEvents.iloc[:,0] = trainEvents.iloc[:,0].str.rstrip("0123456789")
trainEvents.iloc[:,0] = trainEvents.iloc[:,0].str.rstrip("_")
print(trainEvents.head())

filter = set(trainTargets.iloc[:,0]).intersection(set(trainEvents.iloc[:,0]))
print(filter)

for i in range(0,len(trainTargets)):
    if (trainTargets.iloc[i,0] in filter) == False:
        trainTargets.iloc[i,0] = None
trainTargets = trainTargets.dropna() # Deletes all rows with a None in it, i.e. entries not in the intersection.

trainTargets.to_csv('trainTargetsFiltered.csv')

      video_id  time_stamp  speech_present
0  --cB2ZVjpnA       30.00           False
1  --cB2ZVjpnA       30.96           False
2  --cB2ZVjpnA       31.92           False
3  --cB2ZVjpnA       32.88           False
4  --cB2ZVjpnA       33.84           False
          segment_id  start_time_seconds  end_time_seconds       label
0  b0RFKhbpFJA_30000               0.000            10.000  /m/03m9d0z
1  b0RFKhbpFJA_30000               4.753             5.720   /m/05zppz
2  b0RFKhbpFJA_30000               0.000            10.000  /m/07pjwq1
3  b0RFKhbpFJA_30000               6.899             7.010  /m/07qjznt
4  b0RFKhbpFJA_30000               8.534             9.156  /t/dd00092
    segment_id  start_time_seconds  end_time_seconds       label
0  b0RFKhbpFJA               0.000            10.000  /m/03m9d0z
1  b0RFKhbpFJA               4.753             5.720   /m/05zppz
2  b0RFKhbpFJA               0.000            10.000  /m/07pjwq1
3  b0RFKhbpFJA               6.899             7.010  /m

KeyboardInterrupt: 

In [43]:
print('Current size of trainEvents is:' + str(len(trainEvents)))
for i in range(0,len(trainEvents)):
    if (trainEvents.iloc[i,0] in filter) == False:
        trainEvents.iloc[i,0] = None
trainEvents = trainEvents.dropna() # Deletes all rows with a None in it, i.e. entries not in the intersection.

trainEvents.to_csv('trainEventsFiltered.csv')

trainFeatures0 = pd.read_csv('trainFeatures.csv', header = 0, index_col = 0)
for i in range(0,len(trainFeatures0)):
    if (trainFeatures0.iloc[i,0] in filter) == False:
        trainFeatures0.iloc[i,0] = None
trainFeatures0 = trainFeatures0.dropna() # Deletes all rows with a None in it, i.e. entries not in the intersection.

trainFeatures0.to_csv('trainFeaturesFiltered.csv')

Current size of trainEvents is:474570


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [60]:
trainTargets = pd.read_csv('../data/trainTargetsFiltered_unfinished.csv', header = 0, index_col = 0)
trainEvents = pd.read_csv('../data/trainEventsFiltered.csv', header = 0, index_col = 0)
print(trainTargets.head())
print(trainEvents.head())

# Respectively, speech, male speech, female speech, child speech, conversation, narration, babbling, synthetic speech.
speech_events = set(['/m/09x0r', '/m/05zppz','/m/02zsn','/m/0ytgt','/m/01h8n0','/m/02qldy','/m/0261r1','/m/0brhx'])

# Now check to see if each 0.96 second segment contains speech according to the trainEvents DF.
'''This seems complicated at first glance, but the idea behind it is simple.
Since each clip's events are grouped together in the trainEvents TSV, we just
need to run a search ONCE for each clip label. Once we have it, we don't need
to search again for the next entry's events unless its label is different.
Sadly, the labels are NOT in alphabetical order, making an approach like this
necessary.'''

for i in range(0,len(trainTargets)):
    if i == 0 or trainTargets.iloc[i,0] != trainTargets.iloc[i-1,0]:
        clipset = trainEvents.loc[trainEvents['segment_id'] == trainTargets.iloc[i,0]]
        clip_start_time = trainTargets.iloc[i,1]
        clipset.loc[:,'start_time_seconds'] = clipset.loc[:,'start_time_seconds'] + clip_start_time
        clipset.loc[:,'end_time_seconds'] = clipset.loc[:,'end_time_seconds'] + clip_start_time
    for j in range(0,len(clipset)):
        if clipset.iloc[j,3] in speech_events:
            if trainTargets.iloc[i,1] <= clipset.iloc[j,1]:
                if trainTargets.iloc[i,1] + 0.96 >= clipset.iloc[j,1]:
                    trainTargets.iloc[i,2] = True
            if trainTargets.iloc[i,1] >= trainEvents.iloc[j,1]:
                if trainTargets.iloc[i,1] <= trainEvents.iloc[j,2]:
                    trainTargets.iloc[i,2] = True

trainTargets.to_csv('../data/trainTargets.csv')
# When done, we can drop the labels and time indices, since the orders are the same.

       video_id  time_stamp  speech_present
10  --PJHxphWEs       30.00           False
11  --PJHxphWEs       30.96           False
12  --PJHxphWEs       31.92           False
13  --PJHxphWEs       32.88           False
14  --PJHxphWEs       33.84           False
     segment_id  start_time_seconds  end_time_seconds      label
24  O35jXasNYxc               0.000             0.381  /m/0dgw9r
25  O35jXasNYxc               0.000            10.000  /m/093_4n
26  O35jXasNYxc               0.733             1.578  /m/01b_21
27  O35jXasNYxc               1.683             2.094  /m/01b_21
28  O35jXasNYxc               2.191             2.565  /m/01b_21


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [63]:
# Do the same for the eval DS...
filenames = ["../data/audioset_v1_embeddings/eval/" + i for i in get_filenames("../data/audioset_v1_embeddings/eval/")]
eval_dataset = tf.data.TFRecordDataset(filenames)
print('Dataset loaded.')

# Make Pandas DataFrame out of the training set records.
col = ['video_id', 'time_stamp'] + [str(k) for k in range(0,128)]

placeholder_array = np.array(col)

for raw_record in eval_dataset:
    example = tf.train.SequenceExample()
    example.ParseFromString(raw_record.numpy())
    for i in range(0,len(example.feature_lists.feature_list['audio_embedding'].feature)):
        vID = example.context.feature['video_id'].bytes_list.value[0]
        time = example.context.feature['start_time_seconds'].float_list.value[0] + 0.96*i
        placeholder_array = np.vstack((placeholder_array, np.array([vID, time] + list(example.feature_lists.feature_list['audio_embedding'].feature[0].bytes_list.value[0]))))

print('Done making the array. Converting to Pandas DF and saving.')

evalFeatures = pd.DataFrame(np.delete(placeholder_array, 0, 0), columns = col)

evalFeatures.to_csv('../data/evalFeatures.csv')
print('evalFeatures:')
print(evalFeatures.head())


# Make a Pandas DataFrame of all instances of speech present.
evalEvents = pd.read_csv("../data/audioset_eval_strong_framed_posneg.tsv", sep="\t")

''' Have to make the labels match the feature set,
    and these labels have a "_" followed by trailing digits.'''
evalEvents.iloc[:,0] = evalEvents.iloc[:,0].str.rstrip("0123456789")
evalEvents.iloc[:,0] = evalEvents.iloc[:,0].str.rstrip("_")
print('evalEvents:')
print(evalEvents.head())



filter = set(evalFeatures.iloc[:,0]).intersection(set(evalEvents.iloc[:,0]))
print('Intersection has ' + str(len(filter)) + ' segments.')

for i in range(0,len(evalFeatures)):
    if (evalFeatures.iloc[i,0] in filter) == False:
        evalFeatures.iloc[i,0] = None
evalFeatures = evalFeatures.dropna() # Deletes all rows with a None in it, i.e. entries not in the intersection.
evalFeatures.to_csv('../data/evalFeaturesFiltered.csv')

for i in range(0,len(evalEvents)):
    if (evalEvents.iloc[i,0] in filter) == False:
        evalEvents.iloc[i,0] = None
evalEvents = evalEvents.dropna() # Deletes all rows with a None in it, i.e. entries not in the intersection.
evalEvents.to_csv('../data/evalEventsFiltered.csv')



# Finally, make a Pandas DF for the target: whether or not speech is present in each 0.96 second chunk.
evalTargets = pd.read_csv('../data/evalFeaturesFiltered.csv', header = 0, index_col = 0).iloc[:,0:2]
evalTargets['speech_present'] = False # By default.
print('evalTargets before checking:')
print(evalTargets.head())

# Respectively, speech, male speech, female speech, child speech, conversation, narration, babbling, synthetic speech.
speech_events = set(['/m/09x0r', '/m/05zppz','/m/02zsn','/m/0ytgt','/m/01h8n0','/m/02qldy','/m/0261r1','/m/0brhx'])

# Now check to see if each 0.96 second segment contains speech according to the evalEvents DF.
'''This seems complicated at first glance, but the idea behind it is simple.
Since each clip's events are grouped together in the trainEvents TSV, we just
need to run a search ONCE for each clip label. Once we have it, we don't need
to search again for the next entry's events unless its label is different.
Sadly, the labels are NOT in alphabetical order, making an approach like this
necessary.'''

for i in range(0,len(evalTargets)):
    if i == 0 or evalTargets.iloc[i,0] != evalTargets.iloc[i-1,0]:
        clipset = evalEvents.loc[evalEvents['segment_id'] == evalTargets.iloc[i,0]]
        clip_start_time = evalTargets.iloc[i,1]
        clipset.loc[:,'start_time_seconds'] = clipset.loc[:,'start_time_seconds'] + clip_start_time
        clipset.loc[:,'end_time_seconds'] = clipset.loc[:,'end_time_seconds'] + clip_start_time
    for j in range(0,len(clipset)):
        if clipset.iloc[j,3] in speech_events and clipset.iloc[j,4] == 'PRESENT':
            if evalTargets.iloc[i,1] == clipset.iloc[j,1]:
                evalTargets.iloc[i,2] = True

evalTargets.to_csv('../data/evalTargetsFiltered.csv')
print('evalTargets after checking:')
print(evalTargets.head())
# When done, we can drop the labels and time indices, since the orders are the same.

Dataset loaded.
Done making the array. Converting to Pandas DF and saving.
evalFeatures:
      video_id time_stamp    0   1    2    3    4   5   6    7  ...  118  119  \
0  --4gqARaEJE        0.0  131  35  130  105  144  58  36  170  ...  185  157   
1  --4gqARaEJE       0.96  131  35  130  105  144  58  36  170  ...  185  157   
2  --4gqARaEJE       1.92  131  35  130  105  144  58  36  170  ...  185  157   
3  --4gqARaEJE       2.88  131  35  130  105  144  58  36  170  ...  185  157   
4  --4gqARaEJE       3.84  131  35  130  105  144  58  36  170  ...  185  157   

   120 121  122  123  124  125  126 127  
0  134   0  151  166  255  142  147  37  
1  134   0  151  166  255  142  147  37  
2  134   0  151  166  255  142  147  37  
3  134   0  151  166  255  142  147  37  
4  134   0  151  166  255  142  147  37  

[5 rows x 130 columns]
evalEvents:
    segment_id  start_time_seconds  end_time_seconds       label      present
0  YxlGt805lTA                 0.0              0.96    /m

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


evalTargets after checking:
      video_id  time_stamp  speech_present
0  --4gqARaEJE        0.00           False
1  --4gqARaEJE        0.96           False
2  --4gqARaEJE        1.92           False
3  --4gqARaEJE        2.88           False
4  --4gqARaEJE        3.84           False
