In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from PIL import Image
import gc
import matplotlib.pyplot as plt

In [2]:
# Root folder of the dataset; get_path_for() builds the train/test file paths from it.
data_dir = "../input/seti-breakthrough-listen/"

In [3]:
# Labels table (columns: id, target), read from the dataset root defined in data_dir
# rather than from a second hard-coded copy of the same path.
train_df = pd.read_csv(os.path.join(data_dir, "train_labels.csv"))

In [4]:
# Number of labelled samples.
len(train_df)

60000

In [5]:
# Fraction of rows belonging to each target class.
train_df.groupby(by='target').size().div(len(train_df))

target
0    0.9
1    0.1
dtype: float64

In [6]:
# Peek at the first rows of the labels table.
train_df.head()

Unnamed: 0,id,target
0,0000799a2b2c42d,0
1,00042890562ff68,0
2,0005364cdcb8e5b,0
3,0007a5a46901c56,0
4,0009283e145448e,0


In [7]:
def get_path_for(path='train', root=None):
    """Index every array file stored under one split of the dataset.

    The split folder is expected to contain one level of sub-folders, each
    holding the array files. The part of a file name before its first '.'
    is used as the sample id.

    Parameters
    ----------
    path : str
        Name of the split folder to scan (e.g. 'train' or 'test').
    root : str, optional
        Folder that contains the split folders. Defaults to the
        notebook-level ``data_dir``.

    Returns
    -------
    tuple of (dict, numpy.ndarray)
        A mapping from sample id to file path, and an array with the sample
        ids in the order they were found.
    """
    if root is None:
        root = data_dir
    split_dir = os.path.join(root, path)

    # List the sub-folders of the requested split itself (not a hard-coded
    # 'test'), so the function works even when the splits differ in layout.
    subdirs = os.listdir(split_dir)
    print(subdirs)
    print(len(subdirs))

    id_to_path = {}
    ids = []
    for subdir in subdirs:
        for file_name in os.listdir(os.path.join(split_dir, subdir)):
            sample_id = file_name.split('.')[0]
            id_to_path[sample_id] = os.path.join(split_dir, subdir, file_name)
            ids.append(sample_id)

    return id_to_path, np.array(ids)

In [8]:
# Build the id -> file path lookup and the id array for each split.
train_dict, array_path_train_index = get_path_for('train')
test_dict, test_path_index = get_path_for('test')

['7', '2', 'b', 'f', '5', 'e', '8', '0', 'a', '3', '1', 'c', '4', '9', '6', 'd']
16
['7', '2', 'b', 'f', '5', 'e', '8', '0', 'a', '3', '1', 'c', '4', '9', '6', 'd']
16


In [9]:
# File paths of all test samples, in the insertion order of test_dict.
test_path = np.array(list(test_dict.values()))

In [10]:
# Sanity check on the number of test files found (39995 is presumably the test-set size).
assert test_path.shape[0] == 39995

In [11]:
# Attach each sample's file path by looking its id up in train_dict; ids without a
# file on disk get NaN. Series.map performs the lookup directly, replacing the
# isin() filter plus row-wise apply (whose extra membership test could never fail,
# since array_path_train_index and train_dict hold the same ids).
train_df['path'] = train_df['id'].map(train_dict)

In [12]:
# Row count after adding the path column.
len(train_df)

60000

In [13]:
# Class fractions after adding the path column.
train_df.groupby(by='target').size().div(len(train_df))

target
0    0.9
1    0.1
dtype: float64

In [14]:
# Random rows, to eyeball the id -> path assignment.
train_df.sample(n=5)

Unnamed: 0,id,target,path
54494,e82be10db727e2c,0,../input/seti-breakthrough-listen/train/e/e82b...
30452,821cb09c2cc0663,1,../input/seti-breakthrough-listen/train/8/821c...
30104,809cfcb7d0890c5,0,../input/seti-breakthrough-listen/train/8/809c...
2437,0a441f600a24bba,0,../input/seti-breakthrough-listen/train/0/0a44...
45844,c36989c16d97094,0,../input/seti-breakthrough-listen/train/c/c369...


In [15]:
import tensorflow as tf

In [16]:
def load_batches_of_batches(df, start_index=0, nro_batches=1, batch_size=32):
    """Load consecutive rows of ``df`` as a stack of equally sized batches.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``path`` column holding the location of one ``.npy``
        file per row.
    start_index : int
        Position of the first row to load.
    nro_batches : int
        Number of batches to build.
    batch_size : int
        Number of rows per batch.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(nro_batches, batch_size, *sample_shape)``;
        an empty float32 array when no batch is requested.

    Raises
    ------
    IndexError
        If ``df`` has fewer than ``start_index + nro_batches * batch_size`` rows.
    """
    if batch_size <= 0:
        return np.array([], dtype='float32')

    arrays = []
    for batch_no in range(nro_batches):
        # Batch boundaries are relative to start_index, so a non-zero start
        # still yields exactly nro_batches full batches.
        first = start_index + batch_no * batch_size
        batch = [np.load(df.iloc[row].path) for row in range(first, first + batch_size)]
        arrays.append(np.stack(batch))
    return np.array(arrays, dtype='float32')

In [17]:
#numpy array

def load_batches(array, start_index=0, end_index=32):
    """Load the files at positions ``start_index`` (inclusive) to ``end_index``
    (exclusive) of ``array`` as one batch.

    Each file is read with ``np.load`` and reshaped to ``(6, 273, 256, 1)``;
    the samples are stacked along a new leading axis and returned as float32.
    """
    samples = [
        np.load(array[position]).reshape(6, 273, 256, 1)
        for position in range(start_index, end_index)
    ]
    return np.stack(samples).astype('float32')

In [18]:
# Smoke test: 16 batches of 4 samples each, starting at the first training row.
test_batches = load_batches_of_batches(train_df, 0, 16, 4)

print(test_batches.shape)
assert test_batches.shape == (16, 4, 6, 273, 256)

(16, 4, 6, 273, 256)


In [19]:
# Smoke test: one flat batch of the first 16 training samples, with a trailing channel axis.
test_batch = load_batches(train_df['path'].to_numpy(), 0, 16)

print(test_batch.shape)
assert test_batch.shape == (16, 6, 273, 256, 1)

(16, 6, 273, 256, 1)


In [20]:
# Model input: [batch_size, 6, 273, 256, 1] -- six single-channel 273x256 frames per sample.
X_input = tf.keras.layers.Input(shape=(6, 273, 256, 1))


def _per_frame(layer, tensor):
    """Wrap ``layer`` in TimeDistributed so it is applied to each frame separately."""
    return tf.keras.layers.TimeDistributed(layer)(tensor)


X = _per_frame(tf.keras.layers.BatchNormalization(), X_input)

# Convolutional stages as (filters, number of stacked 3x3 convolutions); every
# stage ends with 2x2 max-pooling followed by batch normalisation.
for _filters, _n_convs in ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3)):
    for _conv_no in range(_n_convs):
        X = _per_frame(tf.keras.layers.Conv2D(_filters, (3, 3),
                                              padding='valid',
                                              data_format='channels_last',
                                              activation='relu'), X)
    X = _per_frame(tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)), X)
    X = _per_frame(tf.keras.layers.BatchNormalization(), X)

# Flatten each frame's feature map and run it through two dense + dropout layers.
X = _per_frame(tf.keras.layers.Flatten(), X)
for _dense_no in range(2):
    X = _per_frame(tf.keras.layers.Dense(4096, activation='relu'), X)
    X = _per_frame(tf.keras.layers.Dropout(0.4), X)

# Five stacked bidirectional LSTMs over the frame axis; only the last one
# collapses the sequence into a single vector.
for _return_sequences in (True, True, True, True, False):
    X = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(6, return_sequences=_return_sequences))(X)

X = tf.keras.layers.Dense(1024, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)

# Single sigmoid unit: one score in (0, 1) per sample.
X_output = tf.keras.layers.Dense(1, activation='sigmoid')(X)

model = tf.keras.Model(X_input, X_output)

In [21]:
# Binary classification set-up: cross-entropy loss, Adam selected by name (default
# settings), and accuracy, false negatives/positives, precision, recall and AUC as metrics.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
            optimizer="Adam",
            metrics=[tf.keras.metrics.BinaryAccuracy(),
                     tf.keras.metrics.FalseNegatives(),
                     tf.keras.metrics.FalsePositives(),
                     tf.keras.metrics.Precision(),
                     tf.keras.metrics.Recall(),
                     tf.keras.metrics.AUC()])

In [22]:
# Labels and file paths as numpy arrays.
# Plain column access (instead of DataFrame.pop) leaves train_df intact, so this
# cell can be re-run without raising a KeyError.

tags = train_df['target'].to_numpy()
train_paths = train_df['path'].to_numpy()

In [23]:
def split_train_val(batches, tags, split=0.3):
    """Randomly partition samples and their labels into train and validation parts.

    One uniform number in [0, 1) is drawn per sample; the sample goes to the
    training part when its draw exceeds ``split`` and to validation otherwise,
    so ``split`` is only the expected validation fraction.

    Returns ``(train_batch, train_tag, val_batch, val_tag)``.
    """
    keep_for_train = np.random.random(batches.shape[0]) > split
    held_out = np.logical_not(keep_for_train)
    return batches[keep_for_train], tags[keep_for_train], batches[held_out], tags[held_out]

In [24]:
# Try the random split on the 16-sample smoke-test batch and show the resulting shapes.
train_batch, train_tag, val_batch, val_tag = split_train_val(test_batch, tags[:16], 0.3)
print(train_batch.shape)
print(train_tag.shape)
print(val_batch.shape)
print(val_tag.shape)

(12, 6, 273, 256, 1)
(12,)
(4, 6, 273, 256, 1)
(4,)


In [25]:
# Drop the smoke-test arrays and run a garbage-collection pass to release their memory.
del test_batch, test_batches, train_batch, train_tag, val_batch, val_tag
gc.collect()

28929

In [26]:
# Check the 'balanced' class weights on the first 512 labels.

from sklearn.utils import class_weight

# `classes` and `y` are keyword-only in current scikit-learn; passing the classes
# positionally as well as by keyword raises a TypeError there.
class_weights_0, class_weights_1 = class_weight.compute_class_weight('balanced', classes=np.unique(tags[:512]), y=tags[:512])

print(np.sum(tags[:512]))
print(tags[:512].shape[0]-np.sum(tags[:512]))
print(np.sum(tags[:512])/tags[:512].shape[0])
print((class_weights_0, class_weights_1))

52
460
0.1015625
(0.5565217391304348, 4.923076923076923)




In [27]:
from sklearn.utils import shuffle

# Demo: shuffle() applies one shared permutation to every array it is given,
# so element pairs stay aligned.
test = np.arange(10)
test2 = np.arange(15, 25)
print(test)
print(test2)

shuffle_test, shuffle_test2 = shuffle(test, test2)

print(shuffle_test)
print(shuffle_test2)

[0 1 2 3 4 5 6 7 8 9]
[15 16 17 18 19 20 21 22 23 24]
[9 4 2 5 3 1 8 0 6 7]
[24 19 17 20 18 16 23 15 21 22]


In [28]:
# Run a garbage-collection pass before the training cell.
gc.collect()

40

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
# Number of samples loaded from disk per chunk; each chunk is split into
# train/validation parts and fitted for one pass with mini-batches of 16.
batch_size = 1024+128
total = 0

# `classes` and `y` must be passed by keyword only (required by current scikit-learn).
class_weights_0, class_weights_1 = class_weight.compute_class_weight('balanced', classes=np.unique(tags), y=tags)

print("signal weight: [%f] - noiseweight: [%f]"%(class_weights_1, class_weights_0))
for epochs in range(5):
    gc.collect()
    # Reshuffle paths and labels together so every epoch sees differently composed chunks.
    train_paths, tags = shuffle(train_paths, tags)
    total = 0
    print("Epoch: [%i]"%(epochs))
    for index in range(0, train_paths.shape[0], batch_size):
        # End of the current chunk, clipped to the number of samples.
        next_index = min(index + batch_size, train_paths.shape[0])

        batch_x = load_batches(train_paths, index, next_index)
        batch_y = tags[index: next_index]

        # NOTE(review): the validation rows are re-drawn for every chunk on every
        # epoch, so a sample validated in one epoch can be trained on in another;
        # the validation metrics presumably overstate generalisation -- confirm.
        train_x, val_x, train_y, val_y = train_test_split(batch_x, batch_y, test_size=0.2)

        print("train size: [%i] val size: [%i]"%(train_x.shape[0], val_x.shape[0]))

        model.fit(x=train_x,
                y=train_y,
                validation_data=(val_x, val_y),
                class_weight={0:class_weights_0, 1:class_weights_1},
                batch_size = 16,
                epochs=1)

        # Release the chunk before loading the next one.
        del batch_x
        del batch_y
        del train_x
        del train_y
        del val_x
        del val_y
        gc.collect()

        # Count the samples actually processed, so the last (partial) chunk
        # does not push the progress figure past 100%.
        total += next_index - index
        print("processed so far: [%.2f]"%((total/train_paths.shape[0])*100))

signal weight: [5.000000] - noiseweight: [0.555556]
Epoch: [0]
train size: [921] val size: [231]
processed so far: [1.92]
train size: [921] val size: [231]
processed so far: [3.84]
train size: [921] val size: [231]
processed so far: [5.76]
train size: [921] val size: [231]
processed so far: [7.68]
train size: [921] val size: [231]
processed so far: [9.60]
train size: [921] val size: [231]
processed so far: [11.52]
train size: [921] val size: [231]
processed so far: [13.44]
train size: [921] val size: [231]
processed so far: [15.36]
train size: [921] val size: [231]
processed so far: [17.28]
train size: [921] val size: [231]
processed so far: [19.20]
train size: [921] val size: [231]
processed so far: [21.12]
train size: [921] val size: [231]
processed so far: [23.04]
train size: [921] val size: [231]
processed so far: [24.96]
train size: [921] val size: [231]
processed so far: [26.88]
train size: [921] val size: [231]
processed so far: [28.80]
train size: [921] val size: [231]
processe

In [31]:
# Layer-by-layer overview of the network (output shapes and parameter counts).
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 6, 273, 256, 1)]  0         
_________________________________________________________________
time_distributed (TimeDistri (None, 6, 273, 256, 1)    4         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 6, 271, 254, 64)   640       
_________________________________________________________________
time_distributed_2 (TimeDist (None, 6, 269, 252, 64)   36928     
_________________________________________________________________
time_distributed_3 (TimeDist (None, 6, 134, 126, 64)   0         
_________________________________________________________________
time_distributed_4 (TimeDist (None, 6, 134, 126, 64)   256       
_________________________________________________________________
time_distributed_5 (TimeDist (None, 6, 132, 124, 128)  73856 

In [32]:
# Predict the test set in chunks of batch_size files so only one chunk is in memory at a time.
gc.collect()
batch_size = 1024
total = 0
predictions = []
for index in range(0, test_path.shape[0], batch_size):
    # Samples still to be processed at the start of this chunk.
    print(test_path.shape[0]-index)
    # End of the current chunk, clipped to the number of test files.
    next_index = min(index + batch_size, test_path.shape[0])
    print((index, next_index))
    batch = load_batches(test_path, index, next_index)
    predictions.extend(model.predict(batch))
    # Count the samples actually predicted, so the last (partial) chunk is not overcounted.
    total += next_index - index
    del batch
    gc.collect()
    print("processed so far: [%i]"%(total))

39995
(0, 1024)
processed so far: [1024]
38971
(1024, 2048)
processed so far: [2048]
37947
(2048, 3072)
processed so far: [3072]
36923
(3072, 4096)
processed so far: [4096]
35899
(4096, 5120)
processed so far: [5120]
34875
(5120, 6144)
processed so far: [6144]
33851
(6144, 7168)
processed so far: [7168]
32827
(7168, 8192)
processed so far: [8192]
31803
(8192, 9216)
processed so far: [9216]
30779
(9216, 10240)
processed so far: [10240]
29755
(10240, 11264)
processed so far: [11264]
28731
(11264, 12288)
processed so far: [12288]
27707
(12288, 13312)
processed so far: [13312]
26683
(13312, 14336)
processed so far: [14336]
25659
(14336, 15360)
processed so far: [15360]
24635
(15360, 16384)
processed so far: [16384]
23611
(16384, 17408)
processed so far: [17408]
22587
(17408, 18432)
processed so far: [18432]
21563
(18432, 19456)
processed so far: [19456]
20539
(19456, 20480)
processed so far: [20480]
19515
(20480, 21504)
processed so far: [21504]
18491
(21504, 22528)
processed so far: [2252

In [33]:
# Turn the list of per-sample model outputs into one array
# (presumably of shape (n_samples, 1), given the single output unit -- confirm).
predictions = np.array(predictions)

In [34]:
# Flatten to one score per test sample.
predictions = predictions.ravel()

In [35]:
# Pair every test id with its predicted score. The id is the file name up to its
# first '.', taken from the path's last component rather than from a fixed
# component index, so it does not depend on how deep data_dir is.
preds_list = [
    (os.path.basename(path).split('.')[0], predictions[index])
    for index, path in enumerate(test_path)
]

In [36]:
# Submission table: one row per test sample, with its id and predicted score.
pred_df = pd.DataFrame(preds_list, columns=['id', 'target'])

In [37]:
# Random rows, to eyeball the predicted scores.
pred_df.sample(n=5)

Unnamed: 0,id,target
22138,a80919159a2d26e,0.506242
38612,da71e6701c30039,0.506242
7748,f49c3490943f8ab,0.506242
591,7f1c1b1294faa6e,0.506242
15345,88bb45d6025b83e,0.506242


In [38]:
# Share of test rows per distinct predicted score.
pred_df.groupby(by='target').size().div(len(pred_df))

target
0.506242    1.0
dtype: float64

In [39]:
import datetime

In [40]:
# The submission must hold one row per test file (same count as asserted for test_path).
assert pred_df.shape[0] == 39995

In [41]:
# Write the submission without the index column; the file name carries the current
# date and hour followed by a literal 'H' (e.g. submission_2021-06-01-14H.csv).
pred_df.to_csv('submission_%s.csv'%(datetime.datetime.now().strftime("%Y-%m-%d-%HH")), index=False)