In [2]:
# !git clone https://github.com/m-zayan/ml_utils.git

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
from copy import deepcopy
from glob import glob

from tqdm.notebook import tqdm_notebook as tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import tensorflow as tf

from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

plt.style.use('ggplot')

working_dir = '../input/predict-volcanic-eruptions-ingv-oe/'

In [2]:
train_dir = glob(working_dir + 'train/*')
test_dir = glob(working_dir + 'test/*')

In [3]:
metadata = pd.read_csv(working_dir + 'train.csv')
sample_submission = pd.read_csv(working_dir + 'sample_submission.csv')

sample_submission.head(2)

Unnamed: 0,segment_id,time_to_eruption
0,1000213997,0
1,100023368,0


 # Data Cleaning

In [4]:
def is_nan_stat(_dir, _ord=None):
    
    def list_cusum(lst):
    
        out = []
        tmp = []
        k = 0

        for i in range(0, len(lst)):

            for j in range(k, i + 1):

                tmp.append(lst[j])
                k = j + 1


            out.append(deepcopy(tmp))

        return out



    isna = {}
    df_nan = {}
    mn_size = 1e9
    
    for i in tqdm(range(0 , len(_dir))):

        x = pd.read_csv(_dir[i])
        mn_size = min(mn_size, len(x))
        
        df_nan[i] = (x.isna().sum().values == len(x)).astype('int')
        
        tmp = x.isna().sum()
        
        if i == 0:
            
            for col in x.keys():
                
                isna[col] = 0
        
        for col in tmp.keys():
            
            isna[col] += tmp[col]
        
    
    keys = _ord
    
    if _ord is None:
        
        isna = [(j, i) for i, j in isna.items()]

        isna.sort()

        keys = [j for i,j in isna]
    
    else:
        
        isna = [(isna[keys[i]], keys[i]) for i in range(len(keys))]
        
    ckeys = list_cusum(keys)
    
    values = [i for i, j in isna]
    values = np.array(values)
    
    cusum_nan = np.cumsum(values)
    size = np.repeat(mn_size * len(_dir), 10)

    worst_case_size = size - cusum_nan
    worst_case_size[worst_case_size < 0] = 0
    
    choices_size = list(zip(worst_case_size, ckeys))
    
    nan = np.array(list(df_nan.values()))
    
    return choices_size, nan # nan/sensor [from 1 to 10 in order] for each dataframe

In [5]:
# test_choices, test_nan = is_nan_stat(test_dir, _ord=None)

# features = np.where(test_nan.sum(axis=0) == 0)[0] + 1
# features = list(map(lambda f: f'sensor_{f}', features))

# print(features, '-- test size :', test_choices[1][0])

features = ['sensor_4', 'sensor_6']

# Sampling 

In [6]:
def get_id(_dir):
    
    return os.path.splitext(_dir)[0].split('/')[-1]

def get_traget(_dir):
    
    return metadata.query(f'segment_id == {get_id(_dir)}').values[0, 1]

def get_sample(_dir, sample_size=10, typ='train', seed=None):
    
    sample = None
    skip_counts = 0
    
    seg_id = []
    
    for i in tqdm(range(len(_dir))):
        
        if seed:
            seed += i
        
        x = pd.read_csv(_dir[i])[features].dropna()
                
        if len(x) < sample_size:
            
            if typ == 'test':
                
                raise ValueError(f'len(x) < sample_size, len(x) = {len(x)}, sample_size = {sample_size}, check features choice (nan)')
            
            else:
                
                skip_counts +=1
                
        x = x.sample(n=sample_size, random_state=seed)
        
        _id = get_id(_dir[i])
        _id = [_id] * len(x)
        
        seg_id +=_id

        if typ == 'train':
            
            y = get_traget(_dir[i])
        
        if i == 0:
            
            sample = x
            
            if typ == 'train':
                
                sample['target'] = [y] * len(x)
            
        else:
            
            tmp = x
            
            if typ == 'train':
                
                tmp['target'] = [y] * len(x)
            
            sample = pd.concat([sample, tmp], axis=0)
    
    sample.insert(loc=0, column='segment_id', value=seg_id)
    sample.reset_index(drop=True, inplace=True)
    
    counts = len(_dir) - skip_counts
    
    return sample, counts

In [7]:
sample_size = 1000
seed = 1

test, _ = get_sample(_dir=test_dir, sample_size=sample_size, typ='test', seed=seed)
train, counts = get_sample(_dir=train_dir, sample_size=sample_size, typ='train', seed=seed)

HBox(children=(FloatProgress(value=0.0, max=4520.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4431.0), HTML(value='')))




In [8]:
print('valid sample :', len(np.unique(test['segment_id'])) == len(test_dir))

valid sample : True


# Model

$minimize : \frac{1}{m} \sum_{i=0}^{m} \ (\hat{y} * scale - y)^2$

> $where: \hat{y} = w_0 * d_0 + w_1 * d_1 + .... + w_k * d_k$

> $subject \ to: 0 \leq w_j \leq 1$ ---> (Could be converted into implicit constraints using sigmoid function)

In [9]:
def standardize(x):
    
    return (x - x.mean(axis=0)) / x.std(axis=0)

In [10]:
X = train.to_numpy().reshape(counts, sample_size, 4)
Y = X[:, 0, -1, None].astype('float32')

X = X[:, :, 1:-1].astype('float32')

x_test = test.to_numpy().reshape(len(test_dir), sample_size, 3)
seg_id = x_test[:, 0, 0]

x_test = x_test[:, :, 1:].astype('float32')

standardized = standardize(X)
standardized_test = standardize(x_test)

print(X.shape)
print(Y.shape)

print('\ntest shape : ', x_test.shape)

# x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.1)

# print(x_train.shape)
# print(x_val.shape)

(4431, 1000, 2)
(4431, 1)

test shape :  (4520, 1000, 2)


In [11]:
offset = 1e3
scale = 1e6
step = 0.1

mx = (train['target'].max() + offset) / scale

durations_dist = np.arange(1 , mx, step=step).astype('float32')
durations_dist = durations_dist[None, :]

dist_size = durations_dist.shape[-1]

print('shape : ', durations_dist.shape)

shape :  (1, 481)


In [12]:
class WeightedSum(tf.keras.layers.Layer):
    
    def call(self, inputs):
        
        out = K.sum(inputs * durations_dist, axis=-1)
        out = out[:, None]
        
        return out

In [14]:
inputs = Input(shape=[sample_size, 2])

conv1d = Conv1D(filters=1, kernel_size=sample_size-1, activation='relu')(inputs)

batch_norm = BatchNormalization()(conv1d)

lstm = LSTM(units=1, activation='tanh', return_sequences=True)(batch_norm)
lstm = LSTM(units=dist_size, activation='sigmoid')(lstm) # weights

dropout = Dropout(0.2)(lstm)

outputs = WeightedSum()(dropout)

model = Model(inputs=inputs, outputs=outputs)

model.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 1000, 2)]         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2, 1)              1999      
_________________________________________________________________
batch_normalization_1 (Batch (None, 2, 1)              4         
_________________________________________________________________
lstm_2 (LSTM)                (None, 2, 1)              12        
_________________________________________________________________
lstm_3 (LSTM)                (None, 481)               929292    
_________________________________________________________________
dropout_1 (Dropout)          (None, 481)               0         
_________________________________________________________________
weighted_sum_1 (WeightedSum) (None, 1)                

In [15]:
def mse_loss(y_true, y_pred):
    
    loss = (y_pred - (y_true / scale)) ** 2
    
    return K.mean(loss)
    
def metrice(y_true, y_pred):
    
    msa = K.abs((y_pred * scale - y_true))
    
    return K.mean(msa)

In [16]:
opt = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
model.compile(optimizer=opt, loss=mse_loss, metrics=metrice)

In [17]:
history = model.fit(X, Y, steps_per_epoch=256, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100


Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100


In [18]:
y_test = model.predict(x_test)
y_test *= scale

In [19]:
y_test.min(), y_test.max()

(15095701.0, 35926080.0)

In [20]:
submission = {}

submission['segment_id'] = seg_id.squeeze()
submission['time_to_eruption'] = y_test.squeeze()

submission = pd.DataFrame(submission)

submission.to_csv('submission.csv', encoding='utf-8', index=False)

In [21]:
submission.head(2)

Unnamed: 0,segment_id,time_to_eruption
0,1290851559,26189702.0
1,543483467,25291820.0
