# Libraries

In [0]:
from google.colab import files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from keras.models import Sequential
from keras.layers import Input, Dense, LSTM, Flatten, Dropout, Activation
from keras import Model
from keras import backend as K
import keras.metrics
from keras.layers import Conv1D
from keras.layers import BatchNormalization
import glob, os

# Data Loading

In [0]:
# Upload the Kaggle API key file (kaggle.json) from the local machine through the Colab upload widget
uploaded = files.upload()
# Install the Kaggle command-line client, which the download cell below relies on
!pip install kaggle

Saving kaggle.json to kaggle.json


**If the cell below cannot find the path on its first run, run it a second time (the cause of the first-run failure is not known).**

In [0]:

#Copy key to the relevant folder
!cp kaggle.json /root/.kaggle/kaggle.json
#Download dataset
!kaggle competitions download -c LANL-Earthquake-Prediction
#Unzip dataset
!unzip train.csv.zip

Downloading sample_submission.csv to /content
  0% 0.00/33.3k [00:00<?, ?B/s]
100% 33.3k/33.3k [00:00<00:00, 28.8MB/s]
Downloading test.zip to /content
100% 242M/242M [00:01<00:00, 105MB/s]

Downloading train.csv.zip to /content
 99% 2.01G/2.03G [00:15<00:00, 135MB/s]
100% 2.03G/2.03G [00:16<00:00, 130MB/s]
Archive:  train.csv.zip
  inflating: train.csv               


# Load dataset

Due to the high number of rows, we enable low-memory parsing and specify the exact dtype of each column.

In [0]:
# Read the whole training file and keep only the underlying NumPy array.
# The dtypes are spelled out so pandas does not have to infer (wider) types.
# The rest of the notebook reads column 0 as the signal and column 1 as the
# target (assumes the CSV column order is acoustic_data, time_to_failure).
df = pd.read_csv(
    'train.csv',
    dtype={
        'acoustic_data': np.int16,
        'time_to_failure': np.float32,
    },
    low_memory=True,
).values

# Split dataset

In [6]:
# Row indices used by gen_index as window boundaries; according to the comments
# in gen_index each value is the row index of an earthquake in the training data.
events = np.array([
    5656574,
    50085878,
    104677356,
    138772453,
    187641820,
    218652630,
    245829585,
    307838917,
    338276287,
    375377848,
    419368880,
    461811623,
    495800225,
    528777115,
    585568144,
    621985673,
])


def gen_index(seg_len, event_indices=None):
    """Generate the start row of every segment used to split the dataset.

    The rows are cut into windows delimited by the event indices (row 0 is the
    start of the first window). Each window is filled with as many
    non-overlapping segments of ``seg_len`` rows as fit. The rows left over in
    a window are spent as small random gaps in front of the segments, so the
    segment positions vary from call to call. Rows after the last event index
    are not used.

    Parameters
    ----------
    seg_len : int
        Number of rows in one segment. Must be positive.
    event_indices : sequence of int, optional
        Window boundaries (row indices). Defaults to the module-level
        ``events`` array.

    Returns
    -------
    numpy.ndarray
        1-D int64 array of segment start rows, in increasing order within each
        window. It is empty (but still integer typed, so it can be used as an
        index) when no window is long enough for a segment.

    Raises
    ------
    ValueError
        If ``seg_len`` is not positive.
    """
    if seg_len <= 0:
        raise ValueError("seg_len must be a positive number of rows")

    # Resolve the default lazily so the global is only needed when it is used
    if event_indices is None:
        event_indices = events

    # Prepending row 0 lets the stretch before the first event be handled
    # exactly like every other window (no duplicated code path).
    boundaries = [0] + [int(e) for e in event_indices]

    list_index = []
    for win_start, win_end in zip(boundaries[:-1], boundaries[1:]):
        win_len = win_end - win_start

        # Number of whole segments that fit in this window
        num_tables = int(win_len // seg_len)

        # Spare rows of the window, available as random slack
        tot_lines = win_len - seg_len * num_tables

        ind_min = win_start
        for _ in range(num_tables):
            # While slack remains, shift the segment by up to a tenth of it.
            # Each shift is deducted from the slack, so the shifts can never
            # push the last segment past the end of the window.
            if tot_lines:
                u = random.randint(0, tot_lines // 10)
                tot_lines -= u
            else:
                u = 0

            ind_min += u
            list_index.append(ind_min)

            # The next segment starts after this one at the earliest
            ind_min += seg_len

    return np.array(list_index, dtype=np.int64)

Using TensorFlow backend.


# Neural Network + training

In [0]:
# --- Hyperparameters ---
b = 64            # batch size
seg_len = 150000  # number of rows in each sub-table
epoch = 20        # number of passes over the training indexes

# Start row of every sub-table, put in random order
ind = gen_index(seg_len)
np.random.shuffle(ind)

# First 80% of the shuffled start rows go to training, the rest to testing
split_at = int(len(ind) * 0.8)
ind_train, ind_test = ind[:split_at], ind[split_at:]
               
               
### Creation of a Conv1D + LSTM regressor using the Keras functional API
inputs = Input(shape=(seg_len, 1), name='input')

# Two strided convolutions shorten the sequence before it reaches the LSTMs
x = Conv1D(10, kernel_size=10, strides=5)(inputs)
x = BatchNormalization()(x)
# The second convolution must consume x, the output of the previous layers.
# Applying it to `inputs` would leave the first Conv1D/BatchNormalization pair
# disconnected from the model output.
x = Conv1D(100, kernel_size=100, strides=50)(x)
x = BatchNormalization()(x)

# Stacked LSTMs: the first returns the whole sequence, the second only its last state
x = LSTM(128, return_sequences=True)(x)
x = BatchNormalization()(x)
x = LSTM(32)(x)
x = BatchNormalization()(x)

# Fully connected head with dropout
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)

# We make a linear regression for last layer
predict = Dense(1, activation='linear')(x)
model = Model(inputs=inputs, outputs=predict)
# Trained on mean squared error; mean absolute error is reported as a metric
model.compile(loss='mean_squared_error', optimizer='adam', metrics=[keras.metrics.mae])

# The test set never changes during training, so its tensors are built once
# instead of once per batch.
# np.stack puts the segments on a new leading axis, giving (n_segments, seg_len);
# the trailing axis is the single input channel expected by the model.
x_test = np.stack([df[start:start + seg_len, 0] for start in ind_test])[:, :, np.newaxis]
# NOTE(review): the label is read one row after the segment start. If the
# target should be the time to failure at the END of the segment, this should
# be ind_test + seg_len - 1 (same for Y_train below) - confirm.
Y_test = df[ind_test + 1, 1]

# Only full batches are used; the few indexes left over differ from epoch to
# epoch because the training indexes are reshuffled every time.
n_batches = len(ind_train) // b

# We loop over a number of epoch
for ep in range(epoch):
    # Randomisation of train set
    np.random.shuffle(ind_train)

    # Labels in the same (shuffled) order as ind_train
    Y_train = df[ind_train + 1, 1]

    # We loop over the full train set taking a batch of b subtables. To save
    # memory space the sub-tables are only materialised for the current batch.
    for j in range(n_batches):
        # Batch j covers positions b*j .. b*(j+1)-1 of the shuffled indexes
        batch_idx = ind_train[b * j:b * (j + 1)]
        x_train = np.stack([df[start:start + seg_len, 0] for start in batch_idx])[:, :, np.newaxis]
        # Labels are sliced with the same positions so they stay aligned with x_train
        y_batch = Y_train[b * j:b * (j + 1)]

        # We fit the model. We take epochs=1 so it sees every batch only once
        # per pass; the outer loop provides the repeated passes over the data.
        model.fit(x_train, y_batch, epochs=1, batch_size=b,
                  validation_data=(x_test, Y_test))

Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 64 samples, validate on 828 samples
Epoch 1/1
Train on 6