**Name:  Nic Gibson**

**INGV Volcanic Eruption**

## 1. Data preparation

In [1]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torchvision.utils import make_grid
from torch.utils.data import DataLoader, random_split
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR
from torchsummary import summary
import numpy as np
import pandas as pd

import plotly.express as px

device = torch.device("cuda")

In [2]:
args={}
batch_size = 32
test_batch_size = 1000
epochs = 5
learning_rate = 0.01

# Download data

In [3]:
import os

train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("sample_submission.csv")

print(train_data.shape)
print(test_data.shape)

(4431, 2)
(4520, 2)


# Visualize the data

In [4]:
test_data

Unnamed: 0,segment_id,time_to_eruption
0,1000213997,0
1,100023368,0
2,1000488999,0
3,1001028887,0
4,1001857862,0
...,...,...
4515,996704281,0
4516,997630809,0
4517,998072137,0
4518,998136924,0


In [5]:
train_data

Unnamed: 0,segment_id,time_to_eruption
0,1136037770,12262005
1,1969647810,32739612
2,1895879680,14965999
3,2068207140,26469720
4,192955606,31072429
...,...,...
4426,873340274,15695097
4427,1297437712,35659379
4428,694853998,31206935
4429,1886987043,9598270


In [6]:
time_erupt_dist = px.histogram(
    train_data, 
    x="time_to_eruption",
    width=800,
    height=500,
    nbins=100,
    title='Time to eruption distribution'
)

time_erupt_dist.show()

In [7]:
fig = px.line(
    train_data, 
    y="time_to_eruption",
    width=1000,
    height=400,
    title='Time to eruption'
)

fig.show()

In [8]:
# train_dataset, val_dataset = random_split(train_data, [2431,2000])
# train_loader = DataLoader(train_dataset, batch_size=batch_size,
#                           shuffle=True, num_workers=2)


In [9]:
def agg_stats(df, idx):
    df = df.agg(['sum', 'min', "mean", "std", "median", "skew", "kurtosis"])
    df_flat = df.stack()
    df_flat.index = df_flat.index.map('{0[1]}_{0[0]}'.format)
    df_out = df_flat.to_frame().T
    df_out["segment_id"] = int(idx)
    return df_out

In [16]:

import glob

pathnames = glob.glob('train/*')

train_ids = []
for data in pathnames:
    train_ids.append(data[6:-4])

train_set = pd.DataFrame()
for signal in train_ids:
    df = pd.read_csv('train/{}.csv'.format(signal), delim_whitespace=True)
    train_set = train_set.append(agg_stats(df, signal))

print(train_set)

    sensor_1_sum  sensor_2_sum  sensor_3_sum  sensor_4_sum  sensor_5_sum  \
0      -124407.0     -162810.0       17809.0       62796.0      -14241.0   
0      -250585.0           0.0     -296849.0       -4583.0       52757.0   
0            0.0     -102300.0      223784.0     -134359.0           0.0   
0        57266.0       -8403.0       94099.0      -65566.0       -9402.0   
0        54595.0     -157032.0     -325455.0     -196279.0       -8188.0   
..           ...           ...           ...           ...           ...   
0      -223048.0      116359.0      -54704.0      242666.0       13343.0   
0      -559204.0    -5784228.0     3423636.0     2638924.0     -272457.0   
0         4159.0      329467.0      -79815.0     -196778.0     -101813.0   
0       -54923.0     -162175.0     -492176.0     -243267.0           0.0   
0      -316444.0      -82519.0           0.0       39261.0       18680.0   

    sensor_6_sum  sensor_7_sum  sensor_8_sum  sensor_9_sum  sensor_10_sum  \
0         

In [26]:
testFiles = glob.glob('test/*')

test_ids = []
for data in testFiles:
    test_ids.append(data[5:-4])

test_set = pd.DataFrame()
for signal in test_ids:
    df = pd.read_csv('test/{}.csv'.format(signal), delim_whitespace=True)
    test_set = test_set.append(agg_stats(df, signal))

print(test_set)

    sensor_1_sum  sensor_2_sum  sensor_3_sum  sensor_4_sum  sensor_5_sum  \
0       156288.0      116423.0     -346459.0      234987.0       23208.0   
0      -139697.0      192683.0     -310716.0     -243790.0        4359.0   
0       -24461.0           0.0       23203.0      -16622.0       61890.0   
0       209321.0       28663.0       64365.0     -355141.0     -109033.0   
0            0.0     -140496.0      -56523.0       73073.0           0.0   
..           ...           ...           ...           ...           ...   
0       107166.0    -1382718.0      525564.0     -499632.0     -454544.0   
0        95249.0      176066.0      137963.0       97266.0       59958.0   
0        34815.0        3659.0       -9945.0        5460.0       10354.0   
0      1021121.0      438450.0      606042.0     3482955.0     -359414.0   
0      -125205.0      -56434.0      -46305.0     -136610.0           0.0   

    sensor_6_sum  sensor_7_sum  sensor_8_sum  sensor_9_sum  sensor_10_sum  \
0       13

In [35]:
stats = train_set
features = list(stats.drop(["segment_id"], axis=1).columns)
target_name = ["time_to_eruption"]
stats = stats.merge(train_data, on="segment_id")
stats.describe()



Unnamed: 0,sensor_1_sum,sensor_2_sum,sensor_3_sum,sensor_4_sum,sensor_5_sum,sensor_6_sum,sensor_7_sum,sensor_8_sum,sensor_9_sum,sensor_10_sum,...,sensor_3_kurtosis,sensor_4_kurtosis,sensor_5_kurtosis,sensor_6_kurtosis,sensor_7_kurtosis,sensor_8_kurtosis,sensor_9_kurtosis,sensor_10_kurtosis,segment_id,time_to_eruption
count,4431.0,4431.0,4431.0,4431.0,4431.0,4431.0,4431.0,4431.0,4431.0,4431.0,...,3996.0,4431.0,3836.0,4431.0,4389.0,3991.0,4307.0,4407.0,4431.0,4431.0
mean,-31546.37,66422.64,-71154.97,-6911.549,170388.8,8371.901,-9131.113,-393.0828,-6063.665,-19701.77,...,1.779537,2.409261,7.530025,1.393769,8.016503,4.359954,2.248869,2.614465,1074694000.0,22848910.0
std,1074697.0,4234405.0,1454592.0,1199746.0,4345850.0,1432993.0,1206980.0,1390008.0,1227877.0,2005872.0,...,10.429411,7.86191,15.242927,12.582408,284.562984,22.386326,7.854015,8.776947,616196600.0,13484390.0
min,-35728930.0,-14713480.0,-34675490.0,-27296330.0,-6630852.0,-40274250.0,-47237660.0,-58156440.0,-25193250.0,-64194810.0,...,-1.163593,-0.869809,-0.936414,-1.443972,-0.861134,-1.078357,-0.876845,-1.402664,513181.0,6250.0
25%,-107299.5,-85853.0,-108719.5,-98454.0,-26998.5,-176236.0,-139680.5,-110166.5,-82632.5,-198759.0,...,0.174041,0.220774,0.843226,-0.063266,0.243225,0.036524,0.256607,0.30438,552793400.0,11270160.0
50%,1062.0,0.0,0.0,-1022.0,0.0,1836.0,7731.0,0.0,0.0,5861.0,...,0.426267,0.574138,2.154965,0.073785,0.764163,0.273745,0.669426,0.795795,1066153000.0,22465590.0
75%,109530.0,83618.5,82704.5,98504.5,26455.5,174418.5,131365.0,117311.0,87844.5,227012.5,...,1.183498,1.757019,6.659629,0.316384,2.580724,1.401245,1.803676,2.358021,1606350000.0,34343560.0
max,20473630.0,268740400.0,33312650.0,38941290.0,179985000.0,50942900.0,20353390.0,44061510.0,45129420.0,61340240.0,...,448.313264,163.849752,195.422373,414.113602,18806.195641,676.327801,264.051749,234.337549,2146939000.0,49046090.0


# Use Light GBM

In [40]:
import lightgbm as lgbm
from sklearn.model_selection import KFold
import gc

random_state = 38
n_fold = 7
folds = KFold(n_splits=n_fold, shuffle=True, random_state=random_state)

data = stats

params = {
    "n_estimators": 2000,
    "boosting_type": "gbdt",
    "metric": "mae",
    "num_leaves": 66,
    "learning_rate": 0.005,
    "verbose": 0,
    "random_state": random_state,
}

sub_preds = np.zeros(test_data.shape[0])
feature_importance = pd.DataFrame(index=list(range(n_fold)), columns=features)

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(data)):
    trn_x, trn_y = data[features].iloc[trn_idx], data[target_name].iloc[trn_idx]
    val_x, val_y = data[features].iloc[val_idx], data[target_name].iloc[val_idx]
    
    model = lgbm.LGBMRegressor(**params)
    
    model.fit(trn_x, trn_y, 
            eval_set= [(trn_x, trn_y), (val_x, val_y)], 
            eval_metric="mae", verbose=0, early_stopping_rounds=150
           )

    feature_importance.iloc[n_fold, :] = model.feature_importances_
    
    sub_preds += model.predict(test_data[features], num_iteration=model.best_iteration_) / folds.n_splits

You can set `force_col_wise=true` to remove the overhead.


KeyError: "None of [Index(['sensor_1_sum', 'sensor_2_sum', 'sensor_3_sum', 'sensor_4_sum',\n       'sensor_5_sum', 'sensor_6_sum', 'sensor_7_sum', 'sensor_8_sum',\n       'sensor_9_sum', 'sensor_10_sum', 'sensor_1_min', 'sensor_2_min',\n       'sensor_3_min', 'sensor_4_min', 'sensor_5_min', 'sensor_6_min',\n       'sensor_7_min', 'sensor_8_min', 'sensor_9_min', 'sensor_10_min',\n       'sensor_1_mean', 'sensor_2_mean', 'sensor_3_mean', 'sensor_4_mean',\n       'sensor_5_mean', 'sensor_6_mean', 'sensor_7_mean', 'sensor_8_mean',\n       'sensor_9_mean', 'sensor_10_mean', 'sensor_1_std', 'sensor_2_std',\n       'sensor_3_std', 'sensor_4_std', 'sensor_5_std', 'sensor_6_std',\n       'sensor_7_std', 'sensor_8_std', 'sensor_9_std', 'sensor_10_std',\n       'sensor_1_median', 'sensor_2_median', 'sensor_3_median',\n       'sensor_4_median', 'sensor_5_median', 'sensor_6_median',\n       'sensor_7_median', 'sensor_8_median', 'sensor_9_median',\n       'sensor_10_median', 'sensor_1_skew', 'sensor_2_skew', 'sensor_3_skew',\n       'sensor_4_skew', 'sensor_5_skew', 'sensor_6_skew', 'sensor_7_skew',\n       'sensor_8_skew', 'sensor_9_skew', 'sensor_10_skew', 'sensor_1_kurtosis',\n       'sensor_2_kurtosis', 'sensor_3_kurtosis', 'sensor_4_kurtosis',\n       'sensor_5_kurtosis', 'sensor_6_kurtosis', 'sensor_7_kurtosis',\n       'sensor_8_kurtosis', 'sensor_9_kurtosis', 'sensor_10_kurtosis'],\n      dtype='object')] are in the [columns]"