# NNs don't do well on this dataset?

In [24]:
import numpy as np
import pandas as pd
import pickle
import os, sys
import os.path as osp
from glob import glob
import matplotlib.pyplot as plt
import time

from multiprocessing import Pool, Process, Queue

from sklearn.svm import SVR, NuSVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

%matplotlib inline

In [2]:
# Directory that is searched for the per-segment '*.pickle' files.
SEGMENTS_DIR = '/workspace/persistent-data/earthquake/segments'

In [3]:
# Paths of every pickled segment under SEGMENTS_DIR (glob order is unspecified).
segment_pattern = osp.join(SEGMENTS_DIR, '*.pickle')
SEGMENTS = glob(segment_pattern)

In [4]:
len(SEGMENTS)

153600

In [5]:
# Load the first segment so its structure can be inspected.
# NOTE: unpickling can execute arbitrary code -- only do this with trusted files.
with open(SEGMENTS[0], 'rb') as segment_file:
    df = pickle.load(segment_file)

In [6]:
df.head(3)

Unnamed: 0,t_minus,acoustic_data
0,-1.4681,-13
1,-1.4681,-15
2,-1.4681,-7


In [7]:
# Directory that receives one summary pickle per segment (written by write_summary).
OUT_PICKLES = '/workspace/persistent-data/earthquake/summaries'

In [8]:
# Create the output directory (and any missing parents). Done in pure Python
# rather than via an unquoted shell interpolation, so paths containing spaces
# work and the cell does not depend on a POSIX shell.
os.makedirs(OUT_PICKLES, exist_ok=True)

In [9]:
def write_summary(segfile):
    """Reduce one pickled segment to summary statistics and pickle the result.

    Reads the DataFrame stored at ``segfile``, computes the min, max, mean and
    standard deviation of its ``acoustic_data`` column, takes the last
    ``t_minus`` value as the target, and writes those five values as a dict to
    ``OUT_PICKLES/<segment stem>.pickle``.

    Parameters
    ----------
    segfile : str
        Path to a pickled DataFrame with ``acoustic_data`` and ``t_minus``
        columns.

    Raises
    ------
    KeyError
        If either expected column is missing.
    IndexError
        If the segment has no rows (there is no last ``t_minus`` value).
    """
    df = pd.read_pickle(segfile)
    signal = df['acoustic_data']

    summary = {
        'f_min': signal.min(),
        'f_max': signal.max(),
        'f_mean': signal.mean(),
        'f_std': signal.std(),
        # The label for the whole segment is the t_minus of its final row.
        'target': df['t_minus'].iloc[-1],
    }

    # splitext strips only the final extension, so a stem that itself contains
    # dots (e.g. "seg.001.pickle") keeps its full name instead of being cut at
    # the first dot and colliding with other segments.
    stem = osp.splitext(osp.basename(segfile))[0]
    out_fn = osp.join(OUT_PICKLES, stem + '.pickle')
    with open(out_fn, 'wb') as f:
        pickle.dump(summary, f)

In [10]:
def do_process(q, processor):
    """Consume elements from ``q`` and hand each to ``processor`` until the queue is empty.

    Parameters
    ----------
    q : queue-like
        Object whose ``get(timeout=...)`` raises ``queue.Empty`` when nothing
        arrives in time (``queue.Queue`` and ``multiprocessing.Queue`` both do).
    processor : callable
        Called with each element taken from the queue. Exceptions it raises
        propagate to the caller.

    Notes
    -----
    Only ``queue.Empty`` ends the loop. Any other error raised by ``q.get``
    (a broken queue, ``KeyboardInterrupt``, ...) propagates instead of being
    silently mistaken for "no more work".
    """
    # Local import keeps the function self-contained; multiprocessing queues
    # raise this same exception on a timed-out get().
    from queue import Empty

    while True:
        try:
            element = q.get(timeout=0.5)
        except Empty:
            # Nothing arrived within the timeout: treat the queue as drained.
            break

        processor(element)

In [11]:
# Pre-load the queue with every segment path; it is sized to hold them all,
# so none of the put() calls can block.
q = Queue(maxsize=len(SEGMENTS))
for segment_path in SEGMENTS:
    q.put(segment_path)

In [14]:
def timestamp():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    now = time.localtime()
    return time.strftime('%Y-%m-%d %H:%M:%S', now)
timestamp()

'2019-01-30 05:29:03'

In [15]:
# Fan the queued segments out to 24 worker processes, each running
# do_process(q, write_summary).
procs = [Process(name='process_%d' % i, target=do_process, args=(q, write_summary)) for i in range(24)]
for p in procs:
    p.start()

# Report the remaining queue size every 10 s. Also stop polling once every
# worker has exited, so a pool that died early cannot leave this loop spinning.
while True:
    sz = q.qsize()
    print('[%s] %d' % (timestamp(), sz))
    if sz == 0 or not any(worker.is_alive() for worker in procs):
        break
    time.sleep(10)

# An empty queue only means every item has been *taken*, not that it has been
# fully processed. The workers return on their own once q.get() times out, so
# wait for them rather than terminate()-ing: killing a worker mid-write could
# leave a truncated summary pickle behind.
for p in procs:
    p.join()

[2019-01-30 05:30:17] 153569
[2019-01-30 05:30:27] 95626
[2019-01-30 05:30:37] 45588
[2019-01-30 05:30:47] 0


In [16]:
# Collect the per-segment summary pickles. 'all.pickle' (the combined frame
# that a later cell saves into this same directory) is skipped, so re-running
# the notebook does not feed a whole DataFrame into the list of summary dicts.
SUMMARIES = [path for path in glob(osp.join(OUT_PICKLES, '*.pickle'))
             if osp.basename(path) != 'all.pickle']

In [17]:
# Load every summary dict and assemble them into one DataFrame, one row per file.
summary_records = [pd.read_pickle(path) for path in SUMMARIES]
sumdf = pd.DataFrame.from_records(summary_records)

In [18]:
len(sumdf)

153600

In [19]:
sumdf.head(3)

Unnamed: 0,f_max,f_mean,f_min,f_std,target
0,43.0,5.19873,-28.0,5.836566,-1.468095
1,17.0,4.991699,-6.0,2.917076,-1.463795
2,37.0,4.955566,-23.0,5.612995,-1.465895


In [20]:
# Persist the combined summary frame as OUT_PICKLES/all.pickle.
all_summaries_path = osp.join(OUT_PICKLES, 'all.pickle')
sumdf.to_pickle(all_summaries_path)

In [9]:
# Reload the combined summary frame from OUT_PICKLES/all.pickle.
combined_path = osp.join(OUT_PICKLES, 'all.pickle')
sumdf = pd.read_pickle(combined_path)

In [12]:
# Shuffle the rows before the positional train/test split below. A fixed seed
# keeps the split -- and therefore the reported MAE -- reproducible across
# kernel restarts. RandomState is used for compatibility with older numpy.
shuffle_rng = np.random.RandomState(42)
sumdf = sumdf.iloc[shuffle_rng.permutation(len(sumdf))]

In [14]:
# The first 90% of the (shuffled) rows form the training split; the SVR uses
# library-default hyperparameters.
N_train = int(len(sumdf) * 0.9)
svr = SVR()

In [15]:
# Fit the scaler on the training rows only, across every column of the frame
# (the target column is standardised along with the features).
train_values = sumdf.iloc[:N_train].values
scaler = StandardScaler()
scaler.fit(train_values)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [16]:
# Standardise all rows (training and held-out) with the already-fitted scaler.
all_values = sumdf.values
vals = scaler.transform(all_values)

In [17]:
# Train on the first four columns as features and the last column as the
# (scaled) regression target.
X_train, y_train = vals[:N_train, :4], vals[:N_train, -1]
svr.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [18]:
# Predict the scaled target for the held-out rows (everything past N_train).
X_holdout = vals[N_train:, :4]
preds = svr.predict(X_holdout)

In [19]:
# Undo the scaling. The scaler was fit on the full-width frame, so the
# predictions are attached as the last column next to the four feature columns
# before inverting. Note: this rebinds `preds` from the raw prediction array to
# a DataFrame, so the cell cannot be re-run without re-running the predict cell.
scaled_with_pred = np.hstack((vals[N_train:, :4], preds[:, np.newaxis]))
preds = pd.DataFrame(scaler.inverse_transform(scaled_with_pred),
                     columns=sumdf.columns.tolist()[:4] + ['pred'])

In [23]:
# Mean absolute error on the held-out rows, compared against the unscaled targets.
y_holdout = sumdf['target'].iloc[N_train:]
print('mae=%.4f' % mean_absolute_error(y_holdout, preds['pred']))

mae=2.7308


In [25]:
# Same experiment with a default-parameter NuSVR: first four columns are the
# features, the last column is the (scaled) target.
nu_X_train, nu_y_train = vals[:N_train, :4], vals[:N_train, -1]
nusvr = NuSVR()
nusvr.fit(nu_X_train, nu_y_train)

NuSVR(C=1.0, cache_size=200, coef0=0.0, degree=3, gamma='auto_deprecated',
   kernel='rbf', max_iter=-1, nu=0.5, shrinking=True, tol=0.001,
   verbose=False)

In [26]:
# NuSVR predictions (still in scaled units) for the held-out rows.
nu_X_holdout = vals[N_train:, :4]
nupreds = nusvr.predict(nu_X_holdout)

In [27]:
# Undo the scaling for the NuSVR predictions: attach them as the last column
# next to the four feature columns, then invert with the fitted scaler. Note:
# this rebinds `nupreds` from the raw prediction array to a DataFrame.
nu_scaled_with_pred = np.hstack((vals[N_train:, :4], nupreds[:, np.newaxis]))
nupreds = pd.DataFrame(scaler.inverse_transform(nu_scaled_with_pred),
                       columns=sumdf.columns.tolist()[:4] + ['pred'])

In [28]:
# Mean absolute error of the NuSVR on the held-out rows, against the unscaled targets.
nu_y_holdout = sumdf['target'].iloc[N_train:]
print('mae=%.4f' % mean_absolute_error(nu_y_holdout, nupreds['pred']))

mae=2.7471
