In [1]:
import pandas as pd
import numpy as np
import os
import pickle

from sklearn.linear_model import LinearRegression

from scipy.signal import hilbert
from scipy.signal import hann
from scipy.signal import convolve
from scipy import stats

# Root directory of the dataset (absolute Windows path, backslashes escaped).
DATA_PATH = "D:\\LANLEarthquakeData"
# File and directory locations derived from DATA_PATH.
TRAIN_DATA_PATH = f"{DATA_PATH}\\train.csv"
TEST_DATA_PATH = f"{DATA_PATH}\\test"
# NOTE(review): the name is spelled "SUBMISSON" (missing an "I"); left as-is
# because cells outside this view may reference it.
SUBMISSON_PATH = f"{DATA_PATH}\\sample_submission.csv"
# Window length, in rows, used by the rolling feature computations below.
TRAINING_DERIVED_ROW_COUNT = 150_000
# Not referenced by any visible cell -- presumably switches between full and
# partial reads; confirm against the rest of the notebook.
READ_WHOLE_TRAIN_DATA = True
READ_WHOLE_TEST_DATA = True
# Not referenced by any visible cell -- presumably output locations for cached
# NumPy arrays and pickles; confirm.
NP_DATA_PATH = f"{DATA_PATH}\\np"
PICKLE_PATH = f"{DATA_PATH}\\pickle"

# Used by the loading cell to derive how many rows to read; presumably the total
# number of rows in train.csv -- confirm.
TOTAL_ROW_COUNT = 629145480

In [2]:
%%time
# Load the first TOTAL_ROW_COUNT // 1500 rows of the training CSV with compact dtypes.
# `0*TOTAL_ROW_COUNT//1500` evaluates to 0, so no rows are skipped.
df = pd.read_csv(TRAIN_DATA_PATH, dtype={'acoustic_data': np.int16, 'time_to_failure': np.float32}, nrows=TOTAL_ROW_COUNT//1500, skiprows=0*TOTAL_ROW_COUNT//1500)

Wall time: 135 ms


In [3]:
def add_trend_feature(arr, abs_values=False):
    """Return the slope of a straight line fitted to ``arr`` against its positions.

    Args:
        arr: One-dimensional sequence of numeric samples.
        abs_values: When true, the fit is performed on ``np.abs(arr)``.

    Returns:
        The single coefficient of a ``LinearRegression`` fitted with the
        positions ``0 .. len(arr) - 1`` as the only feature.
    """
    target = np.abs(arr) if abs_values else arr
    # One feature column holding the sample positions.
    positions = np.arange(len(arr)).reshape(-1, 1)
    model = LinearRegression()
    model.fit(positions, target)
    return model.coef_[0]

def add_trend_feature_abs(arr):
    """Return the trend slope of the absolute values of ``arr``.

    Single-argument wrapper around ``add_trend_feature`` with
    ``abs_values=True``, so it can be passed where a one-argument callable is
    required.
    """
    slope = add_trend_feature(arr, abs_values=True)
    return slope

def classic_sta_lta(x, length_sta, length_lta):
    """Compute the classic STA/LTA ratio of a signal.

    The short-term average (STA) and long-term average (LTA) are moving
    averages of the squared signal over ``length_sta`` and ``length_lta``
    samples, obtained from differences of a cumulative sum.

    Args:
        x: One-dimensional numeric signal (any dtype convertible to float64).
        length_sta: Length of the short window in samples (positive int).
        length_lta: Length of the long window in samples (positive int).

    Returns:
        A float64 ndarray of the same length as ``x`` holding STA / LTA. The
        first ``length_lta - 1`` entries are 0 because the STA is zeroed there.
    """
    # Square in float64: squaring a narrow integer dtype such as int16 wraps
    # around for |x| > 181 and silently corrupts the energy sum.
    energy = np.cumsum(np.asarray(x, dtype=np.float64) ** 2)

    # Windowed sums are differences of the cumulative sum; entries before a
    # full window is available keep the plain cumulative value.
    sta = energy.copy()
    lta = energy.copy()
    sta[length_sta:] = energy[length_sta:] - energy[:-length_sta]
    sta /= length_sta
    lta[length_lta:] = energy[length_lta:] - energy[:-length_lta]
    lta /= length_lta

    # Zero the STA until the long window is (almost) filled.
    sta[:length_lta - 1] = 0

    # Avoid division by zero by clamping the LTA to the smallest positive float.
    # (np.float64 replaces the np.float alias, which was removed in NumPy 1.24.)
    tiny = np.finfo(np.float64).tiny
    lta[lta < tiny] = tiny

    return sta / lta

def mean_change_abs(x):
    """Return the mean of the first differences of ``x``.

    Note that, despite the ``_abs`` suffix, no absolute value is taken: the
    result is the average signed step between consecutive samples.
    """
    steps = np.diff(x)
    return np.mean(steps)

def mean_change_rate(x):
    """Return the mean relative change between consecutive samples of ``x``.

    The relative change at position ``i`` is ``(x[i + 1] - x[i]) / x[i]``.
    Zero changes and non-finite values (produced when ``x[i]`` is 0) are
    excluded from the average.

    The previous implementation averaged the output of ``np.nonzero``, i.e.
    the *positions* of the non-zero rates rather than the rates themselves.

    Args:
        x: One-dimensional numeric sequence.

    Returns:
        The mean of the finite, non-zero relative changes, or ``nan`` when
        there are none.
    """
    values = np.asarray(x, dtype=np.float64)
    # Division by a zero sample yields inf/nan; those are filtered out below.
    with np.errstate(divide="ignore", invalid="ignore"):
        rates = np.diff(values) / values[:-1]
    rates = rates[np.isfinite(rates) & (rates != 0)]
    if rates.size == 0:
        return np.nan
    return rates.mean()

def abs_max(x):
    """Return the largest absolute value found in ``x``."""
    magnitudes = np.absolute(x)
    return magnitudes.max()

def abs_min(x):
    """Return the smallest absolute value found in ``x``."""
    magnitudes = np.absolute(x)
    return magnitudes.min()

def abs_mean(x):
    """Return the mean of the absolute values of ``x``."""
    magnitudes = np.absolute(x)
    return magnitudes.mean()

def abs_std(x):
    """Return the standard deviation of the absolute values of ``x``.

    The ``std`` method of whatever ``np.absolute(x)`` returns is used with its
    default arguments.
    """
    magnitudes = np.absolute(x)
    return magnitudes.std()

def q95(x):
    """Return the 0.95 quantile of ``x``."""
    return np.quantile(x, q=0.95)

def q99(x):
    """Return the 0.99 quantile of ``x``."""
    return np.quantile(x, q=0.99)

def q05(x):
    """Return the 0.05 quantile of ``x``."""
    return np.quantile(x, q=0.05)

def q01(x):
    """Return the 0.01 quantile of ``x``."""
    return np.quantile(x, q=0.01)

def abs_q95(x):
    """Return the 0.95 quantile of the absolute values of ``x``."""
    magnitudes = np.absolute(x)
    return np.quantile(magnitudes, q=0.95)

def abs_q99(x):
    """Return the 0.99 quantile of the absolute values of ``x``."""
    magnitudes = np.absolute(x)
    return np.quantile(magnitudes, q=0.99)

def abs_q05(x):
    """Return the 0.05 quantile of the absolute values of ``x``."""
    magnitudes = np.absolute(x)
    return np.quantile(magnitudes, q=0.05)

def abs_q01(x):
    """Return the 0.01 quantile of the absolute values of ``x``."""
    magnitudes = np.absolute(x)
    return np.quantile(magnitudes, q=0.01)

In [4]:
%%time
# Built-in rolling statistics of the acoustic signal over windows of
# TRAINING_DERIVED_ROW_COUNT rows. Each column is named after the Rolling
# method that produces it, so one shared window object and a loop suffice.
_acoustic_windows = df["acoustic_data"].rolling(window=TRAINING_DERIVED_ROW_COUNT)
for _stat in ("mean", "sum", "median", "var", "std", "min", "max", "cov", "skew", "kurt"):
    df[_stat] = getattr(_acoustic_windows, _stat)()

Wall time: 896 ms


In [6]:
%%time
# Custom rolling features of the acoustic signal over windows of
# TRAINING_DERIVED_ROW_COUNT rows.
#
# Every column is named after the function that computes it. Deriving the
# column name from the function itself rules out the copy-paste mismatch the
# previous version had, where "abs_mean" was filled by abs_max and "abs_std"
# by abs_min.
_acoustic_windows = df["acoustic_data"].rolling(window=TRAINING_DERIVED_ROW_COUNT)
for _feature in (
    mean_change_abs,
    mean_change_rate,
    abs_max,
    abs_min,
    abs_mean,
    abs_std,
    q95,
    q99,
    q05,
    q01,
    abs_q95,
    abs_q99,
    abs_q05,
    abs_q01,
):
    # raw=False hands each window to the function as a pandas Series.
    df[_feature.__name__] = _acoustic_windows.apply(_feature, raw=False)

  return getattr(obj, method)(*args, **kwds)


KeyboardInterrupt: 

In [7]:
# Display the frame to inspect the feature columns added by the cells above.
df

Unnamed: 0,acoustic_data,time_to_failure,mean,sum,median,var,std,min,max,cov,skew,kurt,mean_change_abs,mean_change_rate,abs_max
0,12,1.469100,,,,,,,,,,,,,
1,6,1.469100,,,,,,,,,,,,,
2,8,1.469100,,,,,,,,,,,,,
3,5,1.469100,,,,,,,,,,,,,
4,8,1.469100,,,,,,,,,,,,,
5,8,1.469100,,,,,,,,,,,,,
6,9,1.469100,,,,,,,,,,,,,
7,7,1.469100,,,,,,,,,,,,,
8,-5,1.469100,,,,,,,,,,,,,
9,3,1.469100,,,,,,,,,,,,,


In [None]:
%time
df["trend"] = df["acoustic_data"].rolling(window=TRAINING_DERIVED_ROW_COUNT).apply(add_trend_feature,raw=False)
df["trend_abs"] = df["acoustic_data"].rolling(window=TRAINING_DERIVED_ROW_COUNT).apply(add_trend_feature_abs,raw=False)