https://www.youtube.com/watch?v=TffGdSsWKlA

https://github.com/llSourcell/Kaggle_Earthquake_challenge/blob/master/Earthquake_Challenge.ipynb

https://www.kaggle.com/latimerb/earthquake-prediction-getting-started

# Step 1. Install & Import Dependencies

In [None]:
# to access Kaggle datasets
!pip install kaggle

# Math operations
!pip install numpy

# Machine learning
!pip install catboost

In [1]:
%matplotlib inline

# data preprocessing
import pandas as pd

# math operations
import numpy as np

# machine learning
from catboost import CatBoostRegressor, Pool

# data scaling
from sklearn.preprocessing import StandardScaler

# hyperparameter optimization
from sklearn.model_selection import GridSearchCV

# support vector machine model
from sklearn.svm import NuSVR, SVR

# kernel ridge model
from sklearn.kernel_ridge import KernelRidge

# data visualization
import matplotlib.pyplot as plt

# Step 2. Import dataset from Kaggle

In [None]:
!kaggle competitions list

In [None]:
# download data
!kaggle competitions download -c LANL-Earthquake-Prediction

In [None]:
!unzip train.csv.zip

In [None]:
!ls

# Step 3. Exploratory Data Analysis

In [None]:
# Extract training data into a dataframe for further manipulation
train = pd.read_csv('train.csv', 
                    dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

In [None]:
print(train.shape)
train.head(10)

In [None]:
train_ad_sample_df = train['acoustic_data'].values[::100]
train_ttf_sample_df = train['time_to_failure'].values[::100]

# function for plotting based on both features
def plot_acc_ttf_data(train_ad_sample_df, train_ttf_sample_df, 
                      title):
    fig, ax1 = plt.subplots(figsize=(12, 8))
    plt.title(title)
    plt.plot(train_ad_sample_df, color='r')
    ax1.set_ylabel('acoustic_data', color='r')
    plt.legend(['acoustic_data'], loc=(0.01, 0.95))
    ax2 = ax1.twinx()
    plt.plot(train_ttf_sample_df, color='b')
    ax2.set_ylabel('time to failure', color='b')
    plt.legend(['time to failure'], loc=(0.01, 0.9))
    plt.grid(True)
    
plot_acc_ttf_data(train_ad_sample_df, train_ttf_sample_df, 'Foo')
del train_ad_sample_df
del train_ttf_sample_df

# Step 4. Feature Engineering

In [2]:
def gen_features(X):
    strain = []
    strain.append(X.mean())
    strain.append(X.std())
    strain.append(X.min())
    strain.append(X.kurtosis())
    strain.append(X.skew())
    strain.append(np.quantile(X, 0.01))
    strain.append(np.quantile(X,0.05))
    strain.append(np.quantile(X,0.95))
    strain.append(np.quantile(X,0.99))
    strain.append(np.abs(X).max())
    strain.append(np.abs(X).mean())
    strain.append(np.abs(X).std())
    return pd.Series(strain)

In [3]:
train = pd.read_csv('train.csv', iterator=True, chunksize=150_000, 
                    dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

X_train = pd.DataFrame()
y_train = pd.Series()
for df in train:
    ch = gen_features(df['acoustic_data'])
    X_train = X_train.append(ch, ignore_index=True)
    y_train = y_train.append(pd.Series(df['time_to_failure'].values[-1]))

In [27]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,4.884113,5.101106,-98.0,33.662481,-0.024061,-8.0,-2.0,11.0,18.0,104.0,5.576567,4.333325
1,4.725767,6.588824,-154.0,98.758517,0.390561,-11.0,-2.0,12.0,21.0,181.0,5.734167,5.732777
2,4.906393,6.967397,-106.0,33.555211,0.217391,-15.0,-3.0,13.0,26.0,140.0,6.152647,5.895945
3,4.90224,6.922305,-199.0,116.548172,0.757278,-12.0,-2.0,12.0,22.0,199.0,5.93396,6.061214
4,4.90872,7.30111,-126.0,52.977905,0.064531,-15.0,-2.0,12.0,26.0,145.0,6.110587,6.329485


In [10]:
type(train)

pandas.io.parsers.TextFileReader

# Step 5. Implement Catboost Model

In [7]:
train_pool = Pool(X_train, y_train)
model = CatBoostRegressor(iterations=30_000, loss_function='MAE', boosting_type='Ordered')
model.fit(X_train, y_train, silent=True)
model.best_score_

{'learn': {'MAE': 1.4377688780913926}}

In [11]:
from tqdm import tqdm_notebook

In [44]:
submission = pd.read_csv('sample_submission.csv', index_col='seg_id', dtype={"time_to_failure": np.float32})
X_test_sample = pd.DataFrame(columns=df.columns, dtype=np.float64, index=submission.index)

In [34]:
print(X_test.head())
print(X_test.tail())

            acoustic_data  time_to_failure
seg_id                                    
seg_00030f            NaN              NaN
seg_0012b5            NaN              NaN
seg_00184e            NaN              NaN
seg_003339            NaN              NaN
seg_0042cc            NaN              NaN
            acoustic_data  time_to_failure
seg_id                                    
seg_ff4236            NaN              NaN
seg_ff7478            NaN              NaN
seg_ff79d9            NaN              NaN
seg_ffbd6a            NaN              NaN
seg_ffe7cc            NaN              NaN


In [46]:
X_test = pd.DataFrame()

for i, seg_id in enumerate(tqdm_notebook(X_test_sample.index)):
    seg = pd.read_csv('data/' + seg_id + '.csv')
    
    fs = gen_features(seg)
    fs.name = seg_id
    X_test.append(fs)
    
    print(fs)
    
    if i > 10:
        break
    
    


0       acoustic_data    4.49178
dtype: float64
1       acoustic_data    4.89369
dtype: float64
2              acoustic_data   -75
dtype: int64
3     acoustic_data    28.837568
dtype: float64
4      acoustic_data    0.327908
dtype: float64
5                                            -8
6                                            -2
7                                            11
8                                            18
9             acoustic_data    115
dtype: int64
10     acoustic_data    5.224607
dtype: float64
11     acoustic_data    4.102161
dtype: float64
Name: seg_00030f, dtype: object
0      acoustic_data    4.171153
dtype: float64
1      acoustic_data    5.922839
dtype: float64
2             acoustic_data   -140
dtype: int64
3     acoustic_data    56.218955
dtype: float64
4      acoustic_data    0.295708
dtype: float64
5                                           -12
6                                            -2
7                                            11
8       

In [48]:
X_test.shape

(0, 0)

In [19]:
from os import listdir
from os.path import isfile, join

files = [f for f in listdir("data/") if isfile(join("data/", f))]

files.len

AttributeError: 'list' object has no attribute 'len'

In [24]:
len(files)

2624