<a href="https://colab.research.google.com/github/navanil018/Kaggle/blob/master/Earthquake_Challenge_Kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# Install & Import Dependencies




In [0]:
#To access Kaggle datasets
!pip install kaggle

#Math operations
!pip install numpy

#Machine Learning
!pip install catboost

In [0]:
#data preprocessing
import pandas as pd

#math operations
import numpy as np

#machine learning
from catboost import CatBoostRegressor, Pool

#Data Scaling
from sklearn.preprocessing import StandardScaler

#Hyperparameter optimization
from sklearn.model_selection import GridSearchCV

#Support vector machine model
from sklearn.svm import SVR, NuSVR

#kernel ridge model
from sklearn.kernel_ridge import KernelRidge

#data viz
import matplotlib.pyplot as plt

# **Import DataSet from Kaggle**

In [38]:
#!mkdir .kaggle
%cd .kaggle

/content/.kaggle


In [0]:
#kaggle.json into the folder where the API expects to find it
import getpass
import json
import os

# SECURITY: credentials must never be hardcoded in a shared notebook. Any key that
# was committed with an earlier version of this cell should be revoked on Kaggle.
# Values are taken from the KAGGLE_USERNAME / KAGGLE_KEY environment variables
# when set, otherwise the user is prompted (the key prompt does not echo).
kaggle_json_path = '/content/.kaggle/kaggle.json'
token = {
    "username": os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: "),
    "key": os.environ.get("KAGGLE_KEY") or getpass.getpass("Kaggle API key: "),
}

# Make sure the target folder exists so the cell works on a fresh runtime.
os.makedirs(os.path.dirname(kaggle_json_path), exist_ok=True)
with open(kaggle_json_path, 'w') as file:
    json.dump(token, file)

# Keep the token readable by the owner only.
os.chmod(kaggle_json_path, 0o600)

In [0]:
# Restrict kaggle.json to owner read/write.
# NOTE(review): this targets /root/.kaggle/, whereas the token is written to
# /content/.kaggle/kaggle.json elsewhere in this notebook — confirm the file has
# been copied to /root/.kaggle/ before this cell runs.
!chmod 600 /root/.kaggle/kaggle.json

In [50]:
#Colab's file access feature
# NOTE(review): `files` is not used in the cells visible here — confirm it is needed.
from google.colab import files 

# List Kaggle competitions through the CLI.
!kaggle competitions list

ref                                            deadline             category            reward  teamCount  userHasEntered  
---------------------------------------------  -------------------  ---------------  ---------  ---------  --------------  
digit-recognizer                               2030-01-01 00:00:00  Getting Started  Knowledge       2462           False  
titanic                                        2030-01-01 00:00:00  Getting Started  Knowledge       9999            True  
house-prices-advanced-regression-techniques    2030-01-01 00:00:00  Getting Started  Knowledge       4071           False  
imagenet-object-localization-challenge         2029-12-31 07:00:00  Research         Knowledge         35           False  
competitive-data-science-predict-future-sales  2019-12-31 23:59:00  Playground           Kudos       2447           False  
two-sigma-financial-news                       2019-07-15 23:59:00  Featured          $100,000       2927           False  
LANL-Ear

In [53]:
#%pwd
#download Earthquake data
#!kaggle competitions download -c LANL-Earthquake-Prediction
#!mkdir ~/.kaggle
#!cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json
%cd ~/.kaggle/
!ls -a

%pwd

!kaggle config set -n path -v{/content}

/root/.kaggle
.  ..  kaggle.json
- path is now set to: {/content}


In [59]:
#download Earthquake data
# -p /content sets the destination folder explicitly for this download.
!kaggle competitions download -c LANL-Earthquake-Prediction -p /content

Downloading sample_submission.csv to /content
  0% 0.00/33.3k [00:00<?, ?B/s]
100% 33.3k/33.3k [00:00<00:00, 40.3MB/s]
Downloading test.zip to /content
 98% 237M/242M [00:01<00:00, 164MB/s]
100% 242M/242M [00:01<00:00, 159MB/s]
Downloading train.csv.zip to /content
100% 2.02G/2.03G [00:36<00:00, 33.9MB/s]
100% 2.03G/2.03G [00:37<00:00, 58.5MB/s]


In [0]:
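#!unzip test.csv.zip
# NOTE: the download log above lists train.csv.zip and test.zip (no test.csv.zip);
# train.csv.zip has to be extracted before train.csv can be read below.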


# Exploratory Data Analysis


In [0]:
# Load a sample of the training data for exploration: only the first
# 6,000,000 rows are read, with explicit dtypes for both columns.
train = pd.read_csv(
    'train.csv',
    nrows=6000000,
    dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64},
)

In [76]:
# Show the first 10 rows of the loaded sample (rendered as the cell's output).
train.head(10)

Unnamed: 0,acoustic_data,time_to_failure
0,12,1.4691
1,6,1.4691
2,8,1.4691
3,5,1.4691
4,8,1.4691
5,8,1.4691
6,9,1.4691
7,7,1.4691
8,-5,1.4691
9,3,1.4691


# Feature Engineering

In [0]:
# Step 4 - Feature Engineering and significance of these statistical features

# Let's create a function to generate some statistical features based on the training data
def gen_features(X):
    """Summarise one signal segment as nine statistics.

    Args:
        X: pandas Series of numeric samples (``kurtosis`` and ``skew`` are
            called on it, so a plain array is not enough).

    Returns:
        pd.Series with a default integer index holding, in this order:
        mean, std, min, max, kurtosis, skew of ``X``, followed by the
        max, mean and std of the absolute values of ``X``.
    """
    # Absolute values are needed for the last three statistics; compute them once.
    magnitude = np.abs(X)
    stats = [
        X.mean(),
        X.std(),
        X.min(),
        X.max(),
        X.kurtosis(),
        X.skew(),
        magnitude.max(),
        magnitude.mean(),
        magnitude.std(),
    ]
    return pd.Series(stats)

In [0]:
# Stream the training file in 150,000-row chunks. Each chunk is reduced to one
# row of features (gen_features on its acoustic_data) and one target: the
# time_to_failure of the chunk's last sample.
train = pd.read_csv('train.csv', iterator=True, chunksize=150_000, dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

# Collect results in plain lists and build the pandas objects once at the end.
# DataFrame.append / Series.append were removed in pandas 2.0, and growing a
# frame row by row copies all accumulated data on every iteration.
feature_rows = []
targets = []
for df in train:
    feature_rows.append(gen_features(df['acoustic_data']))
    targets.append(df['time_to_failure'].values[-1])

# Columns stay 0..8 (the feature order of gen_features); rows get a clean 0..n-1
# index. y_train now shares that index instead of an all-zero one.
X_train = pd.DataFrame(feature_rows).reset_index(drop=True)
y_train = pd.Series(targets, dtype=np.float64)

In [95]:
# Summary statistics for the nine engineered feature columns (one row per chunk).
X_train.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,4195.0,4195.0,4195.0,4195.0,4195.0,4195.0,4195.0,4195.0,4195.0
mean,4.519475,6.547788,-149.190942,163.522288,68.297997,0.12583,170.046246,5.547367,5.750165
std,0.256049,8.503939,265.087984,272.930331,70.532565,0.477901,296.887015,1.517038,8.339211
min,3.596313,2.80272,-5515.0,23.0,0.648602,-4.091826,23.0,4.147707,2.589085
25%,4.349497,4.478637,-154.0,92.0,28.090227,-0.040779,94.0,5.061843,3.86281
50%,4.522147,5.618798,-111.0,123.0,45.816625,0.08562,127.0,5.380853,4.781513
75%,4.69335,6.880904,-79.0,170.0,78.664202,0.25393,175.0,5.748553,5.887947
max,5.391993,153.703569,-15.0,5444.0,631.158927,4.219429,5515.0,32.762073,150.432368


In [96]:
#Model #1 - Catboost

# Wrap features and targets in a Pool and train from it (the Pool used to be
# built but never passed to fit).
train_pool = Pool(X_train, y_train)
m = CatBoostRegressor(iterations=10000, loss_function='MAE', boosting_type='Ordered')
m.fit(train_pool, silent=True)

# NOTE: no evaluation set is supplied, so best_score_ only contains the 'learn'
# entry, i.e. the MAE on the training data itself.
m.best_score_

{'learn': {'MAE': 1.7976107174233478}}

In [97]:
#Model #2 - Support Vector Machine w/ RBF + Grid Search
# StandardScaler, GridSearchCV and SVR are already imported in the notebook's
# import cell, so they are not re-imported here.

# Standardise the features before fitting the RBF-kernel SVR.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so each
# fold's held-out rows contribute to the scaling statistics. Consider moving the
# scaler into a Pipeline that is passed to GridSearchCV.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Grid of RBF kernel widths (gamma) and regularisation strengths (C).
parameters = [{'gamma': [0.001, 0.005, 0.01, 0.02, 0.05, 0.1],
               'C': [0.1, 0.2, 0.25, 0.5, 1, 1.5, 2]}]
               #'nu': [0.75, 0.8, 0.85, 0.9, 0.95, 0.97]}]

# 5-fold grid search scored by negated MAE (higher is better, so the best score
# is the one closest to zero).
reg1 = GridSearchCV(SVR(kernel='rbf', tol=0.01), parameters, cv=5, scoring='neg_mean_absolute_error')
reg1.fit(X_train_scaled, y_train.values)

# Predictions on the training data itself, using the best estimator found.
y_pred1 = reg1.predict(X_train_scaled)

print("Best CV score: {:.4f}".format(reg1.best_score_))
print(reg1.best_params_)

Best CV score: -2.2583
{'C': 2, 'gamma': 0.1}
