In [15]:
# Imports and Setup

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import random
import os
from obspy.core import read
from sklearn.model_selection import train_test_split
from spectrogram_to_array import spectrogram
import time


In [2]:
# Loading Event Data

# Path setup: the event table is expected next to the notebook's working directory
main_path = os.path.abspath("")
all_events_file_path = os.path.join(main_path, 'earthquakes_filtered.txt')  # all events
all_events = pd.read_csv(all_events_file_path, sep=',')

# 'Unnamed: 0' is typically a leftover index column from an earlier DataFrame.to_csv call.
# errors='ignore' makes the drop a no-op when the column is absent, without a bare
# `except` that would also hide unrelated failures, and the reassignment keeps the
# cell idempotent on re-run.
all_events = all_events.drop(columns=['Unnamed: 0'], errors='ignore')

all_events.head()


Unnamed: 0,event_id,event_ID,year,month,day,hour,minute,second,lat,lng,depth,mag_ML,std_dev_ML,mag_MA,std_dev_MA,category
0,0,0,2007,1,1,2,41,13.28,-21.65559,-68.41471,121.33,2.345,0.02,2.394,0.029,0
1,1,1,2007,1,1,2,47,7.83,-20.54848,-69.05857,102.79,1.114,0.033,1.305,0.031,0
2,2,2,2007,1,1,3,50,29.15,-21.86299,-68.53639,110.95,2.779,0.031,2.917,0.031,0
3,3,3,2007,1,1,4,19,27.82,-20.29515,-69.13106,95.79,1.401,0.017,1.571,0.023,0
4,4,4,2007,1,1,5,40,2.58,-21.23847,-70.05151,34.64,1.995,0.022,2.222,0.018,0


In [52]:
# Prepare Dataset

# Preparing file list from geofon_waveforms folder
dataset_size = 50000
file_list = os.listdir(os.path.join(main_path, "geofon_waveforms"))
# Strip the 6-character ".mseed" suffix to recover the integer event id.
# Sorting removes the dependence on os.listdir's arbitrary ordering.
file_list = sorted(int(file[:-6]) for file in file_list if file.endswith(".mseed"))

# Select random sample of N events from all files.
# A dedicated seeded generator makes the sample reproducible without touching the
# global `random` state; the size is capped so a folder holding fewer than
# `dataset_size` waveforms does not raise ValueError.
file_list = random.Random(42).sample(file_list, min(dataset_size, len(file_list)))

# Train-test split
train_events, test_events = train_test_split(file_list, test_size=0.2, random_state=42)

# Merge with events data (default inner join: ids missing from all_events are dropped)
train_events = pd.DataFrame(train_events, columns=['event_id']).merge(all_events, on='event_id')
test_events = pd.DataFrame(test_events, columns=['event_id']).merge(all_events, on='event_id')


In [53]:
# Dataset Class

class event_dataset(Dataset):
    """Torch dataset yielding one labelled spectrogram stack per event.

    Rows come from the module-level ``train_events`` / ``test_events`` frames;
    the waveform of each row is read from ``<main_path>/geofon_waveforms/<event_id>.mseed``.
    """

    def __init__(self, dataset_type: str, transform=None):
        """Select the split to serve.

        Args:
            dataset_type: ``'train'`` or ``'test'``.
            transform: optional callable applied to the spectrogram tensor of each sample.

        Raises:
            KeyError: if ``dataset_type`` is neither ``'train'`` nor ``'test'``.
        """
        if dataset_type == "train":
            self.dataframe = train_events
        elif dataset_type == "test":
            self.dataframe = test_events
        else:
            raise KeyError("dataset_type has to be 'train' or 'test'")

        self.data_directory = "geofon_waveforms"
        self.transform = transform

    def __len__(self):
        """Number of events in the selected split."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``{'label': int64 tensor, 'data': float32 tensor}`` for row ``idx``."""
        record = self.dataframe.iloc[idx]
        event_id = int(record['event_id'])
        category = int(record['category'])

        # Waveform fetch
        mseed_path = os.path.join(main_path, self.data_directory, f"{event_id}.mseed")
        stream = read(mseed_path)

        # One spectrogram per trace; only the first element returned by the
        # helper is kept (presumably the spectrogram array -- confirm in
        # spectrogram_to_array).
        per_trace = []
        for trace in stream:
            result = spectrogram(
                data=trace.data,
                samp_rate=40.0,
                log=True,
                wlen=2,
                per_lap=0.5,
                dbscale=False,
            )
            per_trace.append(result[0])
        # Traces are stacked along a new leading axis
        stacked = np.stack(per_trace, axis=0, dtype=np.float32)

        # Convert to torch tensors
        sample = {
            'label': torch.tensor(category, dtype=torch.int64),
            'data': torch.from_numpy(stacked),
        }
        if self.transform:
            sample['data'] = self.transform(sample['data'])

        return sample


In [54]:
# Normalization Function

def normalize_tensor(tensor, eps=1e-8):
    """Standardise a tensor to zero mean and unit standard deviation.

    The mean and standard deviation are computed over all elements of ``tensor``.

    Args:
        tensor: floating-point tensor to normalise.
        eps: lower bound applied to the standard deviation so that a constant
            tensor (std == 0) maps to zeros instead of NaN from 0/0. Inputs whose
            std is at least ``eps`` are scaled exactly as before.

    Returns:
        A new tensor of the same shape as ``tensor``.
    """
    centered = tensor - tensor.mean()
    scale = tensor.std().clamp_min(eps)
    return centered / scale


In [55]:
# CNN Model

class Simple_CNN_v2(nn.Module):
    """Three conv/ReLU/max-pool stages followed by two linear layers.

    The network emits a single raw logit per sample (no sigmoid applied).

    Args:
        flat_features: number of features per sample after the last pooling
            stage, i.e. 64 * H_out * W_out. The default 33728 (= 64 * 527)
            preserves the original hard-coded input size.
    """

    def __init__(self, flat_features: int = 33728):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=5, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.flat_features = flat_features
        self.fc1 = nn.Linear(flat_features, 1024)
        self.fc2 = nn.Linear(1024, 1)

    def forward(self, x):
        """Map a ``(batch, 3, H, W)`` input to ``(batch, 1)`` logits."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))

        # Flatten per sample and keep the batch dimension fixed. A
        # `view(-1, flat_features)` would silently regroup samples across the
        # batch whenever the input size does not match; here fc1 raises a shape
        # error instead.
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc1(x))

        return self.fc2(x)  # Returning logits directly


In [56]:
# Data Loaders

# Mini-batch size shared by both loaders (also read by the epoch functions)
batch_size = 4

# Training split: every sample is normalised, batches are reshuffled each epoch
train_loader = DataLoader(
    event_dataset('train', transform=normalize_tensor),
    batch_size=batch_size, shuffle=True, num_workers=0,
)

# Held-out split, built with the same settings as the training loader
test_loader = DataLoader(
    event_dataset('test', transform=normalize_tensor),
    batch_size=batch_size, shuffle=True, num_workers=0,
)


In [57]:
# Training Setup

# Model, loss, optimizer setup: use the GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = Simple_CNN_v2().to(device)

# The model returns raw logits, so the sigmoid is folded into the loss
loss_function = nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [58]:
# Training and Validation Functions

def train_one_epoch(N_batch_stats: int):
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``loss_function`` and
    ``device``. Every ``N_batch_stats`` batches the mean loss and the accuracy of
    that window are printed and the running counters are reset; batches left over
    after the last full window are not reported. The elapsed wall-clock time is
    printed at the end.

    Args:
        N_batch_stats: number of batches per reporting window.
    """
    model.train(True)
    running_loss = 0.0
    running_correct = 0
    running_samples = 0

    start_time = time.time()  # Start time

    for batch_idx, sample in enumerate(train_loader):
        labels = sample['label'].to(device)
        data = sample['data'].to(device)

        optimizer.zero_grad()

        # Flatten the logits to shape (batch,). A bare .squeeze() would also drop
        # the batch dimension of a single-sample batch, leaving a 0-dim tensor
        # that the loss rejects against the (1,)-shaped labels.
        labels_logit = model(data).reshape(-1)
        labels_pred = torch.round(torch.sigmoid(labels_logit))

        # Count samples actually seen so a short final batch is weighted correctly
        running_correct += torch.sum(labels == labels_pred).item()
        running_samples += labels.size(0)

        loss = loss_function(labels_logit, labels.float())
        running_loss += loss.item()

        loss.backward()
        optimizer.step()

        if batch_idx % N_batch_stats == N_batch_stats - 1:
            avg_loss = running_loss / N_batch_stats
            avg_acc = (running_correct / running_samples) * 100
            print(f'Batch {batch_idx + 1}, Loss: {avg_loss:.3f}, Accuracy: {avg_acc:.1f}%')

            running_loss = 0.0
            running_correct = 0
            running_samples = 0

    end_time = time.time()  # End time
    elapsed_time = end_time - start_time
    print(f"Training Time for one epoch: {elapsed_time:.2f} seconds")


def validate_one_epoch():
    """Evaluate the module-level ``model`` on ``test_loader`` without gradients.

    Prints the mean per-batch loss, the accuracy over all evaluated samples, a
    separator line and the elapsed wall-clock time. The model is left in
    evaluation mode.
    """
    model.train(False)
    running_loss = 0.0
    running_correct = 0
    running_samples = 0

    start_time = time.time()  # Start time

    with torch.no_grad():
        for sample in test_loader:
            true_labels = sample['label'].to(device)
            inputs = sample['data'].to(device)

            # Flatten the logits to shape (batch,). A bare .squeeze() would turn
            # a single-sample batch into a 0-dim tensor and break the loss call.
            labels_logit = model(inputs).reshape(-1)
            labels_pred = torch.round(torch.sigmoid(labels_logit))

            # Count real samples so a short final batch does not distort accuracy
            running_correct += torch.sum(true_labels == labels_pred).item()
            running_samples += true_labels.size(0)

            loss = loss_function(labels_logit, true_labels.float())
            running_loss += loss.item()

    avg_loss = running_loss / len(test_loader)
    avg_acc = (running_correct / running_samples) * 100
    print(f'Val Loss: {avg_loss:.3f}, Val Accuracy: {avg_acc:.1f}%')
    print('***************************************************')

    end_time = time.time()  # End time
    elapsed_time = end_time - start_time
    print(f"Validation Time for one epoch: {elapsed_time:.2f} seconds")



In [59]:
# Training Loop

# Build a fresh model, loss and optimizer so re-running this cell restarts training
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = Simple_CNN_v2().to(device)

loss_function = nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Number of passes over the training set, and how often batch statistics are printed
num_epochs = 4
N_batch_print_stats = 100

# Each epoch: one training pass followed by one evaluation on the held-out split
for epoch_idx in range(num_epochs):
    print(f'Epoch: {epoch_idx + 1}\n')
    train_one_epoch(N_batch_print_stats)
    validate_one_epoch()

print('Finished training')


Epoch: 1

Batch 100, Loss: 0.635, Accuracy: 92.8%
Batch 200, Loss: 0.266, Accuracy: 92.8%
Batch 300, Loss: 0.263, Accuracy: 92.2%
Batch 400, Loss: 0.240, Accuracy: 94.2%
Training Time for one epoch: 40.96 seconds
Val Loss: 0.261, Val Accuracy: 91.8%
***************************************************
Validation Time for one epoch: 10.01 seconds
Epoch: 2

Batch 100, Loss: 0.238, Accuracy: 93.0%
Batch 200, Loss: 0.176, Accuracy: 95.2%
Batch 300, Loss: 0.196, Accuracy: 94.2%
Batch 400, Loss: 0.260, Accuracy: 92.5%
Training Time for one epoch: 40.62 seconds
Val Loss: 0.269, Val Accuracy: 92.2%
***************************************************
Validation Time for one epoch: 10.00 seconds
Epoch: 3

Batch 100, Loss: 0.245, Accuracy: 94.2%
Batch 200, Loss: 0.246, Accuracy: 94.5%
Batch 300, Loss: 0.216, Accuracy: 93.5%
Batch 400, Loss: 0.189, Accuracy: 93.0%
Training Time for one epoch: 40.99 seconds
Val Loss: 0.250, Val Accuracy: 92.2%
***************************************************
Vali

In [18]:
# Saving the Model

# Only the learned parameters (state_dict) are written, not the pickled module
# object, so loading requires constructing the model class first.
PATH = './simple_cnn.pth'
torch.save(obj=model.state_dict(), f=PATH)
