In [6]:
%pip install requests aria2 netCDF4 numpy xarray tensorflow scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 MB[0m [31m18.4 MB/s[0m eta [3

In [1]:
import os

# --- Remote source -----------------------------------------------------------
APP = "TorNetBecauseZenodoSlow"  # Bucket name
CUSTOM_ENDPOINT = "bbproxy.meyerstk.com/file"  # Host and path prefix (no scheme)

# --- Local layout ------------------------------------------------------------
DATA_DIR = './data'  # Download target; holds the .tar.gz archives
EXTRACT_DIR = os.path.join(DATA_DIR, 'extracted')  # Where archives are unpacked
TMP_FILE = os.path.join(DATA_DIR, 'tmp.txt')  # Scratch file listing the download links

# --- Switches ----------------------------------------------------------------
DOWNLOAD_DATA = True  # When False, the download step is skipped

In [4]:
import logging
import subprocess
import tarfile

# Route INFO-and-above messages through the root logger with a timestamp prefix.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Make sure the download and extraction folders are present (no-op if they are).
for _directory in (DATA_DIR, EXTRACT_DIR):
    os.makedirs(_directory, exist_ok=True)


def download_links(links):
    """
    Download files from the provided links using aria2c.

    The links are written one per line to TMP_FILE, which is handed to aria2c
    as its input file. TMP_FILE is removed afterwards whether or not the
    download succeeded.

    Args:
        links: Iterable of URL strings. Nothing is done when it is empty.

    Raises:
        Exception: Whatever made the download fail is logged and re-raised,
            e.g. subprocess.CalledProcessError when aria2c exits with a
            non-zero status, or OSError when TMP_FILE cannot be written or
            the aria2c executable cannot be started.
    """
    links = list(links)  # The sequence is consumed twice below
    if not links:
        logging.info("No links to download.")
        return

    try:
        # Write links to tmp.txt
        with open(TMP_FILE, 'w') as file:
            file.writelines(link + '\n' for link in links)
        logging.info(f"Temporary file created: {TMP_FILE}")

        # Run aria2c to download files
        logging.info(f"Starting downloads for links: {', '.join(links)}")
        command = [
            "aria2c",
            "-j", "5",                # Download up to 5 files concurrently
            "-x", "16",               # Use up to 16 connections per server
            # "--console-log-level=info",  # Uncomment for verbose aria2c output
            "-s", "16",               # Split each file into 16 segments
            "--dir", DATA_DIR,        # Specify the download directory
            "-i", TMP_FILE            # Input file with download links
        ]
        subprocess.run(command, check=True)
        logging.info("Downloads completed successfully.")
    except Exception as e:
        logging.error(f"Error during download: {e}")
        # Re-raise instead of calling exit(): exit() is an interactive-shell
        # helper and is not a reliable way to report a failure from inside a
        # notebook cell; propagating the error stops the cell with a traceback.
        raise
    finally:
        if os.path.exists(TMP_FILE):
            os.remove(TMP_FILE)
            logging.info(f"Temporary file deleted: {TMP_FILE}")


def _download_complete(path):
    """Return True when `path` exists and has no aria2 control file next to it."""
    # aria2 keeps "<file>.aria2" beside a file while its download is unfinished,
    # so the presence of that control file marks a partial download.
    return os.path.exists(path) and not os.path.exists(path + ".aria2")


def download_files_with_aria():
    """
    Download files from a public Backblaze B2 bucket served via a custom endpoint using aria2c.

    Builds one URL per dataset file from CUSTOM_ENDPOINT and APP, skips files
    that are already fully present in DATA_DIR, and passes the remaining URLs
    to download_links(). Files with a leftover aria2 control file are treated
    as incomplete and are requested again.
    """
    logging.info("Starting download process with aria2c...")

    # Yearly archives for 2013-2022 plus the catalog
    file_list = [f"tornet_{year}.tar.gz" for year in range(2013, 2023)]
    file_list.append("catalog.csv")

    # Construct the public URLs
    links = [f"https://{CUSTOM_ENDPOINT}/{APP}/{file_name}" for file_name in file_list]
    # Alternative source (Zenodo), kept for reference:
    # links = [
    #     "https://zenodo.org/records/12655719/files/tornet_2022.tar.gz",
    #     "https://zenodo.org/records/12655718/files/tornet_2021.tar.gz",
    #     "https://zenodo.org/records/12655717/files/tornet_2020.tar.gz",
    #     "https://zenodo.org/records/12655716/files/tornet_2019.tar.gz",
    #     "https://zenodo.org/records/12655187/files/tornet_2018.tar.gz",
    #     "https://zenodo.org/records/12655183/files/tornet_2017.tar.gz",
    #     "https://zenodo.org/records/12655179/files/tornet_2016.tar.gz",
    #     "https://zenodo.org/records/12655151/files/tornet_2015.tar.gz",
    #     "https://zenodo.org/records/12637032/files/tornet_2014.tar.gz",
    #     "https://zenodo.org/records/12636522/files/tornet_2013.tar.gz",
    #     "https://zenodo.org/records/12636522/files/catalog.csv",
    # ]

    # Keep only links whose target file is missing or only partially downloaded
    links_to_download = [
        link for link in links
        if not _download_complete(os.path.join(DATA_DIR, os.path.basename(link)))
    ]

    if links_to_download:
        download_links(links_to_download)
    else:
        logging.info("All files already downloaded.")


def extract_local_tar_files():
    """
    Extract all .tar.gz files from the local DATA_DIR to EXTRACT_DIR.

    Archives are processed in sorted name order. An archive is skipped when:
      * an aria2 control file ("<archive>.aria2") sits next to it, meaning the
        download has not finished, or
      * a marker file from an earlier successful extraction exists in
        EXTRACT_DIR, so re-running the cell does not unpack everything again.

    The marker is written only after extraction returns, so an interrupted
    extraction is retried on the next run.
    """
    logging.info("Starting extraction process...")

    # Use the "data" extraction filter where this Python provides it; it
    # rejects archive members that would be written outside the target folder.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    for file_name in sorted(os.listdir(DATA_DIR)):
        if not file_name.endswith('.tar.gz'):
            continue

        file_path = os.path.join(DATA_DIR, file_name)
        if os.path.exists(file_path + ".aria2"):
            logging.warning(f"Skipping {file_path}: download is incomplete")
            continue

        marker_path = os.path.join(EXTRACT_DIR, f".{file_name}.extracted")
        if os.path.exists(marker_path):
            logging.info(f"Skipping {file_path}: already extracted")
            continue

        logging.info(f'Extracting {file_path}...')
        with tarfile.open(file_path, 'r:gz') as tar:
            tar.extractall(path=EXTRACT_DIR, **extract_kwargs)

        # Empty marker file recording that this archive was fully unpacked
        with open(marker_path, 'w'):
            pass
        logging.info(f'Extracted {file_path} to {EXTRACT_DIR}')


# Fetch any missing files first. With DOWNLOAD_DATA set to False this step is
# skipped and only the archives already present in DATA_DIR are used.
if DOWNLOAD_DATA:
    download_files_with_aria()

# Unpack every .tar.gz found in DATA_DIR into EXTRACT_DIR.
extract_local_tar_files()

2024-12-04 17:45:37,593 - INFO - Starting download process with aria2c...
2024-12-04 17:45:37,593 - INFO - Temporary file created: ./data/tmp.txt
2024-12-04 17:45:37,594 - INFO - Starting downloads for links: https://bbproxy.meyerstk.com/file/TorNetBecauseZenodoSlow/catalog.csv



12/04 17:45:37 [[1;32mNOTICE[0m] Downloading 1 item(s)


2024-12-04 17:45:41,753 - INFO - Downloads completed successfully.
2024-12-04 17:45:41,754 - INFO - Temporary file deleted: ./data/tmp.txt
2024-12-04 17:45:41,754 - INFO - Starting extraction process...
2024-12-04 17:45:41,755 - INFO - Extracting ./data/tornet_2013.tar.gz...


[#7efeee 0B/0B CN:1 DL:0B]
[#7efeee 1.2MiB/36MiB(3%) CN:1 DL:1.9MiB ETA:18s]
[#7efeee 17MiB/36MiB(46%) CN:1 DL:10MiB ETA:1s]
[#7efeee 35MiB/36MiB(98%) CN:1 DL:13MiB]

12/04 17:45:41 [[1;32mNOTICE[0m] Download complete: ./data/catalog.csv

Download Results:
gid   |stat|avg speed  |path/URI
7efeee|OK  |    13MiB/s|./data/catalog.csv

Status Legend:
(OK):download completed.


2024-12-04 17:45:54,574 - INFO - Extracted ./data/tornet_2013.tar.gz to ./data/extracted
2024-12-04 17:45:54,575 - INFO - Extracting ./data/tornet_2014.tar.gz...


KeyboardInterrupt: 

In [None]:
import tensorflow as tf
import xarray as xr
import numpy as np
import os
import pandas as pd 

DATA_DIR = './data'  # Directory containing .tar.gz files
EXTRACT_DIR = os.path.join(DATA_DIR, 'extracted')

# Constants
VARIABLES = ['DBZ', 'VEL', 'KDP', 'RHOHV', 'ZDR', 'WIDTH']  # Variables read from each file
TIME_STEPS = 3     # Number of time indices read per file
SWEEPS = 2         # Number of sweep indices read per file
IMAGE_HEIGHT = 120
IMAGE_WIDTH = 240

# Catalog and label mapping
# catalog.csv is downloaded as a plain file into DATA_DIR and is not touched by
# the tar extraction step, so look in EXTRACT_DIR first (the original location)
# and fall back to DATA_DIR. If neither exists, the first candidate is kept so
# that read_csv reports the originally expected path.
_catalog_candidates = (
    os.path.join(EXTRACT_DIR, "catalog.csv"),
    os.path.join(DATA_DIR, "catalog.csv"),
)
catalog_path = next(
    (path for path in _catalog_candidates if os.path.exists(path)),
    _catalog_candidates[0],
)
catalog = pd.read_csv(catalog_path)

# Binary target: 1 for category 'TOR', 0 for everything else
catalog['label'] = (catalog['category'] == 'TOR').astype('int64')

# parse_nc_file looks labels up by the file's base name, so key the mapping by
# base name as well. Otherwise any directory prefix in the 'filename' column
# would make every lookup miss and silently fall back to label 0.
# NOTE(review): assumes base names are unique across the catalog - confirm.
label_mapping = dict(
    zip(catalog['filename'].astype(str).map(os.path.basename), catalog['label'])
)

# Define preprocessing function for NetCDF files
def parse_nc_file(file_path):
    """
    Parse a single NetCDF file to extract radar data and label.

    Args:
        file_path: Scalar string tensor holding the path of the file, as
            passed in by tf.py_function.

    Returns:
        A (data, label) tuple:
          * data: float32 array of shape
            (IMAGE_HEIGHT, IMAGE_WIDTH, len(VARIABLES) * SWEEPS, TIME_STEPS)
            with NaN and +/-inf replaced by 0. Along the third axis the
            entries are ordered variable-major, i.e. index = variable * SWEEPS + sweep.
          * label: int32 value from label_mapping, looked up by the file's
            base name; 0 when the file is not in the mapping.
    """
    file_path = file_path.numpy().decode('utf-8')  # Convert Tensor to string

    # Every element is overwritten in the loops below, so np.empty is safe here.
    data = np.empty(
        (IMAGE_HEIGHT, IMAGE_WIDTH, len(VARIABLES), SWEEPS, TIME_STEPS),
        dtype=np.float32,
    )
    with xr.open_dataset(file_path, engine="netcdf4") as ds:
        for idx, var in enumerate(VARIABLES):
            for t in range(TIME_STEPS):
                for s in range(SWEEPS):
                    # NOTE(review): assumes each (time, sweep) slice is
                    # IMAGE_HEIGHT x IMAGE_WIDTH - confirm against the files.
                    channel_data = ds[var].isel(time=t, sweep=s).values
                    # Clean into a new array rather than mutating the values
                    # backing the open dataset.
                    data[:, :, idx, s, t] = np.nan_to_num(
                        channel_data, nan=0.0, posinf=0.0, neginf=0.0
                    )

    # Merge the adjacent variable and sweep axes into one axis. No transpose
    # is needed because the axes are already in the target order.
    data = data.reshape(IMAGE_HEIGHT, IMAGE_WIDTH, len(VARIABLES) * SWEEPS, TIME_STEPS)

    # Cast explicitly so the value matches the int32 output type declared by
    # the tf.py_function wrapper.
    label = np.int32(label_mapping.get(os.path.basename(file_path), 0))
    return data, label

def load_and_preprocess(file_path):
    """
    Run parse_nc_file from inside a tf.data pipeline.

    The parser is plain Python, so it is invoked through tf.py_function. That
    call returns tensors without static shape information, so the known
    shapes are attached before the pair is returned.

    Returns:
        (data, label): a float32 tensor of shape
        (IMAGE_HEIGHT, IMAGE_WIDTH, len(VARIABLES) * SWEEPS, TIME_STEPS)
        and a scalar int32 tensor.
    """
    sample_shape = (IMAGE_HEIGHT, IMAGE_WIDTH, len(VARIABLES) * SWEEPS, TIME_STEPS)
    output_types = (tf.float32, tf.int32)

    parsed = tf.py_function(func=parse_nc_file, inp=[file_path], Tout=output_types)
    data, label = parsed

    # Restore static shapes
    data.set_shape(sample_shape)
    label.set_shape(())
    return data, label

def create_tf_dataset(directory, batch_size=32, shuffle=True):
    """
    Create a batched TensorFlow dataset from NetCDF files in the directory.

    Args:
        directory: Folder searched with the pattern "<directory>/**/*.nc".
        batch_size: Number of samples per batch.
        shuffle: Whether the list of file names is shuffled. Pass False for a
            stable file order, e.g. when evaluating.

    Returns:
        A tf.data.Dataset yielding (data, label) batches as produced by
        load_and_preprocess.
    """
    # Shuffling happens here, on file names, which are cheap to hold in memory.
    file_paths = tf.data.Dataset.list_files(f"{directory}/**/*.nc", shuffle=shuffle)

    # Apply preprocessing
    dataset = file_paths.map(load_and_preprocess, num_parallel_calls=tf.data.AUTOTUNE)

    # Batch and prefetch. There is deliberately no shuffle() after batch():
    # one float32 batch of 32 samples of shape (120, 240, 12, 3) is about
    # 133 MB, so a 10000-batch shuffle buffer would need over a terabyte of RAM.
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

# Build the training and test pipelines from the "train" and "test" folders
# under EXTRACT_DIR. Despite the X_ prefix, each dataset yields (data, label)
# batches, not features alone.
X_train = create_tf_dataset(os.path.join(EXTRACT_DIR, "train"), batch_size=32)
X_test = create_tf_dataset(os.path.join(EXTRACT_DIR, "test"), batch_size=32)

# Sanity check: pull one batch and print the tensor shapes.
for data, label in X_train.take(1):
    print(f"Data shape: {data.shape}")
    print(f"Label shape: {label.shape}")

I0000 00:00:1733337261.010706   65628 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 79197 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-80GB, pci bus id: 0000:01:00.0, compute capability: 8.0
W0000 00:00:1733337262.922636   67384 gpu_backend_lib.cc:579] Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice. This may result in compilation or runtime failures, if the program we try to run uses routines from libdevice.
Searched for CUDA in the following directories:
  ./cuda_sdk_lib
  ipykernel_launcher.runfiles/cuda_nvcc
  ipykern/cuda_nvcc
  
  /usr/local/cuda
  /opt/conda/lib/python3.11/site-packages/tensorflow/python/platform/../../../nvidia/cuda_nvcc
  /opt/conda/lib/python3.11/site-packages/tensorflow/python/platform/../../../../nvidia/cuda_nvcc
  /opt/conda/lib/python3.11/site-packages/tensorflow/python/platform/../../cuda
  .
You can choose the search directory by setting xla_gpu_cuda_data_dir in HloModule's DebugOptions.  For most app

In [10]:
from tensorflow.keras import models, layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow.keras.regularizers import l2

def _conv3d_block(filters, num_convs, pool_size, dropout_rate):
    """Return the layers of one block: num_convs x Conv3D -> BatchNorm -> MaxPool3D -> Dropout."""
    block = [
        layers.Conv3D(filters, (3, 3, 3), activation='relu', padding='same')
        for _ in range(num_convs)
    ]
    block.append(layers.BatchNormalization())
    block.append(layers.MaxPooling3D(pool_size))
    block.append(layers.Dropout(dropout_rate))
    return block


def create_3d_torcnn(input_shape=(120, 240, 36, 3), dropout_rate=0.3,
                     learning_rate=0.0005, l2_strength=0.01):
    """
    Define and compile a 3D CNN model for tornado detection.

    Conv3D convolves over the first three axes of `input_shape` and treats
    the last axis as channels. With the tensors built in this notebook
    (height, width, variables * sweeps, time steps), the third axis is the
    stacked variable/sweep axis and the time steps act as channels.

    Args:
        input_shape: Shape of one sample, without the batch dimension.
        dropout_rate: Dropout rate applied after each convolutional block.
        learning_rate: Learning rate for the Adam optimizer.
        l2_strength: L2 regularization factor for the two hidden Dense layers.

    Returns:
        A compiled Sequential model with a single sigmoid output, trained with
        binary cross-entropy and reporting accuracy, precision, recall and AUC.
    """
    model = models.Sequential([
        layers.Input(shape=input_shape),

        # Block 1: pool over the first two axes only, keep the third axis intact
        *_conv3d_block(32, num_convs=2, pool_size=(2, 2, 1), dropout_rate=dropout_rate),

        # Blocks 2-4: pool over all three convolved axes
        *_conv3d_block(64, num_convs=2, pool_size=(2, 2, 2), dropout_rate=dropout_rate),
        *_conv3d_block(128, num_convs=2, pool_size=(2, 2, 2), dropout_rate=dropout_rate),
        *_conv3d_block(256, num_convs=1, pool_size=(2, 2, 2), dropout_rate=dropout_rate),

        # Fully Connected Layers
        layers.Flatten(),
        layers.Dense(128, activation='relu', kernel_regularizer=l2(l2_strength)),
        layers.BatchNormalization(),
        layers.Dropout(0.4),
        layers.Dense(64, activation='relu', kernel_regularizer=l2(l2_strength)),
        layers.BatchNormalization(),
        layers.Dropout(0.4),

        # Output Layer
        layers.Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=BinaryCrossentropy(),
        metrics=['accuracy', Precision(), Recall(), AUC()]
    )
    return model

# Create Model
# The input shape mirrors the tensors produced by the data pipeline:
# (height, width, variables * sweeps, time steps). It is built from the shared
# constants so that it cannot drift from the preprocessing code.
input_shape = (IMAGE_HEIGHT, IMAGE_WIDTH, len(VARIABLES) * SWEEPS, TIME_STEPS)
model = create_3d_torcnn(input_shape=input_shape)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
import numpy as np
# import matplotlib.pyplot as plt

# Hyperparameters
# NOTE: only EPOCHS is consumed in this cell. The datasets were batched and the
# model was built and compiled in earlier cells, so editing BATCH_SIZE,
# LEARNING_RATE or DROPOUT_RATE here does not change them.
BATCH_SIZE = 32
EPOCHS = 50
LEARNING_RATE = 0.0005
DROPOUT_RATE = 0.3

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,  # Reduce learning rate by a factor of 0.5
    patience=3,  # Wait 3 epochs of no improvement before reducing
    min_lr=1e-6,  # Lower bound for the learning rate
    verbose=1  # Print updates when learning rate is reduced
)

# Start Training
# NOTE(review): X_test is also used as the validation set, so early stopping
# and the learning-rate schedule are tuned on the same data that is reported
# below. Consider holding out a separate validation split.
print("Starting model training...")
history = model.fit(
    X_train,  # Training dataset with features and labels
    epochs=EPOCHS,
    validation_data=X_test,  # Validation dataset with features and labels
    callbacks=[reduce_lr, early_stopping]
)

# Evaluate the Model
print("Evaluating the model...")
results = model.evaluate(X_test)
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}")

# Collect labels and predicted probabilities for detailed metrics.
# This is done in a single pass, one batch at a time. Labels and predictions
# stay aligned even if the dataset order changes between iterations, and the
# full test set never has to be held in memory as one array.
y_true_batches = []
y_prob_batches = []

for features, labels in X_test:
    y_true_batches.append(labels.numpy())
    y_prob_batches.append(np.asarray(model.predict_on_batch(features)))

y_test_labels = np.concatenate(y_true_batches, axis=0)
y_prob = np.concatenate(y_prob_batches, axis=0).ravel()  # (N, 1) -> (N,)

# Predictions at the 0.5 decision threshold
y_pred = (y_prob > 0.5).astype(int)

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test_labels, y_pred))

# Classification Report
print("Classification Report:")
print(classification_report(y_test_labels, y_pred))

# ROC Curve (reuses the probabilities computed above)
fpr, tpr, _ = roc_curve(y_test_labels, y_prob)
# plt.figure()
# plt.plot(fpr, tpr, label='ROC Curve')
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve')
# plt.legend(loc='lower right')
# plt.show()


Starting model training...
Epoch 1/50
