In [1]:
%pip install requests aria2 netCDF4 numpy xarray

Collecting aria2
  Downloading aria2-0.0.1b0-py3-none-manylinux_2_17_x86_64.whl.metadata (28 kB)
Collecting netCDF4
  Downloading netCDF4-1.7.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting xarray
  Downloading xarray-2024.11.0-py3-none-any.whl.metadata (11 kB)
Collecting cftime (from netCDF4)
  Downloading cftime-1.6.4.post1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.7 kB)
Downloading aria2-0.0.1b0-py3-none-manylinux_2_17_x86_64.whl (5.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading netCDF4-1.7.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading xarray-2024.11.0-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 

In [None]:
import os
import logging
import subprocess
import tarfile

# Logging: timestamped, INFO-level messages on the root logger
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Switch: when False the download step is skipped and only extraction runs
DOWNLOAD_DATA = True

# Local layout
DATA_DIR = './data'  # where the .tar.gz archives live
# Scratch file listing the URLs handed to aria2c
TMP_FILE = os.path.join(DATA_DIR, 'tmp.txt')
# Destination for the unpacked archive contents
EXTRACT_DIR = os.path.join(DATA_DIR, 'extracted')

# Remote source: host/path prefix of the proxy and the bucket served behind it
CUSTOM_ENDPOINT = "bbproxy.meyerstk.com/file"
APP = "TorNetBecauseZenodoSlow"  # Bucket name

# Create the working directories up front so later steps can assume they exist
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(EXTRACT_DIR, exist_ok=True)


def download_links(links):
    """
    Download files from the provided links using aria2c.

    The links are written one per line to TMP_FILE, which is passed to
    aria2c via ``-i`` and removed again afterwards, whether or not the
    download succeeded. Files are saved into DATA_DIR.

    Args:
        links: Iterable of URL strings to download. An empty iterable is a
            no-op.

    Raises:
        subprocess.CalledProcessError: If aria2c exits with a non-zero status.
        OSError: If TMP_FILE cannot be written or aria2c cannot be started.
    """
    # Materialise once: the links are consumed twice below (input file and
    # log line), which would silently lose data if a generator were passed.
    links = list(links)
    if not links:
        logging.info("No links to download.")
        return

    try:
        # Write links to tmp.txt
        with open(TMP_FILE, 'w') as file:
            file.writelines(link + '\n' for link in links)
        logging.info(f"Temporary file created: {TMP_FILE}")

        # Run aria2c to download files
        logging.info(f"Starting downloads for links: {', '.join(links)}")
        command = [
            "aria2c",
            "-j", "3",                # Download up to 3 files concurrently
            "-x", "16",               # Use up to 16 connections per file
            "-s", "16",               # Split each file into 16 segments
            "--dir", DATA_DIR,        # Specify the download directory
            "-i", TMP_FILE            # Input file with download links
        ]
        # check=True turns a non-zero aria2c exit status into an exception
        subprocess.run(command, check=True)
        logging.info("Downloads completed successfully.")
    except Exception as e:
        logging.error(f"Error during download: {e}")
        # Re-raise instead of calling exit(1): exit() is an interactive helper
        # that may not abort the running cell inside a notebook kernel, which
        # would let later steps run against incomplete downloads. Propagating
        # the original exception stops the cell and keeps the real traceback.
        raise
    finally:
        if os.path.exists(TMP_FILE):
            os.remove(TMP_FILE)
            logging.info(f"Temporary file deleted: {TMP_FILE}")


def download_files_with_aria():
    """
    Download the TorNet yearly archives with aria2c.

    Builds one URL per archive from CUSTOM_ENDPOINT and APP (a public
    Backblaze B2 bucket served via a custom endpoint), skips archives that
    are already fully present in DATA_DIR, and hands the rest to
    download_links().
    """
    logging.info("Starting download process with aria2c...")

    # One archive per year, tornet_2013.tar.gz through tornet_2022.tar.gz
    file_list = [f"tornet_{year}.tar.gz" for year in range(2013, 2023)]

    # Construct the public URLs
    links = [f"https://{CUSTOM_ENDPOINT}/{APP}/{file_name}" for file_name in file_list]
    # Original Zenodo locations of the same archives, kept for reference
    # (note: catalog.csv is only listed here, it is not fetched from the bucket):
    #     "https://zenodo.org/records/12655719/files/tornet_2022.tar.gz",
    #     "https://zenodo.org/records/12655718/files/tornet_2021.tar.gz",
    #     "https://zenodo.org/records/12655717/files/tornet_2020.tar.gz",
    #     "https://zenodo.org/records/12655716/files/tornet_2019.tar.gz",
    #     "https://zenodo.org/records/12655187/files/tornet_2018.tar.gz",
    #     "https://zenodo.org/records/12655183/files/tornet_2017.tar.gz",
    #     "https://zenodo.org/records/12655179/files/tornet_2016.tar.gz",
    #     "https://zenodo.org/records/12655151/files/tornet_2015.tar.gz",
    #     "https://zenodo.org/records/12637032/files/tornet_2014.tar.gz",
    #     "https://zenodo.org/records/12636522/files/tornet_2013.tar.gz",
    #     "https://zenodo.org/records/12636522/files/catalog.csv",

    def _needs_download(link):
        """Return True when the archive is missing or only partially downloaded."""
        target = os.path.join(DATA_DIR, os.path.basename(link))
        # aria2c keeps a "<file>.aria2" control file next to the target until
        # the download has finished, so its presence marks an interrupted
        # download that must be resumed rather than treated as complete.
        return not os.path.exists(target) or os.path.exists(target + ".aria2")

    # Filter out already downloaded files
    links_to_download = [link for link in links if _needs_download(link)]

    if links_to_download:
        download_links(links_to_download)
    else:
        logging.info("All files already downloaded.")


def extract_local_tar_files():
    """
    Extract all .tar.gz files from the local DATA_DIR to EXTRACT_DIR.

    Archives are processed in sorted name order. An archive that still has an
    aria2c control file ("<name>.aria2") next to it is skipped, because that
    marks an unfinished download.

    Raises:
        tarfile.TarError: If an archive is corrupt, or (when extraction
            filters are available) contains a member the "data" filter rejects.
    """
    logging.info("Starting extraction process...")

    # The archives come from a remote server, so use the "data" extraction
    # filter where the running Python provides it; it refuses members that
    # would be written outside the destination directory. Interpreters
    # without extraction filters fall back to the previous behaviour.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    for file_name in sorted(os.listdir(DATA_DIR)):
        if not file_name.endswith('.tar.gz'):
            continue
        file_path = os.path.join(DATA_DIR, file_name)
        if os.path.exists(file_path + ".aria2"):
            logging.warning(f"Skipping {file_path}: download is incomplete.")
            continue
        logging.info(f'Extracting {file_path}...')
        with tarfile.open(file_path, 'r:gz') as tar:
            tar.extractall(path=EXTRACT_DIR, **extract_kwargs)
        logging.info(f'Extracted {file_path} to {EXTRACT_DIR}')


# Fetch the archives only when downloading is enabled; with DOWNLOAD_DATA set
# to False the notebook works purely from archives already present in DATA_DIR.
if DOWNLOAD_DATA:
    download_files_with_aria()

# Unpack every local .tar.gz archive from DATA_DIR into EXTRACT_DIR
# (runs regardless of the DOWNLOAD_DATA switch).
extract_local_tar_files()

2024-12-04 15:31:28,772 - INFO - Starting download process with aria2c...
2024-12-04 15:31:28,860 - INFO - Temporary file created: ./data/tmp.txt
2024-12-04 15:31:28,861 - INFO - Starting downloads for links: https://bbproxy.meyerstk.com/file/TorNetBecauseZenodoSlow/tornet_2013.tar.gz, https://bbproxy.meyerstk.com/file/TorNetBecauseZenodoSlow/tornet_2014.tar.gz, https://bbproxy.meyerstk.com/file/TorNetBecauseZenodoSlow/tornet_2015.tar.gz, https://bbproxy.meyerstk.com/file/TorNetBecauseZenodoSlow/tornet_2016.tar.gz, https://bbproxy.meyerstk.com/file/TorNetBecauseZenodoSlow/tornet_2017.tar.gz, https://bbproxy.meyerstk.com/file/TorNetBecauseZenodoSlow/tornet_2018.tar.gz, https://bbproxy.meyerstk.com/file/TorNetBecauseZenodoSlow/tornet_2019.tar.gz, https://bbproxy.meyerstk.com/file/TorNetBecauseZenodoSlow/tornet_2020.tar.gz, https://bbproxy.meyerstk.com/file/TorNetBecauseZenodoSlow/tornet_2021.tar.gz, https://bbproxy.meyerstk.com/file/TorNetBecauseZenodoSlow/tornet_2022.tar.gz



12/04 15:31:28 [[1;36mINFO[0m] <<--- --- --- ---

12/04 15:31:28 [[1;36mINFO[0m]   --- --- --- ---

12/04 15:31:28 [[1;36mINFO[0m]   --- --- --- --->>

12/04 15:31:28 [[1;36mINFO[0m] aria2 1.37.0

12/04 15:31:28 [[1;36mINFO[0m] gcc 11.2.1 20211120
  built by  x86_64-pc-linux-gnu
  targeting x86_64-pc-linux-musl
  on        Nov 27 2023 08:29:14

12/04 15:31:28 [[1;36mINFO[0m] Linux 5.15.167.4-microsoft-standard-WSL2 #1 SMP Tue Nov 5 00:21:55 UTC 2024 x86_64

12/04 15:31:28 [[1;36mINFO[0m] zlib/1.3.0.zlib-ng libxml2/2.12.1 sqlite3/3.44.1 OpenSSL/3.2.0 c-ares/1.22.1 libssh2/1.11.0

12/04 15:31:28 [[1;36mINFO[0m] Logging started.

12/04 15:31:28 [[1;36mINFO[0m] Checking configured addresses

12/04 15:31:28 [[1;36mINFO[0m] Not considered: 127.0.0.1

12/04 15:31:28 [[1;36mINFO[0m] Found configured address: 172.21.0.2

12/04 15:31:28 [[1;36mINFO[0m] Not considered: ::1

12/04 15:31:28 [[1;36mINFO[0m] Not considered: fe80::42:acff:fe15:2%eth0

12/04 15:31:28 [[1;36m

In [2]:
import os
import xarray as xr
import numpy as np

# Names of the dataset variables read from every NetCDF file; each one becomes
# a channel per (time step, sweep) pair in load_nc_file.
VARIABLES = ['DBZ', 'VEL', 'KDP', 'RHOHV', 'ZDR', 'WIDTH']
# Number of indices taken along the "time" dimension (0 .. TIME_STEPS-1)
TIME_STEPS = 3
# Number of indices taken along the "sweep" dimension (0 .. SWEEPS-1)
SWEEPS = 2
# Spatial size of one channel in the stacked sample array
# NOTE(review): assumes every file's 2-D slices are exactly 120 x 240 - confirm against the data
IMAGE_HEIGHT = 120
IMAGE_WIDTH = 240

def load_nc_file(file_path):
    """
    Read one NetCDF file into a stacked channel array plus a binary label.

    Every combination of time step, sweep and variable contributes one 2-D
    slice; the slices are stacked along the last axis in that nesting order
    (time outermost, variable innermost). NaN and +/-inf values are replaced
    by 0.

    Returns:
        (data, label): ``data`` is a float32 array of shape
        (IMAGE_HEIGHT, IMAGE_WIDTH, len(VARIABLES) * TIME_STEPS * SWEEPS);
        ``label`` is 1 when the file name contains 'TOR', otherwise 0.
    """
    # Flatten the three nested loops into one ordered plan of channels
    channel_plan = [
        (step, sweep, name)
        for step in range(TIME_STEPS)
        for sweep in range(SWEEPS)
        for name in VARIABLES
    ]

    with xr.open_dataset(file_path, engine="netcdf4") as dataset:
        stacked = np.empty(
            (IMAGE_HEIGHT, IMAGE_WIDTH, len(VARIABLES) * TIME_STEPS * SWEEPS),
            dtype=np.float32,
        )
        for channel, (step, sweep, name) in enumerate(channel_plan):
            plane = dataset[name].isel(time=step, sweep=sweep).values
            # Scrub non-finite values in place before copying into the stack
            np.nan_to_num(plane, copy=False, nan=0, posinf=0, neginf=0)
            stacked[:, :, channel] = plane

    # The class is encoded in the file name only
    if 'TOR' in os.path.basename(file_path):
        label = 1
    else:
        label = 0
    return stacked, label

def create_dataset(directory):
    """
    Load all NetCDF files from a directory and return arrays of data and labels.

    The directory tree is walked recursively in sorted order so that the
    sample order is reproducible across runs and file systems.

    Args:
        directory: Root directory searched for files ending in '.nc'.

    Returns:
        (data, labels): Two NumPy arrays with one entry per file, built from
        the values returned by load_nc_file. Both have shape (0,) when no
        file was found.

    Warns:
        RuntimeWarning: If no '.nc' file exists under ``directory`` (including
            the case where the directory itself does not exist), since
            os.walk would otherwise yield nothing without any hint.
    """
    import warnings

    data, labels = [], []
    for root, dirs, files in os.walk(directory):
        # Sorting dirs in place makes os.walk descend in a deterministic order
        dirs.sort()
        for file in sorted(files):
            if file.endswith('.nc'):
                img, label = load_nc_file(os.path.join(root, file))
                data.append(img)
                labels.append(label)

    if not data:
        warnings.warn(
            f"No .nc files found under {directory!r}; returning empty arrays.",
            RuntimeWarning,
            stacklevel=2,
        )
    return np.array(data), np.array(labels)

# Dataset paths
# EXTRACT_DIR is defined in the download/extraction cell, so that cell must
# have been run in this kernel first.
# NOTE(review): assumes the archives unpack into "train" and "test" sub-directories - confirm
TRAIN_DIR = os.path.join(EXTRACT_DIR, "train")
TEST_DIR = os.path.join(EXTRACT_DIR, "test")

# Load datasets (every .nc file found below each directory is read into memory)
X_train, y_train = create_dataset(TRAIN_DIR)
X_test, y_test = create_dataset(TEST_DIR)

# Print dataset shapes; a shape of (0,) means no .nc files were found
print(f"Train data shape: {X_train.shape}, Train labels shape: {y_train.shape}")
print(f"Test data shape: {X_test.shape}, Test labels shape: {y_test.shape}")

Train data shape: (0,), Train labels shape: (0,)
Test data shape: (0,), Test labels shape: (0,)


In [None]:
from tensorflow.keras import models, layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow.keras.regularizers import l2

# Model Definition
def create_torcnn(input_shape=(120, 240, 36), dropout_rate=0.3,
                  learning_rate=0.0005, l2_strength=0.01,
                  dense_dropout_rate=0.4):
    """
    Define and compile the CNN model for tornado detection.

    Architecture: four convolutional blocks (32, 64, 128 and 256 filters; the
    first three with two 3x3 convolutions, the last with one), each followed
    by batch normalisation, 2x2 max pooling and dropout; then two L2-regularised
    dense layers (128 and 64 units) with batch normalisation and dropout, and a
    single sigmoid output unit.

    Args:
        input_shape: Shape of one sample as (height, width, channels).
        dropout_rate: Dropout rate applied after every convolutional block.
        learning_rate: Learning rate of the Adam optimizer.
        l2_strength: L2 regularisation factor for the dense layer kernels.
        dense_dropout_rate: Dropout rate applied after each dense layer.

    Returns:
        A compiled ``Sequential`` model using binary cross-entropy loss and
        tracking accuracy, precision, recall and AUC.
    """
    # Declare the input with an explicit Input layer; this is the recommended
    # way to give a Sequential model its input shape, rather than passing
    # input_shape to the first Conv2D layer.
    stack = [layers.Input(shape=input_shape)]

    # (filters, number of stacked 3x3 convolutions) for each convolutional block
    conv_blocks = ((32, 2), (64, 2), (128, 2), (256, 1))
    for filters, n_convs in conv_blocks:
        stack.extend(
            layers.Conv2D(filters, (3, 3), activation='relu', padding='same')
            for _ in range(n_convs)
        )
        stack.extend([
            layers.BatchNormalization(),
            layers.MaxPooling2D((2, 2)),
            layers.Dropout(dropout_rate),
        ])

    # Fully connected head
    stack.append(layers.Flatten())
    for units in (128, 64):
        stack.extend([
            layers.Dense(units, activation='relu',
                         kernel_regularizer=l2(l2_strength)),
            layers.BatchNormalization(),
            layers.Dropout(dense_dropout_rate),
        ])

    # Output layer: one sigmoid unit for the binary label
    stack.append(layers.Dense(1, activation='sigmoid'))

    model = models.Sequential(stack)

    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=BinaryCrossentropy(),
        metrics=['accuracy', Precision(), Recall(), AUC()]
    )
    return model

# Create Model
# Derive the input shape from the preprocessing constants instead of repeating
# the literal 120 x 240, so the model always matches the arrays produced by
# load_nc_file (one channel per variable, time step and sweep).
input_shape = (IMAGE_HEIGHT, IMAGE_WIDTH, len(VARIABLES) * TIME_STEPS * SWEEPS)
model = create_torcnn(input_shape=input_shape)

In [None]:
# Training hyper-parameters
BATCH_SIZE = 32
EPOCHS = 10

# Fail early with a clear message: when no .nc files were found, the dataset
# arrays are empty and model.fit would otherwise stop with a much less obvious
# error.
if len(X_train) == 0 or len(X_test) == 0:
    raise RuntimeError(
        "Training or test data is empty - make sure the archives were "
        "downloaded and extracted before running the training cell."
    )

# NOTE(review): the test split is also used as validation data here, so the
# per-epoch validation metrics are not independent of the final evaluation.
# Consider holding out part of the training data for validation instead.
print("Starting model training...")
history = model.fit(
    X_train, y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test, y_test)
)

In [None]:
# Final evaluation on the test arrays.
# NOTE(review): results presumably holds the loss followed by the metrics in
# the order given to model.compile (accuracy, precision, recall, AUC) - confirm.
# Only the first two entries are reported below.
print("Evaluating the model...")
results = model.evaluate(X_test, y_test)
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}")