In [1]:
import os
import requests

from datetime import datetime
from tqdm import tqdm

from pyspark import SparkFiles
from pyspark.sql import SparkSession

# One local-mode Spark session for the notebook; getOrCreate() hands back an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Irish Marine Institute - Data Prep")
    .getOrCreate()
)

In [2]:
# Where downloaded files are stored, and how the download URLs are assembled.
dataset_dir = "dataset"
dataset_base_url = "https://erddap.marine.ie/erddap/tabledap/"
dataset_ids = [
    "allDatasets",
    "IrishNationalTideGaugeNetwork",
    "IMI-TidePrediction",
    "IWaveBNetwork30Min",
    "IWaveBNetwork_spectral",
]
dataset_format = "csv"

# Bounds of the time window requested for the observation downloads.
t_0 = datetime(year=2003, month=1, day=1)
t_1 = datetime(year=2024, month=1, day=1)

In [3]:
def fetch_data(dataset_id, file_format, keys = None, t_range = None, orderby = None, distinct = False, timeout = (30, 600)):
    """Download one tabledap query result into ``dataset_dir`` and return its path.

    The target file is ``<dataset_id>.<file_format>``, or
    ``<dataset_id>_meta.<file_format>`` when ``distinct`` is set. If that file
    already exists, nothing is downloaded and the existing path is returned.

    Args:
        dataset_id: Dataset identifier appended to ``dataset_base_url``.
        file_format: File extension requested from the server and used locally.
        keys: Optional iterable of column names to request; all columns if empty.
        t_range: Optional sequence of datetimes. The first element becomes a
            ``time>=`` constraint; a ``time<=`` constraint is added only when
            exactly two elements are given.
        orderby: Optional iterable of column names for an ``orderBy(...)`` clause.
        distinct: When true, add ``distinct()`` to the query and use the
            ``_meta`` file-name suffix.
        timeout: Passed to ``requests.get``; ``(connect, read)`` seconds, or
            ``None`` to wait indefinitely.

    Returns:
        Path of the local file (relative to the working directory).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    os.makedirs(dataset_dir, exist_ok=True)
    file_name = f"{dataset_id}{'_meta' if distinct else ''}.{file_format}"
    file_path = os.path.join(dataset_dir, file_name)

    # Cached from an earlier run: skip building the query altogether.
    if os.path.isfile(file_path):
        print(f"File {file_path} already exists")
        return file_path

    keys_str = ','.join(keys) if keys else ''
    orderby_str = f"&orderBy(\"{','.join(orderby)}\")" if orderby else ''
    t_range_str = ''
    if t_range:
        t_range_str += f"&time>={t_range[0].isoformat()}"
        if len(t_range) == 2:
            t_range_str += f"&time<={t_range[1].isoformat()}"

    print(f"Downloading {file_path} ...")
    url = f"{dataset_base_url}{dataset_id}.{file_format}?{keys_str}{'&distinct()' if distinct else ''}{t_range_str}{orderby_str}"

    # Write to a side file and rename only after a complete, successful
    # download, so an error page or a truncated transfer is never mistaken for
    # a cached dataset on the next run.
    partial_path = f"{file_path}.part"
    try:
        with requests.get(url, allow_redirects=True, stream=True, timeout=timeout) as res:
            # Fail loudly instead of saving the server's error body as data.
            res.raise_for_status()
            with open(partial_path, 'wb') as f:
                # Stream in 1 MiB chunks rather than buffering the whole body.
                for chunk in res.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        os.replace(partial_path, file_path)
    finally:
        # After a successful rename the side file is gone and this is a no-op.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    return file_path

In [4]:
# Download everything up front and remember each local path.
# "allDatasets" yields a single distinct, ordered listing; every other dataset
# yields a distinct station listing plus the records between t_0 and t_1.
data_paths = []
for dataset_id in tqdm(dataset_ids):
    if dataset_id == "allDatasets":
        data_paths.append(
            fetch_data(
                "allDatasets",
                dataset_format,
                keys=None,
                t_range=None,
                orderby=["institution", "datasetID", "title"],
                distinct=True,
            )
        )
        continue

    # The tide-prediction dataset spells its station column differently.
    s_id = "station_id" if dataset_id != 'IMI-TidePrediction' else "stationID"
    data_paths.append(
        fetch_data(
            dataset_id,
            dataset_format,
            keys=["longitude", "latitude", s_id],
            t_range=None,
            orderby=None,
            distinct=True,
        )
    )
    data_paths.append(
        fetch_data(
            dataset_id,
            dataset_format,
            keys=None,
            t_range=[t_0, t_1],
            orderby=None,
        )
    )

  0%|          | 0/5 [00:00<?, ?it/s]

Downloading dataset\allDatasets_meta.csv ...


 20%|██        | 1/5 [00:00<00:01,  2.88it/s]

Downloading dataset\IrishNationalTideGaugeNetwork_meta.csv ...
Downloading dataset\IrishNationalTideGaugeNetwork.csv ...


 40%|████      | 2/5 [02:40<04:43, 94.43s/it]

Downloading dataset\IMI-TidePrediction_meta.csv ...
Downloading dataset\IMI-TidePrediction.csv ...


 60%|██████    | 3/5 [02:55<01:55, 57.89s/it]

Downloading dataset\IWaveBNetwork30Min_meta.csv ...
Downloading dataset\IWaveBNetwork30Min.csv ...


 80%|████████  | 4/5 [04:15<01:06, 66.66s/it]

Downloading dataset\IWaveBNetwork_spectral_meta.csv ...
Downloading dataset\IWaveBNetwork_spectral.csv ...


100%|██████████| 5/5 [04:22<00:00, 52.49s/it]


In [5]:
# Register each downloaded file with the Spark context.
spark_context = spark.sparkContext
for data_path in data_paths:
    spark_context.addFile(data_path)

In [6]:
# SparkFiles.get() joins the given name onto Spark's own files directory, so it
# needs the bare file name that addFile() registered. An absolute path would
# make that join return the path unchanged and bypass the registered copy.
df = spark.read.csv(SparkFiles.get('allDatasets_meta.csv'), header=True, inferSchema=True)

In [7]:
# Print the column names and the types Spark inferred for the allDatasets listing.
df.printSchema()

root
 |-- datasetID: string (nullable = true)
 |-- accessible: string (nullable = true)
 |-- institution: string (nullable = true)
 |-- dataStructure: string (nullable = true)
 |-- cdm_data_type: string (nullable = true)
 |-- class: string (nullable = true)
 |-- title: string (nullable = true)
 |-- minLongitude: string (nullable = true)
 |-- maxLongitude: string (nullable = true)
 |-- longitudeSpacing: string (nullable = true)
 |-- minLatitude: string (nullable = true)
 |-- maxLatitude: string (nullable = true)
 |-- latitudeSpacing: string (nullable = true)
 |-- minAltitude: string (nullable = true)
 |-- maxAltitude: string (nullable = true)
 |-- minTime: string (nullable = true)
 |-- maxTime: string (nullable = true)
 |-- timeSpacing: string (nullable = true)
 |-- griddap: string (nullable = true)
 |-- subset: string (nullable = true)
 |-- tabledap: string (nullable = true)
 |-- MakeAGraph: string (nullable = true)
 |-- sos: string (nullable = true)
 |-- wcs: string (nullable = true)


In [8]:
# Display the leading records with one field per line (vertical layout).
# NOTE(review): in the saved output, record 0 holds unit strings (degrees_east,
# m, UTC, seconds) rather than data — presumably a units line in the CSV that
# was read as a row; confirm and filter it out before analysis.
df.show(vertical=True)

-RECORD 0--------------------------------
 datasetID        | null                 
 accessible       | null                 
 institution      | null                 
 dataStructure    | null                 
 cdm_data_type    | null                 
 class            | null                 
 title            | null                 
 minLongitude     | degrees_east         
 maxLongitude     | degrees_east         
 longitudeSpacing | degrees_east         
 minLatitude      | degrees_north        
 maxLatitude      | degrees_north        
 latitudeSpacing  | degrees_north        
 minAltitude      | m                    
 maxAltitude      | m                    
 minTime          | UTC                  
 maxTime          | UTC                  
 timeSpacing      | seconds              
 griddap          | null                 
 subset           | null                 
 tabledap         | null                 
 MakeAGraph       | null                 
 sos              | null          