In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


GC-Net Data

In [None]:
!pip install git+https://github.com/GEUS-PROMICE/pyNEAD.git

In [27]:
#!/usr/bin/env bash

# Latest L1 data: https://github.com/GEUS-Glaciology-and-Climate/GC-Net-level-1-data-processing/tree/main/L1
# API contents of latest L1 data (raw URLs etc.): https://api.github.com/repositories/319306521/contents/L1

import os
#os.chdir('/content/drive/MyDrive/Master_Thesis')
# Every path used below ("metadata/...", "data/...") is relative to the
# directory one level above the one the kernel starts in.
# NOTE(review): this move is relative to the *current* directory, so re-running
# this cell without restarting the kernel climbs one more level each time.
os.chdir("../")
print(os.getcwd())


# Create the download directories. `exist_ok=True` makes this safe to re-run
# and guarantees that both sub-directories exist even when "data" was already
# there (a chain of os.mkdir calls stops at the first existing directory).
# Real failures such as missing permissions are raised instead of hidden.
data_dir_existed = os.path.isdir("data")
for directory in ("data/data_daily", "data/data_hourly"):
    os.makedirs(directory, exist_ok=True)

if data_dir_existed:
    # Nothing is overwritten: the download loops skip files that already exist
    print('Reusing existing directory "data"; files already downloaded are kept')

import urllib.request

# Download data
print("Downloading daily data...\r")


# Python equivalent of:
# xargs -n 1 curl --silent -O --output-dir data_daily < ../metadata/urls_1.txt
os.makedirs("data/data_daily", exist_ok=True)

# The URL list holds one URL per line; the `with` block closes it when done
with open("metadata/urls_1.txt") as url_list:
    for line in url_list:
        # Drop the line ending (\n or \r\n) and surrounding whitespace
        url = line.strip()
        if not url:
            # Blank lines (e.g. a trailing newline at the end of the list) carry no URL
            continue

        # Everything to the right of the last "/" is the file name
        name = url.rsplit("/", 1)[-1]
        # Combine the name and the downloads directory to get the local filename
        filename = os.path.join("data/data_daily", name)

        # Download the file if it does not exist
        if not os.path.isfile(filename):
            # Fetch under a temporary name first so that an interrupted transfer
            # is never mistaken for a complete file on the next run
            partial = filename + ".part"
            urllib.request.urlretrieve(url, partial)
            os.replace(partial, filename)


/content/drive/MyDrive/Master_Thesis
Overwritting existing data in "/data"
Downloading daily data...


In [28]:
print("Downloading hourly data...\r")
# Python equivalent of:
# xargs -n 1 curl --silent -O --output-dir data_hourly < ../metadata/urls_2.txt
os.makedirs("data/data_hourly", exist_ok=True)

# The URL list holds one URL per line; the `with` block closes it when done
with open("metadata/urls_2.txt") as url_list:
    for line in url_list:
        # Drop the line ending (\n or \r\n) and surrounding whitespace
        url = line.strip()
        if not url:
            # Blank lines (e.g. a trailing newline at the end of the list) carry no URL
            continue

        # Everything to the right of the last "/" is the file name
        name = url.rsplit("/", 1)[-1]
        # Combine the name and the downloads directory to get the local filename
        filename = os.path.join("data/data_hourly", name)

        # Download the file if it does not exist
        if not os.path.isfile(filename):
            # Fetch under a temporary name first so that an interrupted transfer
            # is never mistaken for a complete file on the next run
            partial = filename + ".part"
            urllib.request.urlretrieve(url, partial)
            os.replace(partial, filename)

Downloading hourly data...


In [29]:
# Process data
# echo -ne 'Processing dairly data...\r'
# python scripts/process_data_daily.py
# echo -ne 'Processing hourly data...\r'
# python scripts/process_data_hourly.py

# Delete unprocessed data
# rm -r data_daily data_hourly

# %% process data daily
import pandas as pd
import nead

# Load every station's daily NEAD file into its own dataframe
station = pd.read_csv("metadata/station_info.csv", header=0)
dfs_daily = []

for station_name, station_id in zip(station["Name"], station["ID"]):
    # File names follow "<two-digit ID>-<station name without spaces>_daily.csv"
    compact_name = station_name.replace(" ", "")
    daily_file = f"data/data_daily/{str(station_id).zfill(2)}-{compact_name}_daily.csv"
    station_frame = nead.read(daily_file, index_col=0).to_dataframe()
    # Record the originating station as the first column
    station_frame.insert(loc=0, column="station_name", value=station_name)
    dfs_daily.append(station_frame)

# Columns removed as irrelevant (described in the notebook as null columns and
# flag columns). Candidates can be listed with
#   df.columns[df.isnull().all()]  and  df.filter(regex="flag$").columns
unused_columns = [
    "OSWR_max",
    "HW2_adj_flag",
    "P_adj_flag",
    "HW1_adj_flag",
    "OSWR_adj_flag",
    "HS1_adj_flag",
    "HS2_adj_flag",
    "TA3_adj_flag",
    "TA4_adj_flag",
    "DW1_adj_flag",
]

# Month number -> season name
seasons = {
    month: season
    for season, months in (
        ("Winter", (12, 1, 2)),
        ("Spring", (3, 4, 5)),
        ("Summer", (6, 7, 8)),
        ("Autumn", (9, 10, 11)),
    )
    for month in months
}

# Stack all stations, order by timestamp, drop the unused columns and derive
# the calendar columns from the datetime index while it is still the index
df_daily = (
    pd.concat(dfs_daily)
    .sort_index()
    .drop(columns=unused_columns)
    .assign(
        season=lambda frame: frame.index.month.map(seasons),
        year=lambda frame: frame.index.strftime("%Y"),  # four-digit year as text
        month=lambda frame: frame.index.strftime("%B"),  # full month name
    )
    # Turn the index into a regular column, then rename it together with the
    # station column
    .reset_index()
    .rename(columns={"timestamp": "Datetime", "station_name": "file"})
)

# Add Day of Year & Day of Century
# NOTE(review): the DayOfCentury formula counts 365 days for every year, so
# 31 Dec of a leap year gets the same number as 1 Jan of the following year,
# and the count starts at year 1 rather than at a century boundary - confirm
# this is what downstream code expects.
timestamps = df_daily["Datetime"]
df_daily["DayOfYear"] = timestamps.dt.dayofyear
df_daily["DayOfCentury"] = timestamps.dt.dayofyear + 365 * (timestamps.dt.year - 1)

# Replace the raw field names by their display descriptions from the master table
header = pd.read_csv("metadata/Masterdata_GCNET.csv", sep=";")
display_names = header.set_index("fields")["display_description"]
df_daily = df_daily.rename(columns=display_names)

# Save dataframe as parquet file
df_daily.to_parquet("data/df_daily.gzip", compression="gzip")

# %% process data hourly
# Load every station's hourly NEAD file into its own dataframe
station = pd.read_csv("metadata/station_info.csv", header=0)
dfs_hourly = []

for station_name, station_id in zip(station["Name"], station["ID"]):
    # File names follow "<two-digit ID>-<station name without spaces>.csv"
    compact_name = station_name.replace(" ", "")
    hourly_file = f"data/data_hourly/{str(station_id).zfill(2)}-{compact_name}.csv"
    station_frame = nead.read(hourly_file, index_col=0).to_dataframe()
    # Record the originating station as the first column
    station_frame.insert(loc=0, column="station_name", value=station_name)
    dfs_hourly.append(station_frame)

# Columns removed as irrelevant (described in the notebook as null columns and
# flag columns). Candidates can be listed with
#   df.columns[df.isnull().all()]  and  df.filter(regex="flag$").columns
unused_columns = [
    "OSWR_max",
    "HW2_adj_flag",
    "P_adj_flag",
    "HW1_adj_flag",
    "OSWR_adj_flag",
    "HS1_adj_flag",
    "HS2_adj_flag",
    "TA3_adj_flag",
    "TA4_adj_flag",
    "DW1_adj_flag",
]

# Month number -> season name
seasons = {
    month: season
    for season, months in (
        ("Winter", (12, 1, 2)),
        ("Spring", (3, 4, 5)),
        ("Summer", (6, 7, 8)),
        ("Autumn", (9, 10, 11)),
    )
    for month in months
}

# Stack all stations, order by timestamp, drop the unused columns and derive
# the calendar columns from the datetime index while it is still the index
df_hourly = (
    pd.concat(dfs_hourly)
    .sort_index()
    .drop(columns=unused_columns)
    .assign(
        year=lambda frame: frame.index.strftime("%Y"),  # four-digit year as text
        month=lambda frame: frame.index.strftime("%B"),  # full month name
        season=lambda frame: frame.index.month.map(seasons),
    )
    # Turn the index into a regular column, then rename it together with the
    # station column
    .reset_index()
    .rename(columns={"timestamp": "Datetime", "station_name": "file"})
)

# Add Day of Year & Day of Century
# NOTE(review): the DayOfCentury formula counts 365 days for every year, so
# 31 Dec of a leap year gets the same number as 1 Jan of the following year,
# and the count starts at year 1 rather than at a century boundary - confirm
# this is what downstream code expects.
timestamps = df_hourly["Datetime"]
df_hourly["DayOfYear"] = timestamps.dt.dayofyear
df_hourly["DayOfCentury"] = timestamps.dt.dayofyear + 365 * (timestamps.dt.year - 1)

# Replace the raw field names by their display descriptions from the master table
header = pd.read_csv("metadata/Masterdata_GCNET.csv", sep=";")
display_names = header.set_index("fields")["display_description"]
df_hourly = df_hourly.rename(columns=display_names)

# Save dataframe as parquet file
df_hourly.to_parquet("data/df_hourly.gzip", compression="gzip")

PROMICE Data

In [None]:
# Download every file of the dataset with the DOI below from the GEUS Dataverse.
#   -r                     follow the links of the directory index recursively
#   -e robots=off          do not honour robots.txt
#   -nH --cut-dirs=3       drop the host name and the first three path components locally
#   --content-disposition  name each file as announced by the server
# No target directory is given, so wget writes into the current working directory.
# NOTE(review): the loading cells below look for the files in `/content` - confirm both locations match.
!wget -r -e robots=off -nH --cut-dirs=3 --content-disposition "https://dataverse.geus.dk/api/datasets/:persistentId/dirindex?persistentId=doi:10.22008/FK2/8SS7EW"

In [None]:
# save hourly data: merge all hourly station files into a single parquet file

from pathlib import Path
import pandas as pd
import numpy as np

# Directory holding the downloaded station files (also used by the daily and monthly cells)
# NOTE(review): `/content` is the Colab default directory - adjust when running elsewhere.
path = r'/content'  # or unix / linux / mac path

# Collect the hourly files; sorting makes the row order of the result
# independent of the order in which the file system lists them
files = sorted(Path(path).glob('*_hour_v03.txt'))  # .rglob to get subdirectories
if not files:
    # Fail with an explicit message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError(f"No '*_hour_v03.txt' files found in {path!r}")

dfs = list()
for f in files:
    # Columns are separated by runs of whitespace
    data = pd.read_csv(f, delimiter=r"\s+", engine='python')
    # .stem is method for pathlib objects to get the filename w/o the extension
    data['file'] = f.stem
    dfs.append(data)

df = pd.concat(dfs, ignore_index=True)

# -999 is presumably the missing-value marker of the source files; store it as NaN
df = df.replace(-999, np.nan)

# pandas assembles timestamps from columns named Year/Month/Day/Hour, so copy
# the source columns to those names
df["Month"] = df["MonthOfYear"]
df["Day"] = df["DayOfMonth"]
df["Hour"] = df["HourOfDay(UTC)"]

# No `format` here: it applies to string parsing, not to assembling from
# columns (and "%h" was not the hour directive anyway)
df["Datetime"] = pd.to_datetime(df[["Year", "Month", "Day", "Hour"]])

# Save dataframe as parquet file
df.to_parquet("data/promice_hourly.gzip", compression="gzip")
#df.to_parquet("/content/drive/MyDrive/Master_Thesis/data/promice_hourly.gzip", compression="gzip")

In [None]:
# save daily data: merge all daily station files into a single parquet file
# Collect the daily files; sorting makes the row order of the result
# independent of the order in which the file system lists them
files = sorted(Path(path).glob('*_day_v03.txt'))  # .rglob to get subdirectories
if not files:
    # Fail with an explicit message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError(f"No '*_day_v03.txt' files found in {path!r}")

dfs = list()
for f in files:
    # Columns are separated by runs of whitespace
    data = pd.read_csv(f, delimiter=r"\s+", engine='python')
    # .stem is method for pathlib objects to get the filename w/o the extension
    data['file'] = f.stem
    dfs.append(data)

df = pd.concat(dfs, ignore_index=True)

# -999 is presumably the missing-value marker of the source files; store it as NaN
df = df.replace(-999, np.nan)

# pandas assembles timestamps from columns named Year/Month/Day, so copy the
# source columns to those names
df["Month"] = df["MonthOfYear"]
df["Day"] = df["DayOfMonth"]

# No `format` here: it applies to string parsing, not to assembling from columns
df["Datetime"] = pd.to_datetime(df[["Year", "Month", "Day"]])

# Save dataframe as parquet file
df.to_parquet("data/promice_daily.gzip", compression="gzip")
#df.to_parquet("/content/drive/MyDrive/Master_Thesis/data/promice_daily.gzip", compression="gzip")

In [None]:
# save monthly data: merge all monthly station files into a single parquet file
# Collect the monthly files; sorting makes the row order of the result
# independent of the order in which the file system lists them
files = sorted(Path(path).glob('*_month_v03.txt'))  # .rglob to get subdirectories
if not files:
    # Fail with an explicit message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError(f"No '*_month_v03.txt' files found in {path!r}")

dfs = list()
for f in files:
    # Columns are separated by runs of whitespace
    data = pd.read_csv(f, delimiter=r"\s+", engine='python')
    # .stem is method for pathlib objects to get the filename w/o the extension
    data['file'] = f.stem
    dfs.append(data)

df = pd.concat(dfs, ignore_index=True)

# -999 is presumably the missing-value marker of the source files; store it as NaN
df = df.replace(-999, np.nan)

df["Month"] = df["MonthOfYear"]

# Each monthly record is stamped with the first day of its month. Assembling
# from columns is vectorized and matches how the hourly and daily cells build
# their timestamps; the helper "Day" column is not added to `df`.
df["Datetime"] = pd.to_datetime(df[["Year", "Month"]].assign(Day=1))

# Save dataframe as parquet file
df.to_parquet("data/promice_monthly.gzip", compression="gzip")
#df.to_parquet("/content/drive/MyDrive/Master_Thesis/data/promice_monthly.gzip", compression="gzip")