In [16]:
import pandas as pd
import requests
import glob
import os

In [8]:
columns = [
  "Station_ID",
  "Station_name",
  "DATE",
  "Latitude",
  "Longitude",
  "Elevation",
  "temperature",
  "wind_speed",
  "relative_humidity",
  "wet_bulb_temperature",
  "altimeter",
  "precipitation"
]

df = pd.read_parquet('./GHCNh_AAI0000TNCA_2024.parquet', engine="pyarrow", columns=columns)

In [9]:
# Print out a list of columns:
for col in df.columns:
  print(col)

# Dump the list of columns to a text file:
# with open('ghcnh_columns.txt', 'w') as f:
#   for col in df.columns:
#     f.write(col + '\n')

# Print out the first 5 rows of the dataset:
print(df.head())

Station_ID
Station_name
DATE
Latitude
Longitude
Elevation
temperature
wind_speed
relative_humidity
wet_bulb_temperature
altimeter
precipitation
    Station_ID        Station_name                 DATE  Latitude  Longitude  \
0  AAI0000TNCA  REINA BEATRIX INTL  2024-01-01T00:00:00   12.5014   -70.0152   
1  AAI0000TNCA  REINA BEATRIX INTL  2024-01-01T01:00:00   12.5014   -70.0152   
2  AAI0000TNCA  REINA BEATRIX INTL  2024-01-01T02:00:00   12.5014   -70.0152   
3  AAI0000TNCA  REINA BEATRIX INTL  2024-01-01T03:00:00   12.5014   -70.0152   
4  AAI0000TNCA  REINA BEATRIX INTL  2024-01-01T04:00:00   12.5014   -70.0152   

   Elevation  temperature  wind_speed  relative_humidity  \
0       18.3         28.0         7.7                 74   
1       18.3         28.0         7.2                 74   
2       18.3         28.0         7.7                 74   
3       18.3         28.0         8.2                 70   
4       18.3         28.0         9.3                 74   

   wet_bulb_te

0       2024-01-01T00:00:00
1       2024-01-01T01:00:00
2       2024-01-01T02:00:00
3       2024-01-01T03:00:00
4       2024-01-01T04:00:00
               ...         
8114    2024-12-05T17:00:00
8115    2024-12-05T18:00:00
8116    2024-12-05T19:00:00
8117    2024-12-05T20:00:00
8118    2024-12-05T21:00:00
Name: DATE, Length: 8119, dtype: object

In [15]:
# https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/access/by-year/2023/parquet/GHCNh_AGM00060371_2023.parquet
base_url = "https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/access/by-year/{}/parquet/{}"

year = 2023
station_id = "AGM00060371"
download_filename = f"GHCNh_{station_id}_{year}.parquet"
output_filename = f"./ghcn_hourly_data/GHCNh_{station_id}_{year}.parquet"

# Download the file from the NOAA website
url = base_url.format(year, download_filename)
print(url)
response = requests.get(url)

# Save the file to the local directory
with open(output_filename, 'wb') as f:
  f.write(response.content)

https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/access/by-year/2023/parquet/GHCNh_AGM00060371_2023.parquet


In [21]:
# Get a list of all station IDs
index_folder = "./ghcn-hourly_v1.0.0_d2023_c20240709-inv"
index_files = glob.glob(os.path.join(index_folder, "*.psv"))
index_files

station_ids = []
for filepath in index_files:
  station_id = os.path.basename(filepath).split(".")[0].split("_")[1]
  print(station_id)
  station_ids.append(station_id)

station_ids

with open("ghcn_station_ids.txt", "w") as f:
  for station_id in station_ids:
    f.write(station_id + "\n")

EZI0000LKVO
AUM00011389
ITI0000LIMK
USL000TKEA2
USW00023122
USW00024285
CHM00058527
SWI0000ESMQ
INM00042450
NLI0000EHAK
USL000TOKW1
USW00014916
MOI0000GMMZ
JAI0000RJAW
USW00024284
TUI0000LTFM
ITI0000LIRG
HUMU0012772
KEM00063686
THM00048334
USL000OKXC1
IRM00040872
AYM00089011
TSM00060772
EGM00062435
USW00000415
USW00054743
KZM00036639
LAM00048953
USW00053874
TXI0000UTAV
FII0000EFHA
FRI0000LFOF
RQC00663657
USW00054742
NLM00006348
PLI0000EPWI
FGI0000SOCA
SWM00002019
AUM00011213
USW00000141
SWI0000ESPA
TUM00017238
USW00000327
BRI0000SBBR
ASN00048027
USW00022016
USW00004129
KZM00038112
USW00000326
USW00063844
USW00064773
CAW00027102
USW00000140
CAN01145297
USW00004128
CAN01037090
CHM00045039
JAI0000RJCK
RSM00034922
USI0000KEDN
BRM00086802
INM00042104
SPM00008072
USI0000KHFY
USW00004977
USL000OKSI2
EKI0000FGSL
SZM00006644
NOM00001387
USI0000KXNX
USW00004781
INI0000VOGB
BRM00081734
FMM00091329
USL000VCAF1
SPM00008215
BRM00086803
USI0000K3F3
FRM00007770
USW00025518
NOM00001386
SZM00006645
USW0