# Download GHCNd Data

This downloads the Global Historical Climatology Network daily (GHCNd). This downloads the required data for the 1.2 Surface Temperature indicators and 1.3 Rainfall indicators

### Description from NOAA:

The Global Historical Climatology Network daily (GHCNd) is an integrated database of daily climate summaries from land surface stations across the globe. GHCNd is made up of daily climate records from numerous sources that have been integrated and subjected to a common suite of quality assurance reviews.

GHCNd contains records from more than 100,000 stations in 180 countries and territories. NCEI provides numerous daily variables, including maximum and minimum temperature, total daily precipitation, snowfall, and snow depth. About half the stations only report precipitation. Both record length and period of record vary by station and cover intervals ranging from less than a year to more than 175 years.


Further information can be found at https://www.ncei.noaa.gov/products/land-based-station/global-historical-climatology-network-daily

In [1]:
import os
import urllib 
import pandas as pd

In [36]:
import requests
from bs4 import BeautifulSoup
import re

In [34]:
GHCND_dir = "https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/"

In [35]:
download_folder = './GHCNd/'
if not os.path.exists(download_folder):
    os.makedirs(download_folder)


In [37]:
r = requests.get(GHCND_dir, headers={'User-Agent': 'Chrome'})

In [38]:
html = r.text

In [40]:
cleantext = BeautifulSoup(html)

In [42]:
recent_date = cleantext.find_all(["a"])

In [44]:
GHCND_files = []

for x in BeautifulSoup(r.text).find_all(['a'])[5:]:
    GHCND_files.append(re.findall(r'"([^"]*)"', str(x))[0])

In [46]:
for palau_file in [x for x in GHCND_files if x[:2] == "PS"]:
    print(palau_file)
    urllib.request.urlretrieve(GHCND_dir + palau_file, download_folder + palau_file)

PSC00914015.csv
PSC00914030.csv
PSC00914465.csv
PSC00914478.csv
PSC00914519.csv
PSC00914580.csv
PSC00914712.csv
PSC00914840.csv
PSC00914913.csv
PSW00040305.csv
PSW00040307.csv
PSW00040309.csv


In [58]:
palau_dict = {}
for x in os.listdir(download_folder):
    print(pd.read_csv(download_folder+x)['NAME'].unique()[0], len(pd.read_csv(download_folder+x)['NAME'].unique()))
    palau_dict[x] = pd.read_csv(download_folder+x)['NAME'].unique()[0]
    

  print(pd.read_csv(download_folder+x)['NAME'].unique()[0], len(pd.read_csv(download_folder+x)['NAME'].unique()))
  print(pd.read_csv(download_folder+x)['NAME'].unique()[0], len(pd.read_csv(download_folder+x)['NAME'].unique()))


KOROR, PW PS 1
MALAKAL RES, PW PS 1
TOBI, UM US 1
PELELIU, PW PS 1
AIMELIK BABELTHUAP, PW PS 1


  palau_dict[x] = pd.read_csv(download_folder+x)['NAME'].unique()[0]


ANGUAR, PW PS 1
NGASANG BABELTHUAP, PW PS 1
WEATHER SERVICE OFFICE PALAU, PW PS 1
MARICULTURE CENTER, PW PS 1
NEKKEN FORESTRY, PW PS 1
PELELIU ISLAND PALAU ISLANDS, PW PS 1
KOROR ISLAND NF, PW PS 1


In [57]:
palau_dict

{'PSW00040309.csv': 'KOROR, PW PS',
 'PSC00914465.csv': 'MALAKAL RES, PW PS',
 'PSC00914840.csv': 'TOBI, UM US',
 'PSC00914712.csv': 'PELELIU, PW PS',
 'PSC00914015.csv': 'AIMELIK BABELTHUAP, PW PS',
 'PSC00914030.csv': 'ANGUAR, PW PS',
 'PSC00914580.csv': 'NGASANG BABELTHUAP, PW PS',
 'PSC00914913.csv': 'WEATHER SERVICE OFFICE PALAU, PW PS',
 'PSC00914478.csv': 'MARICULTURE CENTER, PW PS',
 'PSC00914519.csv': 'NEKKEN FORESTRY, PW PS',
 'PSW00040305.csv': 'PELELIU ISLAND PALAU ISLANDS, PW PS',
 'PSW00040307.csv': 'KOROR ISLAND NF, PW PS'}

In [2]:
var_list = ['TMAX','TMIN',"TOBS",
 "DAPR","MDPR","PRCP","SNOW","SNWD",
 "ACMH","ACSH","PSUN","TSUN","WESD",
 "WT01","WT03","WT05","WT07","WT08","WT11","WT14","WT16","WT18","WT20",
"AWND","FMTM","PGTM","WDF1","WDF2","WDF5","WDFG","WDFM","WSF1","WSF2"]

In [3]:
airtemp_list = ['TMAX','TMIN',"TOBS"]
precip_list = ["DAPR","MDPR","PRCP","SNOW","SNWD"]
sky_list = ["ACMH","ACSH"]
sun_list = ["PSUN","TSUN"]
water_list = ["WESD"]
weathertype_list = ["WT01","WT03","WT05","WT07","WT08","WT11","WT14","WT16","WT18","WT20"]
wind_list = ["AWND","FMTM","PGTM","WDF1","WDF2","WDF5","WDFG","WDFM","WSF1","WSF2"]
type_list = [airtemp_list,precip_list,sky_list,sun_list,water_list,weathertype_list,wind_list]
var_names = ["AIRTEMP","PRECIP","SKY","SUN","WATER","WEATHERTYPE","WIND"]

In [69]:
for x in os.listdir(download_folder):
    curr_df = pd.read_csv(download_folder+x)
    palau_dict[x] = [palau_dict[x]] + [x for x in var_list if x in curr_df.columns]

  curr_df = pd.read_csv(download_folder+x)


In [180]:
for x in os.listdir(download_folder):
    curr_df = pd.read_csv(download_folder+x)
    df_cleaned = curr_df.dropna(axis=1, how='all')
#     print(x)
#     print(len(df_cleaned.columns) == len(curr_df.columns))


  curr_df = pd.read_csv(download_folder+x)


In [181]:
base_variables = ["NAME", "STATION" ,"DATE","LATITUDE", "LONGITUDE", "ELEVATION" ]

In [221]:
pkl_folder = './GHCND/pkl/'
if not os.path.exists(pkl_folder):
    os.makedirs(pkl_folder)

In [207]:
#prints station-variable combinations with comprehensive variables for each category
for l in range(len(type_list)):
    df_list = []
    for x in os.listdir(download_folder):
        curr_df = pd.read_csv(download_folder+x,low_memory=False)
        if all(elem in curr_df.columns for elem in type_list[l]):
            station_name = curr_df[base_variables+type_list[l]]['NAME'][0][:curr_df[base_variables+type_list[l]]['NAME'][0].find(',')].replace(' ','_')
            station_code = curr_df[base_variables+type_list[l]]['STATION'][0]
            df_list.append(curr_df[base_variables+type_list[l]])
            print(var_names[l] + "_" + station_name + "_" + station_code )
            

AIRTEMP_KOROR_PSW00040309
AIRTEMP_AIMELIK_BABELTHUAP_PSC00914015
AIRTEMP_ANGUAR_PSC00914030
AIRTEMP_NGASANG_BABELTHUAP_PSC00914580
AIRTEMP_NEKKEN_FORESTRY_PSC00914519
PRECIP_PELELIU_PSC00914712
PRECIP_AIMELIK_BABELTHUAP_PSC00914015
PRECIP_ANGUAR_PSC00914030
PRECIP_NGASANG_BABELTHUAP_PSC00914580
PRECIP_MARICULTURE_CENTER_PSC00914478
PRECIP_NEKKEN_FORESTRY_PSC00914519
SKY_KOROR_PSW00040309
SUN_KOROR_PSW00040309
WATER_KOROR_PSW00040309
WIND_KOROR_PSW00040309


In [243]:
#prints station-variable combinations with any amount of variable(s) for each category

for l in range(len(type_list)):
    df_list = []
    for x in os.listdir(download_folder):
        if x[-3:]!= "csv":
            continue
        curr_df = pd.read_csv(download_folder+x,low_memory=False)
        if any(elem in curr_df.columns for elem in type_list[l]):
            filled_var = [x for x in curr_df.columns if x in type_list[l] or x in base_variables]
            station_name = curr_df[filled_var]['NAME'][0][:curr_df[filled_var]['NAME'][0].find(',')].replace(' ','_')
            station_code = curr_df[filled_var]['STATION'][0]
            df_list.append(curr_df[filled_var].dropna(thresh=7).reset_index(drop=True))
            file_name = var_names[l] + "_" + station_name + "_" + station_code+ ".pkl"
            print(file_name)
            curr_df[filled_var].dropna(thresh=7).reset_index(drop=True).to_pickle(pkl_folder+file_name)

    var_file_name = var_names[l] + "_PALAU.pkl"
    
    pd.concat(df_list).reset_index(drop=True).to_pickle(pkl_folder+var_file_name)
    print(var_file_name)

AIRTEMP_KOROR_PSW00040309.pkl
AIRTEMP_PELELIU_PSC00914712.pkl
AIRTEMP_AIMELIK_BABELTHUAP_PSC00914015.pkl
AIRTEMP_ANGUAR_PSC00914030.pkl
AIRTEMP_NGASANG_BABELTHUAP_PSC00914580.pkl
AIRTEMP_WEATHER_SERVICE_OFFICE_PALAU_PSC00914913.pkl
AIRTEMP_MARICULTURE_CENTER_PSC00914478.pkl
AIRTEMP_NEKKEN_FORESTRY_PSC00914519.pkl
AIRTEMP_PELELIU_ISLAND_PALAU_ISLANDS_PSW00040305.pkl
AIRTEMP_KOROR_ISLAND_NF_PSW00040307.pkl
AIRTEMP_PALAU.pkl
PRECIP_KOROR_PSW00040309.pkl
PRECIP_MALAKAL_RES_PSC00914465.pkl
PRECIP_TOBI_PSC00914840.pkl
PRECIP_PELELIU_PSC00914712.pkl
PRECIP_AIMELIK_BABELTHUAP_PSC00914015.pkl
PRECIP_ANGUAR_PSC00914030.pkl
PRECIP_NGASANG_BABELTHUAP_PSC00914580.pkl
PRECIP_WEATHER_SERVICE_OFFICE_PALAU_PSC00914913.pkl
PRECIP_MARICULTURE_CENTER_PSC00914478.pkl
PRECIP_NEKKEN_FORESTRY_PSC00914519.pkl
PRECIP_PELELIU_ISLAND_PALAU_ISLANDS_PSW00040305.pkl
PRECIP_KOROR_ISLAND_NF_PSW00040307.pkl
PRECIP_PALAU.pkl
SKY_KOROR_PSW00040309.pkl
SKY_PALAU.pkl
SUN_KOROR_PSW00040309.pkl
SUN_PALAU.pkl
WATER_KOROR_PSW0