### Downloading the data

In [1]:
#Imports

import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

In [2]:
# Necessary metadata
article_id = 14096681  
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "../data/"

In [3]:
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)  
files = data["files"]             
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

In [4]:
# files_to_dl = ["data.zip"]
# for file in files:
#     if file["name"] in files_to_dl:
#         os.makedirs(output_directory, exist_ok=True)
#         urlretrieve(file["download_url"], output_directory + file["name"])

In [5]:
# with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
#     f.extractall(output_directory)

## Combine all CSVs



In [4]:
df = pd.read_csv("../data/MPI-ESM-1-2-HAM_daily_rainfall_NSW.csv")

In [5]:
df.head()

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13


In [6]:
df.shape

(966420, 6)

In [7]:
# define a skeleton for concatenating df 

combined_df = pd.DataFrame({"time": [], 
                              "lat_min": [], 
                              "lat_max": [], 
                              "lon_min": [], 
                              "lon_max": [], 
                              "rain (mm/day)": [], 
                              "model": []})





In [16]:
%%time


for filename in os.listdir(output_directory):

    if filename.endswith('.csv'):
        if filename == "observed_daily_rainfall_SYD.csv":
            continue
        else:

            model = filename.partition('_daily_rainfall')[0]
            df = pd.read_csv(output_directory + filename)
            df["model"] = model
            combined_df = pd.concat([combined_df, df], axis=0)


CPU times: user 2min 24s, sys: 2min 44s, total: 5min 8s
Wall time: 6min 58s


In [None]:

#sanity check: there should be 27 different models
combined_df['model'].value_counts()

combined_data.csv    58311370
MPI-ESM1-2-HR        10308480
CMCC-CM2-HR4          7082460
NorESM2-MM            7082460
TaiESM1               7082460
CMCC-ESM2             7082460
CMCC-CM2-SR5          7082460
SAM0-UNICON           7082306
GFDL-CM4              6438600
GFDL-ESM4             6438600
FGOALS-f3-L           6438600
MRI-ESM2-0            6074640
EC-Earth3-Veg-LR      6074640
BCC-CSM2-MR           6070680
MIROC6                4141800
ACCESS-CM2            3865680
ACCESS-ESM1-5         3221400
INM-CM5-0             3219300
INM-CM4-8             3219300
KIOST-ESM             2575440
FGOALS-g3             2575440
AWI-ESM-1-1-LR        1932840
MPI-ESM1-2-LR         1932840
NESM3                 1932840
MPI-ESM-1-2-HAM       1932840
NorESM2-LM            1839600
BCC-ESM1              1103760
CanESM5               1103760
Name: model, dtype: int64

In [None]:
%%time

combined_df.to_csv(output_directory + "combined_data.csv")