### Downloading the data

In [1]:
# Imports

import os
import zipfile
import requests
import json
import pandas as pd
import dask.dataframe as dd
from urllib.request import urlretrieve

In [2]:
%load_ext memory_profiler

In [3]:
# Necessary constants

article_id = 14096681  
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "../data/"
combined_data_path = f"{output_directory}/combined_data.csv"

In [3]:
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)  
files = data["files"]             
files

[{'id': 26579150,
  'name': 'daily_rainfall_2014.png',
  'size': 58863,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e'},
 {'id': 26579171,
  'name': 'environment.yml',
  'size': 192,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34'},
 {'id': 26586554,
  'name': 'README.md',
  'size': 5422,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c'},
 {'id': 26766812,
  'name': 'data.zip',
  'size': 814041183,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26766812',
  'supplied_md5': 'b517383f76e77bd03755a63a8f

In [4]:
files_to_dl = ["data.zip"]
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

In [5]:
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

### Peak at one of the csv files



In [4]:
df = pd.read_csv("../data/MPI-ESM-1-2-HAM_daily_rainfall_NSW.csv")

In [5]:
df.head()

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13


In [6]:
df.shape

(966420, 6)

In [6]:
# Define a skeleton for concatenating data frames

combined_df = pd.DataFrame({
    "time": [], 
    "lat_min": [], 
    "lat_max": [], 
    "lon_min": [], 
    "lon_max": [], 
    "rain (mm/day)": [], 
    "model": []
})





### Concatenate all CSV files

In [7]:
%%time


for filename in os.listdir(output_directory):

    if filename.endswith('.csv'):
        if filename == "observed_daily_rainfall_SYD.csv":
            continue
        else:
            model = filename.partition('_daily_rainfall')[0]
            df = pd.read_csv(output_directory + filename)
            df["model"] = model
            combined_df = pd.concat([combined_df, df], axis=0)


CPU times: user 53.7 s, sys: 48 s, total: 1min 41s
Wall time: 2min 49s


### Check if all csv files are incorporated as model column

In [8]:
# Sanity check - there should be 27 different models

combined_df['model'].value_counts()

MPI-ESM1-2-HR       5154240
CMCC-CM2-HR4        3541230
CMCC-ESM2           3541230
CMCC-CM2-SR5        3541230
NorESM2-MM          3541230
TaiESM1             3541230
SAM0-UNICON         3541153
GFDL-ESM4           3219300
FGOALS-f3-L         3219300
GFDL-CM4            3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM4-8           1609650
INM-CM5-0           1609650
FGOALS-g3           1287720
KIOST-ESM           1287720
AWI-ESM-1-1-LR       966420
MPI-ESM1-2-LR        966420
NESM3                966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64

### Store the combined df to a csv file

In [9]:
%%time

combined_df.to_csv(combined_data_path)

CPU times: user 6min 13s, sys: 6.95 s, total: 6min 20s
Wall time: 6min 32s


In [10]:
%%sh

du -sh ../data/combined_data.csv

6.0G	../data/combined_data.csv


**Note**: Time taken is calculated by combining the wall times of data concatenation and storing the data frame as a CSV file.

| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Cici        |                  |     |           |        |            |
| Gautham     |Mac OS X (Big Sur)|8 GB |   M1      |   Yes  |   9min 21s |
| Navya       |                  |     |           |        |            |
| Nobby       |                  |     |           |        |            |

### 3. Load the combined CSV to memory and perform a simple EDA

#### Loading the entire data frame (Reference)

In [17]:
%%time
%%memit

rainfall_df = pd.read_csv(combined_data_path)
print(rainfall_df["model"].value_counts())

MPI-ESM1-2-HR       5154240
CMCC-CM2-HR4        3541230
CMCC-ESM2           3541230
CMCC-CM2-SR5        3541230
NorESM2-MM          3541230
TaiESM1             3541230
SAM0-UNICON         3541153
GFDL-ESM4           3219300
FGOALS-f3-L         3219300
GFDL-CM4            3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM4-8           1609650
INM-CM5-0           1609650
FGOALS-g3           1287720
KIOST-ESM           1287720
AWI-ESM-1-1-LR       966420
MPI-ESM1-2-LR        966420
NESM3                966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
peak memory: 5864.23 MiB, increment: 5351.78 MiB
CPU times: user 51.1 s, sys: 23.8 s, total: 1min 14s
Wall time: 2min 52s


| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Cici        |                  |     |           |        |            |
| Gautham     |Mac OS X (Big Sur)|8 GB |   M1      |   Yes  | 2min 52s   |
| Navya       |                  |     |           |        |            |
| Nobby       |                  |     |           |        |            |

#### Approach 1: Changing `dtype` of the data

In [18]:
%%time
%%memit

print(f"Memory usage with float64: {rainfall_df.memory_usage().sum() / 1e6:.2f} MB")
rainfall_df = rainfall_df.astype(
    dtype={
        "lat_min": "float32",
        "lat_max": "float32",
        "lon_min": "float32",
        "lon_max": "float32",
        "rain (mm/day)": "float32",
        "time": "datetime64[s]"
    },
    errors='ignore')
print(f"Memory usage with float32: {rainfall_df.memory_usage().sum() / 1e6:.2f} MB")
print(rainfall_df["model"].value_counts())

Memory usage with float64: 3997.94 MB
Memory usage with float32: 2748.59 MB
MPI-ESM1-2-HR       5154240
CMCC-CM2-HR4        3541230
CMCC-ESM2           3541230
CMCC-CM2-SR5        3541230
NorESM2-MM          3541230
TaiESM1             3541230
SAM0-UNICON         3541153
GFDL-ESM4           3219300
FGOALS-f3-L         3219300
GFDL-CM4            3219300
MRI-ESM2-0          3037320
EC-Earth3-Veg-LR    3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM4-8           1609650
INM-CM5-0           1609650
FGOALS-g3           1287720
KIOST-ESM           1287720
AWI-ESM-1-1-LR       966420
MPI-ESM1-2-LR        966420
NESM3                966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
peak memory: 4812.47 MiB, increment: 3947.69 MiB
CPU times: user 15.8 s, sys: 5.98 s, total: 21.7 s
Wall time: 45.3 s


| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Cici        |                  |     |           |        |            |
| Gautham     |Mac OS X (Big Sur)|8 GB |   M1      |   Yes  |   45.3s    |
| Navya       |                  |     |           |        |            |
| Nobby       |                  |     |           |        |            |

#### Approach 2: Load data in chunks

In [15]:
%%time
%%memit

counts = pd.Series(dtype=int)
for chunk in pd.read_csv(combined_data_path, chunksize=10_000_000):
    counts = counts.add(chunk["model"].value_counts(), fill_value=0)

print(counts.astype(int))

ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
AWI-ESM-1-1-LR       966420
BCC-CSM2-MR         3035340
BCC-ESM1             551880
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
CanESM5              551880
EC-Earth3-Veg-LR    3037320
FGOALS-f3-L         3219300
FGOALS-g3           1287720
GFDL-CM4            3219300
GFDL-ESM4           3219300
INM-CM4-8           1609650
INM-CM5-0           1609650
KIOST-ESM           1287720
MIROC6              2070900
MPI-ESM-1-2-HAM      966420
MPI-ESM1-2-HR       5154240
MPI-ESM1-2-LR        966420
MRI-ESM2-0          3037320
NESM3                966420
NorESM2-LM           919800
NorESM2-MM          3541230
SAM0-UNICON         3541153
TaiESM1             3541230
dtype: int64
peak memory: 2356.17 MiB, increment: 698.39 MiB
CPU times: user 49.2 s, sys: 4.12 s, total: 53.3 s
Wall time: 59 s


| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Cici        |                  |     |           |        |            |
| Gautham     |Mac OS X (Big Sur)|8 GB |   M1      |   Yes  |   59s      |
| Navya       |                  |     |           |        |            |
| Nobby       |                  |     |           |        |            |

#### Approach 3: Dask

In [16]:
%%time
%%memit

rainfall_ddf = dd.read_csv(combined_data_path)
print(rainfall_ddf["model"].value_counts().compute())

MPI-ESM1-2-HR       5154240
TaiESM1             3541230
NorESM2-MM          3541230
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
SAM0-UNICON         3541153
FGOALS-f3-L         3219300
GFDL-CM4            3219300
GFDL-ESM4           3219300
EC-Earth3-Veg-LR    3037320
MRI-ESM2-0          3037320
BCC-CSM2-MR         3035340
MIROC6              2070900
ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
INM-CM5-0           1609650
INM-CM4-8           1609650
KIOST-ESM           1287720
FGOALS-g3           1287720
MPI-ESM1-2-LR        966420
NESM3                966420
AWI-ESM-1-1-LR       966420
MPI-ESM-1-2-HAM      966420
NorESM2-LM           919800
BCC-ESM1             551880
CanESM5              551880
Name: model, dtype: int64
peak memory: 4687.06 MiB, increment: 2399.27 MiB
CPU times: user 33.2 s, sys: 11.1 s, total: 44.3 s
Wall time: 23.3 s


| Team Member | Operating System | RAM | Processor | Is SSD | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|
| Cici        |                  |     |           |        |            |
| Gautham     |Mac OS X (Big Sur)|8 GB |   M1      |   Yes  |   23.3s    |
| Navya       |                  |     |           |        |            |
| Nobby       |                  |     |           |        |            |