# SMEAR raw data loader

Loads data from SMEAR API and saves to local .csv files.

No other functionality (such as analysis, preprocessing, etc.) is included; the sole purpose is to load the data to local files once. This reduces the time spent on API queries and the load on the API. Note that file names are managed in the file ```file_config.py```.

Utilizes loader functions in module ```avaa_api.py``` by pkolari

In [None]:
#Imports

import datetime
import utils.avaa_api as avaa_api
import json
import sys
import pandas as pd
from dateutil.relativedelta import relativedelta
from file_config import FILE_PATHS
from datetime import datetime, timedelta

In [2]:
# Configure path

# Make ../src importable. The membership guard keeps sys.path free of
# duplicate entries when this cell is re-run in the same kernel.
# NOTE(review): the import cell above already imports utils.avaa_api and
# file_config; if those modules live under ../src, this cell presumably has
# to run before that one — confirm the intended execution order.
if "../src" not in sys.path:
    sys.path.append("../src")


In the cell below, specify the parameters to be downloaded, the aggregation method, and the data interval. Specify these for each station separately. Also specify the timeout (for API response problems).

In [None]:
# Configuration

# Station prefixes; the per-station settings in this notebook are named
# <prefix>_tablevars, <prefix>_interval and <prefix>_aggregation.
stations = ['var', 'hyy', 'sii', 'kum']

# Timeout handed to every avaa_api.getData call
# (presumably seconds — confirm against avaa_api).
timeout = 60

## Värriö SMEAR I station

# Aggregation method and data interval requested from the API
# (interval is presumably in minutes — confirm against avaa_api).
var_aggregation = 'ARITHMETIC'
var_interval = 30

# Table.variable identifiers to download for this station.
var_tablevars = [
    'VAR_META.TDRY0',
    'VAR_META.SO2_1',
    'VAR_EDDY.u_star',
    'VAR_META.PAR',
    'VAR_META.RH0',
    'VAR_META.WS00',
    'VAR_META.WDIR',
    'VAR_META.rainint',
]

## Hyytiälä SMEAR II station

# Aggregation method and data interval requested from the API
# (interval is presumably in minutes — confirm against avaa_api).
hyy_aggregation = 'ARITHMETIC'
hyy_interval = 30

# Table.variable identifiers to download for this station.
# Deliberately left out of the list:
#   HYY_EDDYTOW.u_star_radtow  - insufficient data
#   HYY_EDDYMAST.u_star_330    - no data
#   HYY_EDDY233.u_star_460     - no data
hyy_tablevars = [
    'HYY_EDDY233.u_star',
    'HYY_EDDYSUB.u_star_subm',
    'HYY_EDDYMAST.u_star_270',
    'HYY_META.SO2168',
    'HYY_META.RHTd',
    'HYY_META.RH33icos',
    'HYY_META.WS336',
    'HYY_META.WSU336',
    'HYY_META.WS42',    # marked "no data", but still requested
    'HYY_META.WSU42',   # not cup but sonic
    'HYY_META.WD',      # avg of 16.8-67.2
    'HYY_META.WDU336',
    'HYY_META.T336',
    'HYY_META.T42',     # partial data
    'HYY_META.PAR',     # OK
    'HYY_META.PAR2',    # duplicate instrument for PAR
    'HYY_META.maaPAR',  # OK
]

## Siikaneva SMEAR II station
## The parameters below are template examples only; replace them with the
## actual parameters to be downloaded.

# Aggregation method and data interval requested from the API
# (interval is presumably in minutes — confirm against avaa_api).
sii_aggregation = 'ARITHMETIC'
sii_interval = 30

# Table.variable identifiers to download for this station.
sii_tablevars = [
    'SII1_META.T_a',
    'SII1_META.P',
]



## Kumpula SMEAR III station
## The parameters below are template examples only; replace them with the
## actual parameters to be downloaded.

# Aggregation method and data interval requested from the API
# (interval is presumably in minutes — confirm against avaa_api).
kum_aggregation = 'ARITHMETIC'
kum_interval = 30

# Table.variable identifiers to download for this station.
kum_tablevars = [
    'KUM_META.t',
    'KUM_META.p',
]



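The cell below loads negative ion data from ```.txt``` files into dataframes and saves them to local csv files. Each source file is assumed to be comma-separated with a header row, and the timestamp column (the first column) must be named ```date```. The expected row format is:

timestamp (YYYY-MM-DD HH:MM:SS), concentration (float)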

In [4]:
## Load negative ion data


def _load_neg_ions(station):
    """Read one station's negative ion file, copy it to CSV and return its date span.

    The source file is read as comma-separated text with a header row; the
    first column is parsed as dates and must be named ``date``. The first and
    last rows are taken as the start and end of the data period, so the file
    is assumed to be in chronological order.

    Args:
        station: Station prefix used in the FILE_PATHS keys
            (``<station>_neg_ions_txt`` and ``<station>_neg_ions_csv``).

    Returns:
        Tuple ``(dataframe, start_date, end_date)`` where both dates are
        plain ``datetime.datetime`` objects.

    Raises:
        ValueError: If the source file contains no data rows.
    """
    raw_df = pd.read_csv(FILE_PATHS[f'{station}_neg_ions_txt'], sep=',', header=0, parse_dates=[0])
    if raw_df.empty:
        raise ValueError(f"Negative ion file for station '{station}' contains no data rows")
    # Convert to plain datetimes for every station, so all stations hand the
    # same type to the later download cells.
    first_date = raw_df['date'].iloc[0].to_pydatetime()
    last_date = raw_df['date'].iloc[-1].to_pydatetime()
    raw_df.to_csv(FILE_PATHS[f'{station}_neg_ions_csv'], index=False)
    return raw_df, first_date, last_date


kum_neg_ions_raw_df, kum_start_date, kum_end_date = _load_neg_ions('kum')
var_neg_ions_raw_df, var_start_date, var_end_date = _load_neg_ions('var')
hyy_neg_ions_raw_df, hyy_start_date, hyy_end_date = _load_neg_ions('hyy')
sii_neg_ions_raw_df, sii_start_date, sii_end_date = _load_neg_ions('sii')

print('Kumpula negative ions data from', kum_start_date, 'to', kum_end_date)
print('Värriö negative ions data from', var_start_date, 'to', var_end_date)
print('Hyytiälä negative ions data from', hyy_start_date, 'to', hyy_end_date)
print('Siikaneva negative ions data from', sii_start_date, 'to', sii_end_date)



Kumpula negative ions data from 2016-04-20 01:30:00 to 2024-11-11 23:30:00
Värriö negative ions data from 2019-02-09 00:00:00 to 2024-11-11 23:30:00
Hyytiälä negative ions data from 2010-02-02 00:00:00 to 2024-08-31 23:30:00
Siikaneva negative ions data from 2019-10-28 00:00:00 to 2024-12-17 23:30:00


The cell below loads metadata for all stations and all parameters specified to be downloaded and saves it to local ```.csv``` files.

In [5]:
# Load and record metadata


def _fetch_and_store_metadata(label, tablevars, path_key):
    """Download metadata for ``tablevars``, report completeness and save it as CSV.

    Args:
        label: Station name shown in the printed status line.
        tablevars: Table.variable identifiers whose metadata is requested.
        path_key: FILE_PATHS key of the CSV file the metadata is written to.

    Returns:
        The object returned by ``avaa_api.getMetadata`` (presumably a
        DataFrame with one row per variable — confirm against avaa_api).
    """
    metadata = avaa_api.getMetadata(tablevariables=tablevars, fmt='csv')
    # Completeness is judged by row count only: one metadata row is expected
    # per requested variable.
    complete = len(metadata) == len(tablevars)
    print(f"Metadata for all paramter for {label} loaded successfully: {complete}")
    metadata.to_csv(FILE_PATHS[path_key], index=False)
    return metadata


hyy_meta_data = _fetch_and_store_metadata('Hyytiälä', hyy_tablevars, 'hyy_metadata')
var_meta_data = _fetch_and_store_metadata('Värriö', var_tablevars, 'var_metadata')
sii_meta_data = _fetch_and_store_metadata('Siikaneva', sii_tablevars, 'sii_metadata')
kum_meta_data = _fetch_and_store_metadata('Kumpula', kum_tablevars, 'kum_metadata')




Metadata for all paramter for Hyytiälä loaded successfully: True
Metadata for all paramter for Värriö loaded successfully: True
Metadata for all paramter for Siikaneva loaded successfully: True
Metadata for all paramter for Kumpula loaded successfully: True


### Load Värriö data

In [6]:
# Load Värriö data and save dataframe to CSV
# Load 1 year at a time due to API limitations

print(f"var start date: {var_start_date}")
print(f"var end date: {var_end_date}")

# Calendar-year span; informational only, the download loop below does not
# depend on it.
var_years = var_end_date.year - var_start_date.year

partial_dataframes = []
start = var_start_date

# Walk forward in windows of at most one year until the end date is reached.
# Looping on the dates themselves (rather than on a year count) avoids a
# trailing request with start == end when the end month/day falls before the
# start month/day.
# NOTE(review): the recorded output shows the last returned timestamp one
# interval before the end date, so ldate appears to be exclusive — confirm
# in avaa_api.
while start < var_end_date:
    end = min(start + relativedelta(years=1), var_end_date)
    print(f"start: {start}, end: {end}")
    partial_dataframes.append(
        avaa_api.getData(
            fdate=start,
            ldate=end,
            tablevariables=var_tablevars,
            interval=var_interval,
            aggregation=var_aggregation,
            timeout=timeout,
        )
    )
    start = end

var_data = pd.concat(partial_dataframes)
# Give the combined frame one continuous index, as the other station cells do.
var_data.reset_index(drop=True, inplace=True)

print(var_data.describe())
var_data.to_csv(FILE_PATHS['var_raw'], index=False)

var start date: 2019-02-09 00:00:00
var end date: 2024-11-11 23:30:00
start: 2019-02-09 00:00:00, end: 2020-02-09 00:00:00
start: 2020-02-09 00:00:00, end: 2021-02-09 00:00:00
start: 2021-02-09 00:00:00, end: 2022-02-09 00:00:00
start: 2022-02-09 00:00:00, end: 2023-02-09 00:00:00
start: 2023-02-09 00:00:00, end: 2024-02-09 00:00:00
start: 2024-02-09 00:00:00, end: 2024-11-11 23:30:00
                  Datetime  VAR_META.TDRY0  VAR_META.SO2_1  VAR_EDDY.u_star  \
count               100943   100564.000000     92171.00000     97088.000000   
mean   2021-12-26 11:30:00        0.748549         0.28997         0.595818   
min    2019-02-09 00:00:00      -39.130200        -0.24400         0.009200   
25%    2020-07-18 17:45:00       -6.743037        -0.00467         0.383590   
50%    2021-12-26 11:30:00       -0.352400         0.02333         0.580900   
75%    2023-06-05 05:15:00        8.529760         0.11600         0.787312   
max    2024-11-11 23:00:00       30.561840        39.91333 

### Load Hyytiälä data

In [None]:
# Load Hyytiälä data and save dataframe to CSV
# Load 1 year at a time due to API limitations

print(f"hyy start date: {hyy_start_date}")
print(f"hyy end date: {hyy_end_date}")

# Calendar-year span; printed for information only, the download loop below
# does not depend on it.
hyy_years = hyy_end_date.year - hyy_start_date.year
print(f"years: {hyy_years}")

partial_dataframes = []
start = hyy_start_date
i = 0

# Walk forward in windows of at most one year until the end date is reached.
# Looping on the dates themselves (rather than on a year count) avoids a
# trailing request with start == end when the end month/day falls before the
# start month/day.
while start < hyy_end_date:
    end = min(start + relativedelta(years=1), hyy_end_date)
    print(f"round {i}, fetching data for {start} - {end}")
    partial_data = avaa_api.getData(
        fdate=start,
        ldate=end,
        tablevariables=hyy_tablevars,
        interval=hyy_interval,
        aggregation=hyy_aggregation,
        timeout=timeout,
    )
    partial_dataframes.append(partial_data)
    start = end
    i += 1

hyy_data = pd.concat(partial_dataframes)
# Replace the per-chunk indices with one continuous index.
hyy_data.reset_index(drop=True, inplace=True)

print(hyy_data.describe())
hyy_data.to_csv(FILE_PATHS['hyy_raw'], index=False)


hyy start date: 2010-02-02 00:00:00
hyy end date: 2024-08-31 23:30:00
years: 14
round 0, fetching data for 2010-02-02 00:00:00 - 2011-02-02 00:00:00
These columns will be missing:
   HYY_EDDYMAST.u_star_270
round 1, fetching data for 2011-02-02 00:00:00 - 2012-02-02 00:00:00
These columns will be missing:
   HYY_EDDYMAST.u_star_270
round 2, fetching data for 2012-02-02 00:00:00 - 2013-02-02 00:00:00
These columns will be missing:
   HYY_EDDYMAST.u_star_270
round 3, fetching data for 2013-02-02 00:00:00 - 2014-02-02 00:00:00
These columns will be missing:
   HYY_EDDYMAST.u_star_270
round 4, fetching data for 2014-02-02 00:00:00 - 2015-02-02 00:00:00
These columns will be missing:
   HYY_EDDYMAST.u_star_270
round 5, fetching data for 2015-02-02 00:00:00 - 2016-02-02 00:00:00
These columns will be missing:
   HYY_EDDYMAST.u_star_270
round 6, fetching data for 2016-02-02 00:00:00 - 2017-02-02 00:00:00
These columns will be missing:
   HYY_EDDYMAST.u_star_270
round 7, fetching data for 2017

### Load Siikaneva data

In [None]:
# Load Siikaneva data and save dataframe to CSV
# Load 1 year at a time due to API limitations

print(f"sii start date: {sii_start_date}")
print(f"sii end date: {sii_end_date}")

# Calendar-year span; printed for information only, the download loop below
# does not depend on it.
sii_years = sii_end_date.year - sii_start_date.year
print(f"years: {sii_years}")

partial_dataframes = []
start = sii_start_date
i = 0

# Walk forward in windows of at most one year until the end date is reached.
# Looping on the dates themselves (rather than on a year count) avoids a
# trailing request with start == end when the end month/day falls before the
# start month/day.
# NOTE(review): the recorded output shows the last returned timestamp one
# interval before the end date, so ldate appears to be exclusive — confirm
# in avaa_api.
while start < sii_end_date:
    end = min(start + relativedelta(years=1), sii_end_date)
    print(f"round {i}, fetching data for {start} - {end}")
    partial_data = avaa_api.getData(
        fdate=start,
        ldate=end,
        tablevariables=sii_tablevars,
        interval=sii_interval,
        aggregation=sii_aggregation,
        timeout=timeout,
    )
    partial_dataframes.append(partial_data)
    start = end
    i += 1

sii_data = pd.concat(partial_dataframes)
# Replace the per-chunk indices with one continuous index.
sii_data.reset_index(drop=True, inplace=True)

print(sii_data.describe())
sii_data.to_csv(FILE_PATHS['sii_raw'], index=False)

sii start date: 2019-10-28 00:00:00
sii end date: 2024-12-17 23:30:00
years: 5
round 0, fetching data for 2019-10-28 00:00:00 - 2020-10-28 00:00:00
round 1, fetching data for 2020-10-28 00:00:00 - 2021-10-28 00:00:00
round 2, fetching data for 2021-10-28 00:00:00 - 2022-10-28 00:00:00
round 3, fetching data for 2022-10-28 00:00:00 - 2023-10-28 00:00:00
round 4, fetching data for 2023-10-28 00:00:00 - 2024-10-28 00:00:00
round 5, fetching data for 2024-10-28 00:00:00 - 2024-12-17 23:30:00
                  Datetime  SII1_META.T_a   SII1_META.P
count                90143   89768.000000  88063.000000
mean   2022-05-23 23:30:00       4.718504    991.064356
min    2019-10-28 00:00:00     -32.328500    937.638620
25%    2021-02-08 11:45:00      -1.660735    983.810305
50%    2022-05-23 23:30:00       3.668585    992.309120
75%    2023-09-05 11:15:00      12.159568    999.360275
max    2024-12-17 23:00:00      31.713000   1024.253800
std                    NaN       9.806667     11.947440


### Load Kumpula data

In [9]:
# Load Kumpula data and save dataframe to CSV
# Load 1 year at a time due to API limitations

print(f"kum start date: {kum_start_date}")
print(f"kum end date: {kum_end_date}")

# Calendar-year span; printed for information only, the download loop below
# does not depend on it.
kum_years = kum_end_date.year - kum_start_date.year
print(f"years: {kum_years}")

partial_dataframes = []
start = kum_start_date
i = 0

# Walk forward in windows of at most one year until the end date is reached.
# Looping on the dates themselves (rather than on a year count) avoids a
# trailing request with start == end when the end month/day falls before the
# start month/day.
while start < kum_end_date:
    end = min(start + relativedelta(years=1), kum_end_date)
    print(f"round {i}, fetching data for {start} - {end}")
    partial_data = avaa_api.getData(
        fdate=start,
        ldate=end,
        tablevariables=kum_tablevars,
        interval=kum_interval,
        aggregation=kum_aggregation,
        timeout=timeout,
    )
    partial_dataframes.append(partial_data)
    start = end
    i += 1

kum_data = pd.concat(partial_dataframes)
# Replace the per-chunk indices with one continuous index.
kum_data.reset_index(drop=True, inplace=True)

print(kum_data.describe())
kum_data.to_csv(FILE_PATHS['kum_raw'], index=False)

kum start date: 2016-04-20 01:30:00
kum end date: 2024-11-11 23:30:00
years: 8
round 0, fetching data for 2016-04-20 01:30:00 - 2017-04-20 01:30:00
round 1, fetching data for 2017-04-20 01:30:00 - 2018-04-20 01:30:00
round 2, fetching data for 2018-04-20 01:30:00 - 2019-04-20 01:30:00
round 3, fetching data for 2019-04-20 01:30:00 - 2020-04-20 01:30:00
round 4, fetching data for 2020-04-20 01:30:00 - 2021-04-20 01:30:00
round 5, fetching data for 2021-04-20 01:30:00 - 2022-04-20 01:30:00
round 6, fetching data for 2022-04-20 01:30:00 - 2023-04-20 01:30:00
round 7, fetching data for 2023-04-20 01:30:00 - 2024-04-20 01:30:00
round 8, fetching data for 2024-04-20 01:30:00 - 2024-11-11 23:30:00
                            Datetime     KUM_META.t     KUM_META.p
count                         150140  149966.000000  149966.000000
mean   2020-08-01 00:14:59.999999488       7.486667    1006.167295
min              2016-04-20 01:30:00     -21.593330     955.530000
25%              2018-06-11 00:5