# FETCHING, PREPROCESSING, AND MODELING DATA

## Table of Contents:
* [Selecting Networks, Stations, and Time Range](#select)
* [Fetching Data](#fetch)
  * [Fetching Network and Station Info](#fetchinfo)
  * [Fetching Station Data](#fetchdata)
* [Preprocessing Data](#preprocess)
  * [Cleaning Fetched Station Data](#cleandata)
  * [Scaling Clean Station Data](#scaledata)
  * [Splitting Clean Station Data](#splitdata)
  * [Combining Preprocessing Steps](#combinesteps)
* [Modeling Data](#model)
  * [Clustering Split Clean Station Data](#clustersplitdata)

In [1]:
# imports
import os
import sys

# add the parent directory of the working directory to the import path
# (presumably where the local modules imported below live)
# os.path.join + os.pardir keep this portable: a literal "\\.." suffix is only
# resolved as "parent directory" on Windows
module_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
import pandas as pd
pd.set_option("display.max_rows", 10)

import fetching
import preprocessing
import modeling
import utils

<a id="select"></a>
## Selecting Networks, Stations, and Time Range

Note: The visual examples in this notebook are intended for networks GR_\_ASOS and CY_\_ASOS, station LGAV, and the time range 1/1/2018 - 1/1/2022.

In [2]:
# read the selected networks from the config file
# (the displayed result is a list of network codes)
network_list = utils.read_config_options(networks_config=True)

In [3]:
network_list

['GR__ASOS', 'CY__ASOS']

In [4]:
# read the selected stations from the config file
# (the displayed result is a list of station ids)
station_list = utils.read_config_options(stations_config=True)

In [5]:
station_list

['LGAV']

In [6]:
# read the selected time range for the data from the config file
# the helper returns a (start, end) pair; both are displayed as pandas Timestamps
start_datetime, end_datetime = utils.read_config_options(datetimes_config=True)

In [7]:
start_datetime, end_datetime

(Timestamp('2018-01-01 00:00:00'), Timestamp('2022-01-01 00:00:00'))

<a id="fetch"></a>
## Fetching Data

<a id="fetchinfo"></a>
### Fetching Network and Station Info

In [8]:
# fetch network info of all networks
# the return value is not used; the info is read back with utils.read_network_info() in the next cell
fetching.fetch_and_output_network_info()

In [9]:
# load the info of every available network
# (displayed columns: network code and network name)
network_info = utils.read_network_info()

In [10]:
network_info

Unnamed: 0,network,network_name
0,AF__ASOS,Afghanistan ASOS
1,AL_ASOS,Alabama ASOS
2,AK_ASOS,Alaska ASOS
3,AL__ASOS,Albania ASOS
4,CA_AB_ASOS,Alberta CA ASOS
...,...,...
258,WY_ASOS,Wyoming ASOS
259,YE__ASOS,Yemen ASOS
260,CA_YT_ASOS,Yukon Canada ASOS
261,ZM__ASOS,Zambia ASOS


In [11]:
# fetch station info of the selected networks only (network_list from the config file)
# the return value is not used; the info is read back with utils.read_station_info() in the next cell
fetching.fetch_and_output_station_info(networks=network_list)

In [12]:
# load all available station info
# (the displayed rows belong to the fetched networks GR__ASOS and CY__ASOS)
station_info = utils.read_station_info()

In [13]:
station_info

Unnamed: 0,elevation,sname,state,country,tzname,county,sid,network,start,end,lat,lon
0,3.937641,Alexandroupoli,,GR,Europe/Athens,,LGAL,GR__ASOS,1932-01-01,NaT,40.8562,25.9450
1,12.000000,Andravida,,GR,Europe/Athens,,LGAD,GR__ASOS,1965-01-01,NaT,37.9207,21.2926
2,94.000000,Athens,,GR,Europe/Athens,,LGAV,GR__ASOS,2004-01-01,NaT,37.9364,23.9445
3,135.000000,Chania,,GR,Europe/Athens,,LGSA,GR__ASOS,1932-01-01,NaT,35.4833,24.1167
4,4.000000,Chios,,GR,Europe/Athens,,LGHI,GR__ASOS,1991-01-01,NaT,38.3432,26.1406
...,...,...,...,...,...,...,...,...,...,...,...,...
36,20.000000,Geçitkale,,CY,Asia/Nicosia,,LCGK,CY__ASOS,1986-01-01,2012-01-01,35.2526,33.7358
37,2.000000,Larnaca,,CY,Asia/Nicosia,,LCLK,CY__ASOS,1976-01-01,NaT,34.8733,33.6172
38,91.000000,Nicosia,,CY,Asia/Nicosia,,LCEN,CY__ASOS,2011-01-01,NaT,35.1489,33.4997
39,161.000000,NICOSIA/ATHALASSA,,CY,Asia/Nicosia,,LCNC,CY__ASOS,1990-01-01,2020-01-01,35.1408,33.3964


<a id="fetchdata"></a>
### Fetching Station Data

In [14]:
# fetch station data for the selected time range and stations
# time range and stations both come from the config selections above
fetching.fetch_and_output_station_data(start_datetime=start_datetime, end_datetime=end_datetime,
                                       stations=station_list)

In [15]:
# load the fetched station data for station LGAV in network GR__ASOS for the selected time range
# use the timestamp column ("valid") as index
station_data = utils.read_station_data(station="LGAV", start_datetime=start_datetime, end_datetime=end_datetime,
                                       network="GR__ASOS",
                                       category="fetched",
                                       set_index_column="valid")

In [16]:
station_data

Unnamed: 0_level_0,station,tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,vsby,...,skyl3,skyl4,wxcodes,ice_accretion_1hr,ice_accretion_3hr,ice_accretion_6hr,peak_wind_gust,peak_wind_drct,peak_wind_time,feel
valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01 00:20:00,LGAV,37.4,30.2,74.98,350.0,1.0,0.0,30.15,,6.21,...,,,,,,,,,,37.40
2018-01-01 00:50:00,LGAV,35.6,30.2,80.51,0.0,0.0,0.0,30.15,,6.21,...,,,,,,,,,,35.60
2018-01-01 01:20:00,LGAV,33.8,30.2,86.49,290.0,2.0,0.0,30.15,,6.21,...,,,,,,,,,,33.80
2018-01-01 01:50:00,LGAV,35.6,32.0,86.59,290.0,5.0,0.0,30.12,,6.21,...,,,,,,,,,,30.67
2018-01-01 02:20:00,LGAV,32.0,28.4,86.39,320.0,4.0,0.0,30.12,,6.21,...,,,,,,,,,,27.42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-31 21:50:00,LGAV,55.4,37.4,50.64,30.0,7.0,0.0,30.15,,6.21,...,,,,,,,,,,55.40
2021-12-31 22:20:00,LGAV,55.4,37.4,50.64,,3.0,0.0,30.15,,6.21,...,,,,,,,,,,55.40
2021-12-31 22:50:00,LGAV,53.6,37.4,54.07,60.0,7.0,0.0,30.15,,6.21,...,,,,,,,,,,53.60
2021-12-31 23:20:00,LGAV,50.0,39.2,66.27,40.0,6.0,0.0,30.15,,6.21,...,,,,,,,,,,50.00


In [17]:
# fetch station data of all stations (no stations argument is passed) for a period of up to 24 hours
# reports are limited to routine and specials, timestamp timezone is UTC, trace reports are left in data
# the time range is given explicitly here instead of using the config selection
fetching.fetch_and_output_station_data(start_datetime=pd.to_datetime("2022-01-01"),
                                       end_datetime=pd.to_datetime("2022-01-02"),
                                       report="combined", timezone="Etc/UTC", trace=True)

In [18]:
# load the fetched station data of all stations and networks for the same 24-hour range as the fetch above
# load only the main columns
# use the timestamp column ("valid") as index
station_data = utils.read_station_data(station="ALL",
                                       start_datetime=pd.to_datetime("2022-01-01"),
                                       end_datetime=pd.to_datetime("2022-01-02"),
                                       network="ALL",
                                       category="fetched",
                                       main_columns=True,
                                       set_index_column="valid")

In [19]:
station_data

Unnamed: 0_level_0,tmpf,dwpf,relh,sknt,skyc1,skyl1,feel
valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-01-01 00:00:00,24.8,8.6,49.59,5.0,FEW,2500.0,17.84
2022-01-01 00:01:00,51.1,46.9,85.48,7.0,FEW,700.0,51.10
2022-01-01 00:02:00,77.0,69.8,78.48,7.0,FEW,1400.0,77.00
2022-01-01 00:03:00,35.1,32.0,88.33,9.0,BKN,800.0,27.37
2022-01-01 00:04:00,44.1,44.1,100.00,4.0,SCT,1900.0,41.55
...,...,...,...,...,...,...,...
2022-01-01 23:55:00,5.0,-2.0,72.18,17.0,OVC,3700.0,-15.30
2022-01-01 23:56:00,73.9,72.0,93.79,12.0,FEW,500.0,73.90
2022-01-01 23:57:00,28.4,26.6,92.86,12.0,SCT,700.0,17.42
2022-01-01 23:58:00,47.7,28.0,46.13,20.0,FEW,4200.0,39.99


<a id="preprocess"></a>
## Preprocessing Data

<a id="cleandata"></a>
### Cleaning Fetched Station Data

In [20]:
# clean the fetched station data for the selected time range and stations
# main_columns=False: clean all important columns, not only the main important columns
preprocessing.clean_and_output_station_data(start_datetime, end_datetime, station_list,
                                            main_columns=False)

In [21]:
# load the preprocessed clean station data for station LGAV in network GR__ASOS for the selected time range
# use the timestamp column ("valid") as index
# in the displayed result the index is hourly and wind direction appears as drct_sin / drct_cos
station_data = utils.read_station_data(station="LGAV", start_datetime=start_datetime, end_datetime=end_datetime,
                                       network="GR__ASOS",
                                       category="preprocessed", subcategory="clean",
                                       set_index_column="valid")

In [22]:
station_data

Unnamed: 0_level_0,tmpf,dwpf,relh,sknt,p01i,vsby,gust,skyc1,skyc2,skyc3,skyl1,skyl2,skyl3,feel,drct_sin,drct_cos
valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2018-01-01 00:00:00,36.5,30.2,77.745,0.5,0.0,6.21,0.0,1.0,3.0,3.0,3500.0,7000.0,6000.0,36.500,0.087156,-0.996195
2018-01-01 01:00:00,34.7,31.1,86.540,3.5,0.0,6.21,0.0,1.0,3.0,3.0,3500.0,7000.0,6000.0,32.235,-0.939693,0.342020
2018-01-01 02:00:00,32.9,29.3,86.440,3.5,0.0,6.21,0.0,1.0,3.0,3.0,3500.0,7000.0,6000.0,29.085,-0.965926,0.258819
2018-01-01 03:00:00,35.6,30.2,80.510,3.5,0.0,6.21,0.0,1.0,3.0,3.0,3500.0,7000.0,6000.0,32.210,-0.984808,0.173648
2018-01-01 04:00:00,35.6,30.2,80.735,3.0,0.0,6.21,0.0,1.0,3.0,3.0,3500.0,7000.0,6000.0,33.460,-0.906308,0.422618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-31 19:00:00,53.6,37.4,54.205,5.5,0.0,6.21,0.0,1.0,3.0,3.0,2500.0,2500.0,3500.0,53.600,-0.500000,0.866025
2021-12-31 20:00:00,54.5,37.4,52.605,6.0,0.0,6.21,0.0,1.0,3.0,3.0,2500.0,2500.0,3500.0,54.500,-0.087156,-0.996195
2021-12-31 21:00:00,56.3,37.4,49.040,9.0,0.0,6.21,0.0,1.0,3.0,3.0,2500.0,2500.0,3500.0,56.300,0.342020,0.939693
2021-12-31 22:00:00,54.5,37.4,52.355,5.0,0.0,6.21,0.0,1.0,3.0,3.0,2500.0,2500.0,3500.0,54.500,0.866025,0.500000


<a id="scaledata"></a>
### Scaling Clean Station Data

In [23]:
# scale the clean station data for the selected time range and stations
# main_columns=True: scale only the main important columns
# a list of scalers is passed, so the data is scaled with both MinMaxScaler and StandardScaler
# new_scaler=False: use the saved scalers (new ones are created if the scaler files don't exist)
preprocessing.scale_and_output_station_data(start_datetime, end_datetime, station_list,
                                            main_columns=True,
                                            scaler=["minmax", "standard"], new_scaler=False)

In [24]:
# load the preprocessed clean station data scaled with MinMaxScaler
# for station LGAV in network GR__ASOS for the selected time range
# use the timestamp column ("valid") as index
station_data = utils.read_station_data(station="LGAV", start_datetime=start_datetime, end_datetime=end_datetime,
                                       network="GR__ASOS",
                                       category="preprocessed", subcategory="scaled_minmax",
                                       set_index_column="valid")

In [25]:
station_data

Unnamed: 0_level_0,tmpf,dwpf,relh,sknt,skyc1,skyl1,feel
valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-01 00:00:00,0.127907,0.312500,0.748787,0.012987,0.25,0.320388,0.194238
2018-01-01 01:00:00,0.104651,0.328125,0.848064,0.090909,0.25,0.320388,0.142719
2018-01-01 02:00:00,0.081395,0.296875,0.846935,0.090909,0.25,0.320388,0.104669
2018-01-01 03:00:00,0.116279,0.312500,0.779998,0.090909,0.25,0.320388,0.142417
2018-01-01 04:00:00,0.116279,0.312500,0.782538,0.077922,0.25,0.320388,0.157516
...,...,...,...,...,...,...,...
2021-12-31 19:00:00,0.348837,0.437500,0.483068,0.142857,0.25,0.223301,0.400797
2021-12-31 20:00:00,0.360465,0.437500,0.465007,0.155844,0.25,0.223301,0.411669
2021-12-31 21:00:00,0.383721,0.437500,0.424766,0.233766,0.25,0.223301,0.433412
2021-12-31 22:00:00,0.360465,0.437500,0.462185,0.129870,0.25,0.223301,0.411669


In [26]:
# load the preprocessed clean station data scaled with StandardScaler
# for station LGAV in network GR__ASOS for the selected time range
# use the timestamp column ("valid") as index
station_data = utils.read_station_data(station="LGAV", start_datetime=start_datetime, end_datetime=end_datetime,
                                       network="GR__ASOS",
                                       category="preprocessed", subcategory="scaled_standard",
                                       set_index_column="valid")

In [27]:
station_data

Unnamed: 0_level_0,tmpf,dwpf,relh,sknt,skyc1,skyl1,feel
valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-01 00:00:00,-2.058987,-2.033359,0.996082,-1.215137,-0.361833,1.218478,-1.934166
2018-01-01 01:00:00,-2.190948,-1.934719,1.485775,-0.714693,-0.361833,1.218478,-2.234459
2018-01-01 02:00:00,-2.322909,-2.131998,1.480207,-0.714693,-0.361833,1.218478,-2.456247
2018-01-01 03:00:00,-2.124968,-2.033359,1.150034,-0.714693,-0.361833,1.218478,-2.236220
2018-01-01 04:00:00,-2.124968,-2.033359,1.162561,-0.798101,-0.361833,1.218478,-2.148209
...,...,...,...,...,...,...,...
2021-12-31 19:00:00,-0.805361,-1.244243,-0.314590,-0.381064,-0.361833,-0.181538,-0.730179
2021-12-31 20:00:00,-0.739380,-1.244243,-0.403676,-0.297657,-0.361833,-0.181538,-0.666811
2021-12-31 21:00:00,-0.607420,-1.244243,-0.602170,0.202786,-0.361833,-0.181538,-0.540075
2021-12-31 22:00:00,-0.739380,-1.244243,-0.417595,-0.464472,-0.361833,-0.181538,-0.666811


In [28]:
# scale the clean station data for the selected time range and stations
# main_columns=False: scale all important columns, not only the main important columns
# new_scaler=False: use the saved MinMaxScaler (a new one is created if the scaler file doesn't exist)
preprocessing.scale_and_output_station_data(start_datetime, end_datetime, station_list,
                                            main_columns=False,
                                            scaler="minmax", new_scaler=False)

In [29]:
# load the preprocessed clean station data scaled with MinMaxScaler
# for station LGAV in network GR__ASOS for the selected time range
# use the timestamp column ("valid") as index
# NOTE(review): same arguments as the earlier "scaled_minmax" read, but the displayed result now has
# 16 columns instead of 7 — this suggests the scaling run above replaced the main-columns output; confirm
station_data = utils.read_station_data(station="LGAV", start_datetime=start_datetime, end_datetime=end_datetime,
                                       network="GR__ASOS",
                                       category="preprocessed", subcategory="scaled_minmax",
                                       set_index_column="valid")

In [30]:
station_data

Unnamed: 0_level_0,tmpf,dwpf,relh,sknt,p01i,vsby,gust,skyc1,skyc2,skyc3,skyl1,skyl2,skyl3,feel,drct_sin,drct_cos
valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2018-01-01 00:00:00,0.127907,0.312500,0.748787,0.012987,0.0,1.0,0.0,0.25,0.666667,0.5,0.320388,0.16,0.083977,0.194238,0.543578,0.001903
2018-01-01 01:00:00,0.104651,0.328125,0.848064,0.090909,0.0,1.0,0.0,0.25,0.666667,0.5,0.320388,0.16,0.083977,0.142719,0.030154,0.671010
2018-01-01 02:00:00,0.081395,0.296875,0.846935,0.090909,0.0,1.0,0.0,0.25,0.666667,0.5,0.320388,0.16,0.083977,0.104669,0.017037,0.629410
2018-01-01 03:00:00,0.116279,0.312500,0.779998,0.090909,0.0,1.0,0.0,0.25,0.666667,0.5,0.320388,0.16,0.083977,0.142417,0.007596,0.586824
2018-01-01 04:00:00,0.116279,0.312500,0.782538,0.077922,0.0,1.0,0.0,0.25,0.666667,0.5,0.320388,0.16,0.083977,0.157516,0.046846,0.711309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-31 19:00:00,0.348837,0.437500,0.483068,0.142857,0.0,1.0,0.0,0.25,0.666667,0.5,0.223301,0.04,0.035714,0.400797,0.250000,0.933013
2021-12-31 20:00:00,0.360465,0.437500,0.465007,0.155844,0.0,1.0,0.0,0.25,0.666667,0.5,0.223301,0.04,0.035714,0.411669,0.456422,0.001903
2021-12-31 21:00:00,0.383721,0.437500,0.424766,0.233766,0.0,1.0,0.0,0.25,0.666667,0.5,0.223301,0.04,0.035714,0.433412,0.671010,0.969846
2021-12-31 22:00:00,0.360465,0.437500,0.462185,0.129870,0.0,1.0,0.0,0.25,0.666667,0.5,0.223301,0.04,0.035714,0.411669,0.933013,0.750000


<a id="splitdata"></a>
### Splitting Clean Station Data

In [31]:
# split the clean station data for the selected time range and stations
# main_columns=True: split only the main important columns
# split_columns="clean": split only the clean data columns (no scaled data)
preprocessing.split_and_output_station_data(start_datetime, end_datetime, station_list,
                                            main_columns=True,
                                            split_columns="clean")

In [32]:
# load the preprocessed split clean station data of the temperature column ("tmpf")
# for station LGAV in network GR__ASOS for the selected time range
# use the timestamp column ("valid") as index
# the displayed result has one row per date and columns 0-23 (presumably the hours of the day — confirm)
station_data = utils.read_station_data(station="LGAV", start_datetime=start_datetime, end_datetime=end_datetime,
                                       network="GR__ASOS",
                                       column="tmpf",
                                       category="preprocessed", subcategory="clean",
                                       set_index_column="valid")

In [33]:
station_data

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,34.030862,34.650495,35.430459,36.353585,37.539599,40.653567,44.710856,49.215506,53.823551,57.567158,...,58.808696,57.422078,56.077019,54.691041,53.244034,52.202310,52.341825,52.226097,52.137753,52.130283
2018-01-02,53.406390,53.509714,53.639052,53.829161,53.917375,54.534656,55.548888,56.920711,58.393132,59.634019,...,58.487370,56.602075,54.642518,53.204411,52.609786,52.231907,51.702538,51.221405,50.734233,50.205196
2018-01-03,47.927456,47.704333,47.561935,47.458669,47.330485,48.078804,49.536368,51.183385,52.803553,54.447099,...,54.488358,52.325492,49.710703,47.158202,45.145768,44.147991,44.391672,44.576266,44.867396,45.246158
2018-01-04,49.714237,50.036949,50.297736,50.473861,50.482826,50.027728,49.374697,48.528404,47.563525,46.849101,...,47.452525,47.884631,48.368047,48.866650,49.286131,49.657711,49.823314,49.954249,50.058868,50.140865
2018-01-05,49.201990,48.067867,47.059480,46.148748,45.330973,46.132103,47.944407,50.243023,52.608291,54.747635,...,54.251902,52.144661,49.701946,47.356239,45.215386,43.182240,41.465912,39.813571,38.169375,36.566791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-27,58.952667,58.978369,59.019147,59.089020,59.242331,59.744911,60.273263,60.881542,61.586489,62.168986,...,60.833374,60.166600,59.666331,59.284843,59.054270,59.000000,59.000000,59.000000,59.000000,59.000000
2021-12-28,58.109741,58.018879,57.950037,57.913173,57.867534,57.994439,58.470558,59.222471,59.924163,60.385534,...,58.778125,57.978170,57.205629,56.568948,56.187209,56.073456,56.071026,56.041943,56.034319,56.022061
2021-12-29,54.973297,54.304183,53.642829,53.018168,52.376085,52.228326,52.889278,54.001028,55.430888,56.871621,...,56.976963,56.172158,55.214591,54.170112,52.973992,51.475856,49.834849,48.227508,46.601253,44.977401
2021-12-30,45.886507,47.599114,49.284227,50.929609,52.542367,53.800328,54.601569,55.052976,55.321193,55.327554,...,53.638750,53.475502,53.419878,53.338958,53.091588,52.821126,52.784038,52.776003,52.762912,52.779584


In [34]:
# split the clean station data for the selected time range and stations
# main_columns=True: split only the main important columns
# split_columns="all": split both the clean and the scaled clean data columns
# scaler="minmax": of the scaled data, split only the data scaled with MinMaxScaler
preprocessing.split_and_output_station_data(start_datetime, end_datetime, station_list,
                                            main_columns=True,
                                            split_columns="all", scaler="minmax")

In [35]:
# load the preprocessed split clean station data, scaled with MinMaxScaler, of the temperature column ("tmpf")
# for station LGAV in network GR__ASOS for the selected time range
# use the timestamp column ("valid") as index
station_data = utils.read_station_data(station="LGAV", start_datetime=start_datetime, end_datetime=end_datetime,
                                       network="GR__ASOS",
                                       column="tmpf",
                                       category="preprocessed", subcategory="scaled_minmax",
                                       set_index_column="valid")

In [36]:
station_data

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,0.096006,0.104012,0.114089,0.126015,0.141338,0.181571,0.233990,0.292190,0.351725,0.400092,...,0.416133,0.398218,0.380840,0.362933,0.344238,0.330779,0.332582,0.331087,0.329945,0.329849
2018-01-02,0.346336,0.347671,0.349342,0.351798,0.352938,0.360913,0.374017,0.391740,0.410764,0.426796,...,0.411982,0.387624,0.362306,0.343726,0.336044,0.331162,0.324322,0.318106,0.311812,0.304977
2018-01-03,0.275549,0.272666,0.270826,0.269492,0.267836,0.277504,0.296336,0.317615,0.338547,0.359782,...,0.360315,0.332371,0.298588,0.265610,0.239609,0.226718,0.229867,0.232251,0.236013,0.240906
2018-01-04,0.298634,0.302803,0.306172,0.308448,0.308564,0.302684,0.294247,0.283313,0.270847,0.261616,...,0.269412,0.274995,0.281241,0.287683,0.293102,0.297903,0.300043,0.301734,0.303086,0.304146
2018-01-05,0.292015,0.277363,0.264334,0.252568,0.242002,0.252353,0.275768,0.305465,0.336024,0.363665,...,0.357260,0.330034,0.298475,0.268168,0.240509,0.214241,0.192066,0.170718,0.149475,0.128770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-27,0.417993,0.418325,0.418852,0.419755,0.421736,0.428229,0.435055,0.442914,0.452022,0.459548,...,0.442292,0.433677,0.427214,0.422285,0.419306,0.418605,0.418605,0.418605,0.418605,0.418605
2021-12-28,0.407103,0.405929,0.405039,0.404563,0.403973,0.405613,0.411764,0.421479,0.430545,0.436506,...,0.415738,0.405403,0.395422,0.387196,0.382264,0.380794,0.380763,0.380387,0.380288,0.380130
2021-12-29,0.366580,0.357935,0.349391,0.341320,0.333024,0.331115,0.339655,0.354018,0.372492,0.391106,...,0.392467,0.382069,0.369698,0.356203,0.340749,0.321393,0.300192,0.279425,0.258414,0.237434
2021-12-30,0.249180,0.271306,0.293078,0.314336,0.335173,0.351425,0.361777,0.367610,0.371075,0.371157,...,0.349338,0.347229,0.346510,0.345465,0.342269,0.338774,0.338295,0.338191,0.338022,0.338238


<a id="combinesteps"></a>
### Combining Preprocessing Steps

In [37]:
# preprocess the fetched station data for the selected time range and stations in a single call
# (presumably the clean, scale and split steps shown above — confirm in preprocessing)
# main_columns=True: preprocess only the main important columns
# split_columns="clean": split only the clean data columns
# new_scaler=False: scale with the saved MinMaxScaler (a new one is created if the scaler file doesn't exist)
preprocessing.preprocess_and_output_station_data(start_datetime, end_datetime, station_list,
                                                 main_columns=True,
                                                 split_columns="clean",
                                                 scaler="minmax", new_scaler=False)

<a id="model"></a>
## Modeling Data

<a id="clustersplitdata"></a>
### Clustering Split Clean Station Data

In [38]:
# cluster the split clean station data for the selected time range and stations
# clusters=15: create 15 clusters for each split column
# new_model=False: use the saved TimeSeriesKMeans models (new ones are created if the model files don't exist)
# NOTE(review): no random seed is passed here — if models are fitted anew, cluster labels may differ
# between runs unless modeling fixes a seed internally; confirm
modeling.cluster_and_output_station_data(start_datetime, end_datetime, station_list,
                                         clusters=15, new_model=False)

In [39]:
# load the clustered split clean station data for station LGAV in network GR__ASOS for the selected time range
# use the timestamp column ("valid") as index
# the displayed result has one row per date and one integer cluster label per column
station_data = utils.read_station_data(station="LGAV", start_datetime=start_datetime, end_datetime=end_datetime,
                                       network="GR__ASOS",
                                       category="modeled", subcategory="kmeans_timeseries",
                                       set_index_column="valid")

In [40]:
station_data

Unnamed: 0_level_0,tmpf,dwpf,relh,sknt,skyc1,skyl1,feel
valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-01,6,3,5,13,0,1,3
2018-01-02,10,6,8,13,0,2,1
2018-01-03,1,2,0,1,4,8,13
2018-01-04,5,2,11,10,4,0,13
2018-01-05,12,3,7,12,0,5,13
...,...,...,...,...,...,...,...
2021-12-27,4,11,6,0,3,0,5
2021-12-28,4,7,9,12,0,10,5
2021-12-29,10,9,10,8,8,5,1
2021-12-30,10,0,4,5,5,4,1
