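# Format input data and split into training, validation & testing
Combine the preprocessed labels (precip data) and features (reanalysis data) into a single table, split it by year into training, validation, and testing sets, and save the features and labels of each set as separate csv files.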

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Folder holding the preprocessed inputs. Keep the trailing slash: file paths
# are formed by plain string concatenation.
data_dir = "../data/input_data_preprocessed/"

# Load the features table and the labels table from their csv files
features_df = pd.read_csv(data_dir + "slp_hgt_anoms.csv")
labels_df = pd.read_csv(data_dir + "precip_classes.csv")

# Join labels and features on their shared "time" column so that every row
# pairs a label with the features recorded for the same time value
# (merge defaults to an inner join, so unmatched times are dropped)
input_df = pd.merge(labels_df, features_df, on="time")

# Swap the "time" column for separate year / month / day columns
datetime_np = pd.to_datetime(input_df["time"].values)
input_df = input_df.assign(
    year=datetime_np.year,
    month=datetime_np.month,
    day=datetime_np.day,
).drop(columns="time")

display(input_df)

## Split the data by year into testing-validation-training

In [None]:
# Year of every row, used to build the three boolean masks below
year_col = input_df["year"]

## Testing: every row with year <= 2004
## (intended as the 4 years 2001-2004; the filter has no lower bound, so this
## assumes the data starts in 2001 -- TODO confirm)
testing_df = input_df.loc[year_col <= 2004]

## Validation: the 4 years 2005-2008 (both ends inclusive)
validation_df = input_df.loc[year_col.between(2005, 2008)]

## Training: every row with year >= 2009
## (intended as the 12 years 2009-2020; the filter has no upper bound, so this
## assumes the data ends in 2020 -- TODO confirm)
training_df = input_df.loc[year_col >= 2009]

## Save as csv 

In [None]:
import os

# Columns to use for labels vs. features
labels_list = ["precip_classes"]
features_list = ["slp_anom","hgt_detrended_anom","year","month","day"]

# Each split is written to its own subdirectory of data_dir as a pair of
# files: features_<split>.csv and labels_<split>.csv
splits = {
    "training": training_df,
    "validation": validation_df,
    "testing": testing_df,
}

for split_name, split_df in splits.items():
    split_dir = data_dir + split_name + "/"

    # to_csv does not create missing parent directories, so make sure the
    # output folder exists (no-op when it is already there)
    os.makedirs(split_dir, exist_ok=True)

    # index=False: the row index carries no information, keep it out of the csv
    split_df[features_list].to_csv(split_dir + "features_" + split_name + ".csv", index=False)
    split_df[labels_list].to_csv(split_dir + "labels_" + split_name + ".csv", index=False)