# Format input data and split into training & validation
Combine the preprocessed labels (precipitation data) and features (reanalysis data) into a single table keyed on time, split it into training and validation sets, and save the features and labels of each set as separate csv files.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Folder holding the preprocessed inputs. The trailing slash matters:
# file names are appended to this string by plain concatenation.
data_dir = "../data/input_data_preprocessed/"

# Load the labels table and the features table as DataFrames
labels_df = pd.read_csv(data_dir + "precip_classes.csv")
features_df = pd.read_csv(data_dir + "slp_hgt_anoms.csv")

# Join the two tables on their shared "time" column so that every row
# pairs a label with the features for the same timestamp
input_df = pd.merge(labels_df, features_df, on="time")

# Replace the "time" column with separate numeric year/month/day columns
datetime_np = pd.to_datetime(input_df["time"].values)
input_df = input_df.assign(
    year=datetime_np.year,
    month=datetime_np.month,
    day=datetime_np.day,
).drop(columns="time")

display(input_df)

In [None]:
# Names of the label column(s) and feature columns to pull out of input_df
labels_list = ["precip_classes"]
features_list = ["slp_anom", "hgt_detrended_anom", "year", "month", "day"]

# Extract the selected columns as numpy arrays (one row per sample)
# NOTE(review): the feature columns are combined into a single array, so
# they will share one common dtype -- confirm downstream code is fine with
# the date columns no longer being stored as integers.
labels_np = input_df[labels_list].to_numpy()
features_np = input_df[features_list].to_numpy()

In [None]:
# Hold out 20% of the samples for validation and keep the remaining 80%
# for training. Features and labels are split together so rows stay paired,
# and the fixed random_state makes the split reproducible between runs.
# NOTE(review): no stratify argument is passed, so class proportions may
# differ between the two sets -- confirm this is intended.
(
    train_features,
    val_features,
    train_labels,
    val_labels,
) = train_test_split(features_np, labels_np, test_size=0.20, random_state=18)

In [None]:
# Format as DataFrames and save as csv
import os

# DataFrame.to_csv does not create missing folders, so make sure both
# output directories exist before writing anything into them
train_dir = data_dir + "training/"
val_dir = data_dir + "validation/"
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

## Training data
# Rebuild labelled tables from the numpy arrays; index=False keeps the
# meaningless positional row index out of the csv files
features_train_df = pd.DataFrame(data=train_features, columns=features_list)
features_train_df.to_csv(train_dir + "features_train.csv", index=False)

labels_train_df = pd.DataFrame(data=train_labels, columns=labels_list)
labels_train_df.to_csv(train_dir + "labels_train.csv", index=False)

## Validation data
features_val_df = pd.DataFrame(data=val_features, columns=features_list)
features_val_df.to_csv(val_dir + "features_validation.csv", index=False)

labels_val_df = pd.DataFrame(data=val_labels, columns=labels_list)
labels_val_df.to_csv(val_dir + "labels_validation.csv", index=False)