# Importing packages

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
# The plotting functions (plot, figure, show, ...) live in the
# `matplotlib.pyplot` submodule; aliasing the top-level `matplotlib`
# package as `plt` would make calls such as `plt.plot(...)` fail.
import matplotlib.pyplot as plt

# Apply seaborn's default theme to all subsequent matplotlib figures.
sns.set()


# Data Processing

## Data Loading

In [None]:
# Read the raw CSV into a DataFrame and preview its first rows.
csv_path = 'six_blades.csv'
raw_data = pd.read_csv(csv_path)
raw_data.head()

## Data Cleaning

In [None]:
# Skip the first three rows, then give the remaining columns explicit names.
column_names = [
    'Date_April', 'Max_T', 'Min_T', 'RH(%)', 'ws',
    'WS(mps)', 'rf', 'Den(Be)', 'Evp(mm)',
]
raw_data1 = raw_data.iloc[3:]
raw_data1.columns = column_names

# Drop the 'ws' and 'rf' columns and renumber the rows from zero so that
# row labels and row positions coincide.
raw_data2 = raw_data1.drop(columns=['ws', 'rf']).reset_index(drop=True)
raw_data2.head()


In [None]:
# Model inputs: every column of the cleaned table except the last one.
data = raw_data2.iloc[:, :-1]
data.head()


In [None]:
# Target: the last column of the cleaned table, taken on its own as a Series.
last_column = raw_data2.shape[1] - 1
target = raw_data2.iloc[:, last_column]
target.head()

## Data Checkpoint

In [None]:
# Checkpoint: continue with an independent copy so that later steps
# cannot modify `data` itself.
input_data = data.copy(deep=True)

## Standardisation of data

In [None]:
# Standardise the input columns with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on every row before the
# train/validation/test split further down, so statistics from the
# validation and test rows influence the scaling - confirm this is intended.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(input_data)
scaled_input = scaler.transform(input_data)

## Deploying Scaler

In [None]:
# Save the fitted scaler to disk so the same scaling can be reapplied later.
import joblib

scaler_file = 'spem_scaler.pkl'
joblib.dump(scaler, scaler_file)

## Shuffling the data

In [None]:
# Build a random permutation of the row positions and apply the same
# permutation to inputs and targets so that each row stays paired with
# its target. No random seed is set here, so the order differs per run.
shuffled_indices = np.arange(scaled_input.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_inputs = scaled_input[shuffled_indices]
# `target` is a pandas Series: plain `target[...]` with an integer array is
# a label lookup, which only matches row positions while the index is the
# default 0..n-1 range. `.iloc` makes the positional intent explicit and
# keeps the pairing correct regardless of the index.
shuffled_targets = target.iloc[shuffled_indices]


## Data split

In [None]:
# Split sizes for an 80 / 10 / 10 train / validation / test partition.
sample_count = shuffled_indices.shape[0]

train_sample_count = int(0.8 * sample_count)
validation_sample_count = int(0.1 * sample_count)
# The test split is sliced to the end of the shuffled arrays, so it holds
# every row left over after the train and validation splits. Deriving the
# count from the remainder keeps it equal to the actual test size;
# int(0.1 * sample_count) under-counts whenever the truncations above drop
# rows, and can even be 0 for small datasets.
test_sample_count = sample_count - train_sample_count - validation_sample_count

In [None]:
# Training split: the first `train_sample_count` shuffled rows.
train_rows = slice(None, train_sample_count)
train_input = shuffled_inputs[train_rows]
train_targets = shuffled_targets[train_rows]

In [None]:
# Validation split: the `validation_sample_count` rows that directly
# follow the training rows.
validation_stop = train_sample_count + validation_sample_count
validation_rows = slice(train_sample_count, validation_stop)
validation_input = shuffled_inputs[validation_rows]
validation_target = shuffled_targets[validation_rows]

In [None]:
# Test split: every row after the training and validation rows.
test_start = train_sample_count + validation_sample_count
test_rows = slice(test_start, None)
test_input = shuffled_inputs[test_rows]
test_target = shuffled_targets[test_rows]

In [None]:
# For each split, print the sum of its targets, its sample count, and the
# ratio of the two (i.e. the target sum per counted sample).
split_summaries = (
    (train_targets, train_sample_count),
    (validation_target, validation_sample_count),
    (test_target, test_sample_count),
)
for split_targets, split_count in split_summaries:
    target_total = np.sum(split_targets)
    print(target_total, split_count, target_total / split_count)

## Save datasets in *.npz

In [None]:
# Write each split to its own .npz archive (np.savez appends the ".npz"
# extension), storing the arrays under the keys "inputs" and "targets".
splits_by_file = {
    'seawater_data_train': (train_input, train_targets),
    'seawater_data_validation': (validation_input, validation_target),
    'seawater_data_test': (test_input, test_target),
}
for file_stem, (split_inputs, split_targets) in splits_by_file.items():
    np.savez(file_stem, inputs=split_inputs, targets=split_targets)