# Importing packages

In [49]:
import pandas as pd
import numpy as np
import tensorflow as tf 
import seaborn as sns 
import matplotlib as plt
sns.set()


# Data Processing

## Data Loading

In [57]:
# importing data

raw_data = pd.read_csv('sea_water.csv')
raw_data.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,,,,,,,,,
1,,,,,,,,,
2,1.0,38.2,22.6,33.0,22.0,6.111111,0.0,5.2,1.0
3,2.0,37.3,22.6,49.0,24.0,6.666667,0.0,5.5,3.45
4,3.0,37.7,21.9,37.0,14.0,3.888889,0.0,5.7,3.68


## Data Cleaning

In [62]:
raw_data1 = raw_data.iloc[3:, :]
raw_data1.columns = ['Date April','Max_T','Min_T', 'RH(%)', 'ws', 'WS(mps)', 'rf','Den(Be)','Evp(mm)']
raw_data2 = raw_data1.drop(columns=['ws','rf'])
raw_data2.reset_index(drop=True, inplace=True)
raw_data2.head()


Unnamed: 0,Date April,Max_T,Min_T,RH(%),WS(mps),Den(Be),Evp(mm)
0,2.0,37.3,22.6,49.0,6.666667,5.5,3.45
1,3.0,37.7,21.9,37.0,3.888889,5.7,3.68
2,4.0,39.2,25.2,21.0,3.888889,6.1,5.01
3,5.0,39.5,25.4,44.0,6.111111,6.5,5.44
4,6.0,38.0,25.6,32.0,5.0,6.9,3.61


In [64]:
# input data
data = raw_data2.iloc[:,0:-1]
data.head()


Unnamed: 0,Date April,Max_T,Min_T,RH(%),WS(mps),Den(Be)
0,2.0,37.3,22.6,49.0,6.666667,5.5
1,3.0,37.7,21.9,37.0,3.888889,5.7
2,4.0,39.2,25.2,21.0,3.888889,6.1
3,5.0,39.5,25.4,44.0,6.111111,6.5
4,6.0,38.0,25.6,32.0,5.0,6.9


In [67]:
# target data
target = raw_data2.iloc[:,-1]
target.head()

0    3.45
1    3.68
2    5.01
3    5.44
4    3.61
Name: Evp(mm), dtype: float64

## Data Checkpoint

In [68]:
# data checkpoint
input_data = data.copy()

## Standardlisation of data

In [69]:
# standardlisation of data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaled_input = scaler.fit_transform(input_data)

## Shuffling the data

In [82]:
shuffled_indices = np.arange(scaled_input.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_inputs = scaled_input[shuffled_indices]
shuffled_targets = target[shuffled_indices]


## Data split

In [93]:
sample_count = shuffled_indices.shape[0]

train_sample_count = int(0.8 * sample_count)
validation_sample_count = int(0.1 * sample_count)
test_sample_count = int(0.1 * sample_count)

In [94]:
train_input = shuffled_inputs[:train_sample_count]
train_targets = shuffled_targets[:train_sample_count]

  train_targets = shuffled_targets[:train_sample_count]


In [95]:
validation_input = shuffled_inputs[train_sample_count:train_sample_count+validation_sample_count]
validation_target = shuffled_targets[train_sample_count:train_sample_count+validation_sample_count]

  validation_target = shuffled_targets[train_sample_count:train_sample_count+validation_sample_count]


In [96]:
test_input = shuffled_inputs[train_sample_count+validation_sample_count:validation_sample_count:]
test_target = shuffled_targets[train_sample_count+validation_sample_count:]

  test_target = shuffled_targets[train_sample_count+validation_sample_count:]


In [97]:
print(np.sum(train_targets), train_sample_count, np.sum(train_targets) / train_sample_count)
print(np.sum(validation_target), validation_sample_count, np.sum(validation_target) / validation_sample_count)
print(np.sum(test_target), test_sample_count, np.sum(test_target) / test_sample_count)

82.48000000000002 17 4.851764705882354
15.43 2 7.715
15.93 2 7.965


## Save datasets in *.npz

In [99]:
np.savez('seawater_data_train', inputs=train_input, targets=train_targets)
np.savez('seawater_data_validation', inputs=validation_input, targets=validation_target)
np.savez('seawater_data_test', inputs=test_input, targets=test_target)