In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

In [2]:
def dataframe_read_shuffle(path):
    dataframe = pd.read_csv(path,sep=",")
    np.random.seed(83)
    dataframe = dataframe.reindex(np.random.permutation(dataframe.index))
    return dataframe

def dataframe_splitter(dataframe):
    train_set = dataframe.sample(frac=0.7, random_state=200)
    remainframe = dataframe.drop(train_set.index)
    validate_set = remainframe.sample(frac=0.5, random_state=200)
    test_set = remainframe.drop(validate_set.index)
    return train_set,validate_set,test_set

def features_target_splitter(dataset, features, target):
    X = dataset[features]
    y = dataset[target]
    return X,y

In [3]:
def train_model(model, train_features, train_target):
    model.fit(train_features, train_target)

In [4]:
def validate_model(model,train_features,train_target,validate_features,validate_target):
    rscore_train = model.score(train_features,train_target)
    rscore = model.score(validate_features,validate_target)
    predictions = model.predict(validate_features)
    mse = mean_squared_error(validate_target,predictions)
    rmse = np.sqrt(mse)
    return rscore_train,rscore,rmse

In [5]:
dataframe = dataframe_read_shuffle("dataset/wave-data.csv")

In [4]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43728 entries, 5149 to 34898
Data columns (total 7 columns):
date/time                        43728 non-null object
significant wave height          43728 non-null float64
max wave height                  43728 non-null float64
wave period (zero upcrossing)    43728 non-null float64
wave period (peak energy)        43728 non-null float64
peak direction                   43728 non-null float64
sea surface temp (apx)           43728 non-null float64
dtypes: float64(6), object(1)
memory usage: 2.7+ MB


In [5]:
dataframe.describe()

Unnamed: 0,significant wave height,max wave height,wave period (zero upcrossing),wave period (peak energy),peak direction,sea surface temp (apx)
count,43728.0,43728.0,43728.0,43728.0,43728.0,43728.0
mean,1.04114,1.891779,5.412332,8.794887,97.39531,23.207409
std,4.485922,4.580826,4.739317,5.358859,28.782966,9.813377
min,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9
25%,0.838,1.41,4.974,7.286,85.0,21.9
50%,1.129,1.9,5.525,8.881,99.0,23.95
75%,1.542,2.6,6.162,10.66325,116.0,26.05
max,4.257,7.906,10.921,21.121,358.0,28.65


In [6]:
dataframe.mean()

significant wave height           1.041140
max wave height                   1.891779
wave period (zero upcrossing)     5.412332
wave period (peak energy)         8.794887
peak direction                   97.395310
sea surface temp (apx)           23.207409
dtype: float64

In [7]:
dataframe.head()

Unnamed: 0,date/time,significant wave height,max wave height,wave period (zero upcrossing),wave period (peak energy),peak direction,sea surface temp (apx)
5149,4/18/2017 6:30,1.421,2.14,6.387,14.525,81.0,23.7
22713,4/19/2018 4:30,1.283,2.24,4.759,6.573,115.0,25.15
17254,12/26/2017 11:00,0.538,0.97,4.342,6.751,119.0,25.2
15869,11/27/2017 14:30,1.15,2.07,5.941,8.09,84.0,24.6
37631,23/02/2019 23:30,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9


In [6]:
dataframe = dataframe.replace(-99.9,0)

In [9]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43728 entries, 5149 to 34898
Data columns (total 7 columns):
date/time                        43728 non-null object
significant wave height          43728 non-null float64
max wave height                  43728 non-null float64
wave period (zero upcrossing)    43728 non-null float64
wave period (peak energy)        43728 non-null float64
peak direction                   43728 non-null float64
sea surface temp (apx)           43728 non-null float64
dtypes: float64(6), object(1)
memory usage: 2.7+ MB


In [10]:
dataframe.describe()

Unnamed: 0,significant wave height,max wave height,wave period (zero upcrossing),wave period (peak energy),peak direction,sea surface temp (apx)
count,43728.0,43728.0,43728.0,43728.0,43728.0,43728.0
mean,1.235329,2.085968,5.606521,8.989076,98.01443,23.805969
std,0.530552,0.90088,0.959896,2.42155,25.409106,2.892055
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.838,1.41,4.974,7.286,85.0,21.9
50%,1.129,1.9,5.525,8.881,99.0,23.95
75%,1.542,2.6,6.162,10.66325,116.0,26.05
max,4.257,7.906,10.921,21.121,358.0,28.65


In [11]:
dataframe.head()

Unnamed: 0,date/time,significant wave height,max wave height,wave period (zero upcrossing),wave period (peak energy),peak direction,sea surface temp (apx)
5149,4/18/2017 6:30,1.421,2.14,6.387,14.525,81.0,23.7
22713,4/19/2018 4:30,1.283,2.24,4.759,6.573,115.0,25.15
17254,12/26/2017 11:00,0.538,0.97,4.342,6.751,119.0,25.2
15869,11/27/2017 14:30,1.15,2.07,5.941,8.09,84.0,24.6
37631,23/02/2019 23:30,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
train, validate, test = dataframe_splitter(dataframe)

In [8]:
X_train, y_train = features_target_splitter(train, ["significant wave height", "wave period (zero upcrossing)", "wave period (peak energy)", "peak direction", "sea surface temp (apx)"],["max wave height"])

In [9]:
X_validate, y_validate = features_target_splitter(validate, ["significant wave height", "wave period (zero upcrossing)", "wave period (peak energy)", "peak direction", "sea surface temp (apx)"],["max wave height"])

In [10]:
X_test, y_test = features_target_splitter(test, ["significant wave height", "wave period (zero upcrossing)", "wave period (peak energy)", "peak direction", "sea surface temp (apx)"],["max wave height"])

In [11]:
randfor_reg = RandomForestRegressor(n_estimators=20,random_state=5,n_jobs=1,max_features=4,max_depth=7)

In [17]:
svm_reg = SVR(C=1,gamma=0.1,kernel="rbf",degree=3,epsilon=0.01,max_iter=-1)

In [45]:
nn_reg = MLPRegressor(
    solver="lbfgs",random_state=2,activation="relu",hidden_layer_sizes=[5,3],alpha=1,max_iter=250,
    batch_size=150, learning_rate="constant", learning_rate_init=0.00001
)

In [21]:
train_model(randfor_reg,X_train,y_train)

  


In [22]:
randfor_rscore_train,randfor_rscore,randfor_rmse = validate_model(randfor_reg,X_train,y_train,X_validate,y_validate)
print("RandForest r score training: {:.3f}".format(randfor_rscore_train))
print("RandForest r score: {:.3f}".format(randfor_rscore))
print("RandForest RMSE: {:.4f}".format(randfor_rmse))

RandForest r score training: 0.950
RandForest r score: 0.947
RandForest RMSE: 0.2093


In [18]:
train_model(svm_reg,X_train,y_train)

  y = column_or_1d(y, warn=True)


In [19]:
svm_rscore_train,svm_rscore,svm_rmse = validate_model(svm_reg,X_train,y_train,X_validate,y_validate)
print("SVM r score training: {:.3f}".format(svm_rscore_train))
print("SVM r score: {:.3f}".format(svm_rscore))
print("SVM RMSE: {:.4f}".format(svm_rmse))

SVM r score training: 0.944
SVM r score: 0.936
SVM RMSE: 0.2293


In [46]:
train_model(nn_reg,X_train,y_train)

  y = column_or_1d(y, warn=True)


In [47]:
nn_rscore_train,nn_rscore,nn_rmse = validate_model(nn_reg,X_train,y_train,X_validate,y_validate)
print("NeuralNet r score training: {:.3f}".format(nn_rscore_train))
print("NeuralNet r score: {:.3f}".format(nn_rscore))
print("NeuralNet RMSE: {:.4f}".format(nn_rmse))

NeuralNet r score training: 0.947
NeuralNet r score: 0.947
NeuralNet RMSE: 0.2079


In [30]:
test.head()

Unnamed: 0,date/time,significant wave height,max wave height,wave period (zero upcrossing),wave period (peak energy),peak direction,sea surface temp (apx)
5149,4/18/2017 6:30,1.421,2.14,6.387,14.525,81.0,23.7
17254,12/26/2017 11:00,0.538,0.97,4.342,6.751,119.0,25.2
15869,11/27/2017 14:30,1.15,2.07,5.941,8.09,84.0,24.6
37631,23/02/2019 23:30,0.0,0.0,0.0,0.0,0.0,0.0
30842,10/05/2018 13:00,1.052,1.95,6.823,9.521,98.0,21.95


In [49]:
prediction = randfor_reg.predict([[1.150,5.941,8.090,84.0,24.60]])
print(prediction)

[1.929229]


In [48]:
prediction = nn_reg.predict([[1.150,5.941,8.090,84.0,24.60]])
print(prediction)

[1.93593659]
