In [12]:
import os
import datetime
import sys

sys.path.append("..")


import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from data.create_datasets import WindowGenerator

# Notebook-wide plotting defaults: figure size (in inches) and no grid lines.
mpl.rcParams.update({
    'figure.figsize': (15, 10),
    'axes.grid': False,
})

In [6]:
# Load the input table; the path is resolved relative to the notebook's working directory.
# NOTE(review): the file has no extension — presumably the feature-engineered CSV; confirm.
df = pd.read_csv(filepath_or_buffer='../datasets/df_feat_enged')

In [182]:
# Split by row position, keeping the stored order:
# the first 90% of rows go to training, the remaining 10% to testing.
n = len(df)
train_df, test_df = df.iloc[:int(n*0.9)], df.iloc[int(n*0.9):]

In [183]:
# Standardise both splits with the per-column mean and standard deviation
# of the training rows only, so the test rows do not influence the scaling.
train_mean, train_std = train_df.mean(), train_df.std()

train_df = train_df.sub(train_mean).div(train_std)
test_df = test_df.sub(train_mean).div(train_std)

In [184]:
# Preview the first five rows of the standardised training frame.
train_df.head(n=5)

Unnamed: 0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),Wx,Wy,max Wx,max Wy,Day sin,Day cos,Year sin,Year cos
0,0.895312,-2.041187,-2.098064,-1.993393,1.109555,-1.308313,-1.500978,-0.780463,-1.502881,-1.505733,2.273012,0.216224,0.238134,0.134965,0.233763,-8.6e-05,1.414463,-0.008544,1.381134
1,0.909606,-2.139495,-2.196721,-2.139408,1.03649,-1.33573,-1.558487,-0.776308,-1.559503,-1.562546,2.382824,0.195836,0.239078,0.133287,0.243661,0.365955,1.366278,-0.007527,1.381125
2,0.935811,-2.131204,-2.190848,-2.123184,1.054756,-1.334424,-1.551299,-0.778386,-1.551954,-1.555444,2.381073,0.230773,0.293458,0.135042,0.3402,0.707052,1.225006,-0.006509,1.381114
3,0.953678,-2.15963,-2.22021,-2.17628,0.999957,-1.342257,-1.570468,-0.772154,-1.570828,-1.576749,2.416843,0.29303,0.212137,0.270407,0.160815,0.999958,1.000275,-0.005492,1.381103
4,1.009661,-2.228327,-2.293028,-2.269198,0.975603,-1.359229,-1.604015,-0.772154,-1.604802,-1.60989,2.506394,0.135214,0.368306,0.072614,0.418385,1.224713,0.707399,-0.004474,1.381091


In [190]:
# Window configuration: the same three columns are used as features and as labels.
# NOTE(review): the exact meaning of input_width / label_width / shift is
# defined by WindowGenerator in data/create_datasets — confirm there.
window = WindowGenerator(
    input_width=24,
    label_width=15,
    shift=15,
    feature_columns=["p (mbar)", "T (degC)", "VPdef (mbar)"],
    label_columns=["p (mbar)", "T (degC)", "VPdef (mbar)"],
)

In [191]:
# Build an (inputs, labels) pair for each split using the `window` object.
# NOTE(review): what create_dataset returns is defined by WindowGenerator —
# presumably array-likes with one sample per entry on axis 0; confirm.
inputs_train, labels_train = window.create_dataset(train_df)
inputs_test, labels_test = window.create_dataset(test_df)

In [192]:
from sklearn.linear_model import LinearRegression
from sklearn import datasets, ensemble
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

In [195]:
## Reshaping datasets:
# Record the size of the first axis of each split; the reshape that follows
# keeps this axis and flattens all the others.

train_dim1, test_dim1 = inputs_train.shape[0], inputs_test.shape[0]

In [196]:
# Collapse every axis after the first so each sample becomes a single flat row.
inputs_train = np.reshape(inputs_train, (train_dim1, -1))
labels_train = np.reshape(labels_train, (train_dim1, -1))
inputs_test = np.reshape(inputs_test, (test_dim1, -1))
labels_test = np.reshape(labels_test, (test_dim1, -1))

In [201]:
# Candidate values for the grid searches.
estimators = [100, 300, 500]   # number of trees
depth = [1, 2, 3]              # maximum tree depth
nn = [3, 5, 7]                 # number of neighbours

# Search grids keyed by estimator constructor argument name.
params_rf = dict(n_estimators=estimators, max_depth=depth)
params_kn = dict(n_neighbors=nn)

In [202]:
# Pair each estimator class with its search grid (None means: fit as-is,
# without a grid search), then unzip into the two parallel lists.
params, models = map(list, zip(
    (None, LinearRegression),
    (params_kn, KNeighborsRegressor),
    (params_rf, ensemble.ExtraTreesRegressor),
))

In [204]:
# Fit every candidate model (through a grid search when a parameter grid is
# given) and record its errors on the test split.
#   clfs[name]   -> the fitted estimator, or the fitted GridSearchCV wrapper
#   scores[name] -> (mean squared error, mean absolute error) on the test split
# `name` is str() of the default-constructed estimator, e.g. "LinearRegression()".
scores = {}
clfs = {}
for param, model_cls in zip(params, models):
    model = model_cls()
    if param is None:
        # Nothing to tune: fit the estimator directly.
        clf = model
    else:
        # NOTE(review): cv=5 is plain K-fold; for windowed time-series data a
        # time-ordered splitter may be more appropriate — confirm.
        clf = GridSearchCV(model, param, cv=5,
                           scoring="neg_mean_squared_error")

    clf.fit(inputs_train, labels_train)

    # Predict once and reuse the result for both metrics.
    y_true = labels_test.flatten()
    y_pred = clf.predict(inputs_test).flatten()

    name = str(model)
    clfs[name] = clf
    # Explicitly parenthesised so BOTH metrics are stored. Previously a
    # trailing comma ended the assignment after the MSE, leaving a 1-tuple,
    # and the MAE was computed on its own line and thrown away.
    scores[name] = (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
    )
    



None <class 'sklearn.linear_model._base.LinearRegression'>
{'n_neighbors': [3, 5, 7]} <class 'sklearn.neighbors._regression.KNeighborsRegressor'>
{'n_estimators': [100, 300, 500], 'max_depth': [1, 2, 3]} <class 'sklearn.ensemble._forest.ExtraTreesRegressor'>


In [217]:
# Show the estimator (with its chosen hyper-parameters) selected by the grid
# search for the extra-trees model.
# The key is rebuilt the same way it was stored — str() of a
# default-constructed estimator — instead of being hard-coded, so the lookup
# does not depend on how the installed scikit-learn prints estimators.
clfs[str(ensemble.ExtraTreesRegressor())].best_estimator_

ExtraTreesRegressor(max_depth=3, n_estimators=500)