Data Challenge : Can you predict the tide ?
Author : Oumeima EL GHARBI
Date : November 2022

# PART 1 : Preprocessing

### Importing libraries

In [1]:
import pandas as pd
%reset -f

from functions import *
from surge_prediction_metric import *

from sklearn.model_selection import train_test_split
from sklearn.neighbors import BallTree
from sklearn.multioutput import MultiOutputRegressor
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor


import warnings
warnings.filterwarnings('ignore')
#warnings.filterwarnings(action="once")

%matplotlib inline
%autosave 300

  from pandas import MultiIndex, Int64Index


Autosaving every 300 seconds


In [2]:
# Ten evenly spaced values from 1 down to 0.1, shaped as a single row (1, 10)
w = np.linspace(1, 0.1, 10).reshape(1, -1)
w

array([[1. , 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]])

In [3]:
# Seed passed to the random train/validation split further down
seed = 42
# Reference error value; the original note next to it read "0.62 SVR"
baseline_error = 0.7720

# Wall-clock start, read back by the timing cell at the end of the notebook
t0 = time()

#### Loading dataset

In [4]:
# Folder holding the raw challenge files, and folder receiving generated files.
# Both end with "/" so file names can simply be appended.
input_path = "./dataset/source/"
output_path = "./dataset/output/"

# Raw file names
X_train_filename = "X_train_surge_new.npz"
Y_train_filename = "Y_train_surge.csv"
X_test_filename = "X_test_surge_new.npz"

# Full relative paths to each raw file
X_train_file = input_path + X_train_filename
Y_train_file = input_path + Y_train_filename
X_test_file = input_path + X_test_filename

In [5]:
# Target table, read from CSV into a DataFrame
Y_train = pd.read_csv(Y_train_file)

# Input archives: for a .npz file np.load returns an NpzFile whose arrays
# are looked up by key (e.g. X_train['slp'])
X_train = np.load(X_train_file)
X_test = np.load(X_test_file)

In [6]:
X_train

<numpy.lib.npyio.NpzFile at 0x225f8d8baf0>

In [7]:
# Quick look at the targets as a plain array: every column except the first
# one (id_sequence). NOTE: `surge_train` is rebound in cell [14] by
# generate_Y_train(Y_train); the recorded outputs of both cells are identical.
surge_train = np.array(Y_train)[:,1:]

In [8]:
surge_train

array([[ 0.58693592,  1.06958024,  0.76792754, ..., -0.42270688,
        -0.45623606, -0.82505705],
       [ 0.76792754, -0.1001619 ,  0.07077463, ..., -0.82505705,
        -0.99270295, -0.99270295],
       [ 0.07077463, -0.24428486, -0.35489084, ..., -0.99270295,
        -0.32211934, -0.88373311],
       ...,
       [ 0.48303332,  0.53330877,  1.44832196, ...,  1.22022298,
         1.44654495,  2.67036005],
       [ 1.44832196,  1.82036029,  1.66283054, ...,  2.67036005,
         2.62006628,  2.67874235],
       [ 1.66283054,  1.52876268,  1.28408882, ...,  2.67874235,
         1.22022298,  0.72566756]])

In [9]:
# Names of the arrays stored in the training archive
np.array(X_train.files)

array(['id_sequence', 't_slp', 'slp', 't_surge1_input', 'surge1_input',
       't_surge2_input', 'surge2_input', 't_surge1_output',
       't_surge2_output'], dtype='<U15')

In [10]:
# Same key listing shown as a table: the recorded output is a single column
# holding the archive's key names.
pd.DataFrame(X_train)

Unnamed: 0,0
0,id_sequence
1,t_slp
2,slp
3,t_surge1_input
4,surge1_input
5,t_surge2_input
6,surge2_input
7,t_surge1_output
8,t_surge2_output


In [11]:
X_train['slp'].shape

(5599, 40, 41, 41)

#### Building the feature matrix X (slp_train / slp_test) from X_train and X_test

##### Preparing X_train for training

In [12]:
# Build the 2-D feature matrix from the training archive with the project
# helper. Recorded shapes: X_train['slp'] is (5599, 40, 41, 41) and the result
# is (5599, 3362), float32.
# NOTE(review): 3362 == 2 * 41 * 41, which would be consistent with two
# flattened 41x41 pressure fields per sequence -- TODO confirm in functions.py.
slp_train = generate_X_train_test(X_train)

slp_train

array([[100849.72, 100797.72, 100754.72, ..., 101970.81, 101981.81,
        101990.81],
       [100940.37, 100881.37, 100823.37, ..., 101515.72, 101562.72,
        101611.72],
       [101862.91, 101879.91, 101894.91, ..., 101005.37, 101038.37,
        101076.37],
       ...,
       [102048.13, 102080.13, 102106.13, ..., 100781.81, 100724.81,
        100664.81],
       [102111.95, 102134.95, 102149.95, ..., 101352.13, 101376.13,
        101392.13],
       [101972.53, 101984.53, 101993.53, ..., 100412.28, 100450.28,
        100483.28]], dtype=float32)

##### Preparing X_test for the submission

In [13]:
# Apply the same feature-building helper to the test archive (used for the
# submission), then show the test sequence ids.
# NOTE(review): this prints the whole id array; a slice such as [:10] would
# keep the output short.
slp_test = generate_X_train_test(X_test)
X_test['id_sequence']

array([5600, 5601, 5602, 5603, 5604, 5605, 5606, 5607, 5608, 5609, 5610,
       5611, 5612, 5613, 5614, 5615, 5616, 5617, 5618, 5619, 5620, 5621,
       5622, 5623, 5624, 5625, 5626, 5627, 5628, 5629, 5630, 5631, 5632,
       5633, 5634, 5635, 5636, 5637, 5638, 5639, 5640, 5641, 5642, 5643,
       5644, 5645, 5646, 5647, 5648, 5649, 5650, 5651, 5652, 5653, 5654,
       5655, 5656, 5657, 5658, 5659, 5660, 5661, 5662, 5663, 5664, 5665,
       5666, 5667, 5668, 5669, 5670, 5671, 5672, 5673, 5674, 5675, 5676,
       5677, 5678, 5679, 5680, 5681, 5682, 5683, 5684, 5685, 5686, 5687,
       5688, 5689, 5690, 5691, 5692, 5693, 5694, 5695, 5696, 5697, 5698,
       5699, 5700, 5701, 5702, 5703, 5704, 5705, 5706, 5707, 5708, 5709,
       5710, 5711, 5712, 5713, 5714, 5715, 5716, 5717, 5718, 5719, 5720,
       5721, 5722, 5723, 5724, 5725, 5726, 5727, 5728, 5729, 5730, 5731,
       5732, 5733, 5734, 5735, 5736, 5737, 5738, 5739, 5740, 5741, 5742,
       5743, 5744, 5745, 5746, 5747, 5748, 5749, 57

#### Building the target matrix y (surge_train) from Y_train

In [14]:
# Build the target matrix with the project helper. This rebinds `surge_train`,
# first assigned in cell [7]; the recorded output matches that earlier slice
# (all Y_train columns except id_sequence), shape (5599, 20).
surge_train = generate_Y_train(Y_train)
surge_train

array([[ 0.58693592,  1.06958024,  0.76792754, ..., -0.42270688,
        -0.45623606, -0.82505705],
       [ 0.76792754, -0.1001619 ,  0.07077463, ..., -0.82505705,
        -0.99270295, -0.99270295],
       [ 0.07077463, -0.24428486, -0.35489084, ..., -0.99270295,
        -0.32211934, -0.88373311],
       ...,
       [ 0.48303332,  0.53330877,  1.44832196, ...,  1.22022298,
         1.44654495,  2.67036005],
       [ 1.44832196,  1.82036029,  1.66283054, ...,  2.67036005,
         2.62006628,  2.67874235],
       [ 1.66283054,  1.52876268,  1.28408882, ...,  2.67874235,
         1.22022298,  0.72566756]])

In [15]:
# Wrap the target matrix in a DataFrame. The recorded output has the 20
# columns surge1_t0..surge1_t9 and surge2_t0..surge2_t9.
# NOTE(review): the row labels start at 1, presumably taken from
# X_train['id_sequence'] -- confirm in Y_train_to_dataframe.
surge_train_df = Y_train_to_dataframe(surge_train, X_train)
surge_train_df

Unnamed: 0,surge1_t0,surge1_t1,surge1_t2,surge1_t3,surge1_t4,surge1_t5,surge1_t6,surge1_t7,surge1_t8,surge1_t9,surge2_t0,surge2_t1,surge2_t2,surge2_t3,surge2_t4,surge2_t5,surge2_t6,surge2_t7,surge2_t8,surge2_t9
1,0.586936,1.069580,0.767928,-0.100162,0.070775,-0.244285,-0.354891,-0.928031,-0.773853,-0.375001,-0.053886,0.356847,0.348464,0.264641,0.901696,0.449052,0.113760,-0.422707,-0.456236,-0.825057
2,0.767928,-0.100162,0.070775,-0.244285,-0.354891,-0.928031,-0.773853,-0.375001,-0.361594,-0.210768,0.348464,0.264641,0.901696,0.449052,0.113760,-0.422707,-0.456236,-0.825057,-0.992703,-0.992703
3,0.070775,-0.244285,-0.354891,-0.928031,-0.773853,-0.375001,-0.361594,-0.210768,0.288635,-0.726929,0.901696,0.449052,0.113760,-0.422707,-0.456236,-0.825057,-0.992703,-0.992703,-0.322119,-0.883733
4,-0.354891,-0.928031,-0.773853,-0.375001,-0.361594,-0.210768,0.288635,-0.726929,-0.576103,1.160076,0.113760,-0.422707,-0.456236,-0.825057,-0.992703,-0.992703,-0.322119,-0.883733,-0.473001,-0.422707
5,-0.773853,-0.375001,-0.361594,-0.210768,0.288635,-0.726929,-0.576103,1.160076,0.442813,0.305393,-0.456236,-0.825057,-0.992703,-0.992703,-0.322119,-0.883733,-0.473001,-0.422707,-0.473001,-0.531677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5595,1.451674,1.220407,0.556771,1.709754,0.483033,0.533309,1.448322,1.820360,1.662831,1.528763,0.759197,0.884931,0.298171,-0.397560,-0.540059,-0.498148,-0.498148,-0.405942,0.164054,0.331700
5596,0.556771,1.709754,0.483033,0.533309,1.448322,1.820360,1.662831,1.528763,1.284089,0.070775,0.298171,-0.397560,-0.540059,-0.498148,-0.498148,-0.405942,0.164054,0.331700,1.052577,1.220223
5597,0.483033,0.533309,1.448322,1.820360,1.662831,1.528763,1.284089,0.070775,0.888589,1.545521,-0.540059,-0.498148,-0.498148,-0.405942,0.164054,0.331700,1.052577,1.220223,1.446545,2.670360
5598,1.448322,1.820360,1.662831,1.528763,1.284089,0.070775,0.888589,1.545521,1.166779,0.653970,-0.498148,-0.405942,0.164054,0.331700,1.052577,1.220223,1.446545,2.670360,2.620066,2.678742


In [16]:
slp_train.shape

(5599, 3362)

In [17]:
# Wrap the feature matrix in a DataFrame so it can be split together with
# surge_train_df. Later recorded output shows integer column labels 0..3361.
slp_train_df = slp_train_to_dataframe(slp_train)

#### Splitting X_train and Y_train into training and validation sets

In [18]:
# Random 70 % / 30 % split of the labelled data into training and validation
# parts (recorded shapes: 3919 and 1680 rows), reproducible through `seed`.
# NOTE(review): consecutive rows of surge_train_df share values (each row looks
# like the previous one shifted by two steps), so a shuffled split may put
# overlapping windows on both sides. Consider a chronological split
# (shuffle=False) -- to be confirmed against the challenge's evaluation setup.
X_train_surge, X_val_surge, y_train_surge, y_val_surge = train_test_split(
    slp_train_df,
    surge_train_df,
    test_size=0.3,
    random_state=seed,
)

In [19]:
slp_train.shape

(5599, 3362)

In [20]:
surge_train.shape

(5599, 20)

In [21]:
X_train_surge.shape

(3919, 3362)

In [22]:
y_val_surge.shape

(1680, 20)

In [23]:
X_train_surge

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3352,3353,3354,3355,3356,3357,3358,3359,3360,3361
5568,102176.351562,102156.351562,102133.351562,102105.351562,102071.351562,102031.351562,101986.351562,101936.351562,101880.351562,101821.351562,...,101417.007812,101455.007812,101493.007812,101527.007812,101556.007812,101585.007812,101613.007812,101640.007812,101663.007812,101685.007812
2836,101805.570312,101868.570312,101921.570312,101968.570312,102012.570312,102051.570312,102078.570312,102093.570312,102096.570312,102090.570312,...,100549.203125,100531.203125,100512.203125,100486.203125,100454.203125,100419.203125,100385.203125,100350.203125,100314.203125,100281.203125
83,102114.101562,102082.101562,102049.101562,102013.101562,101976.101562,101937.101562,101895.101562,101850.101562,101804.101562,101754.101562,...,101463.242188,101429.242188,101394.242188,101357.242188,101318.242188,101282.242188,101249.242188,101215.242188,101180.242188,101145.242188
3658,102364.898438,102332.898438,102299.898438,102262.898438,102223.898438,102182.898438,102138.898438,102091.898438,102040.898438,101987.898438,...,101510.390625,101544.390625,101572.390625,101592.390625,101603.390625,101609.390625,101611.390625,101608.390625,101600.390625,101590.390625
790,102082.367188,102077.367188,102069.367188,102056.367188,102041.367188,102023.367188,102003.367188,101983.367188,101960.367188,101936.367188,...,102172.421875,102084.421875,101991.421875,101886.421875,101771.421875,101650.421875,101528.421875,101401.421875,101272.421875,101145.421875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,102028.187500,102006.187500,101988.187500,101972.187500,101957.187500,101943.187500,101928.187500,101911.187500,101891.187500,101868.187500,...,101873.476562,101871.476562,101870.476562,101867.476562,101863.476562,101859.476562,101854.476562,101850.476562,101846.476562,101839.476562
5191,101813.210938,101799.210938,101784.210938,101765.210938,101740.210938,101710.210938,101673.210938,101629.210938,101579.210938,101526.210938,...,100480.117188,100622.117188,100769.117188,100912.117188,101051.117188,101187.117188,101319.117188,101446.117188,101563.117188,101670.117188
5226,101786.117188,101687.117188,101576.117188,101462.117188,101375.117188,101357.117188,101414.117188,101496.117188,101565.117188,101606.117188,...,99715.796875,99775.796875,99838.796875,99908.796875,99983.796875,100064.796875,100150.796875,100236.796875,100318.796875,100393.796875
5390,102499.406250,102499.406250,102497.406250,102493.406250,102486.406250,102475.406250,102459.406250,102439.406250,102415.406250,102386.406250,...,100350.492188,100351.492188,100362.492188,100378.492188,100398.492188,100422.492188,100448.492188,100472.492188,100492.492188,100510.492188


In [24]:
y_val_surge

Unnamed: 0,surge1_t0,surge1_t1,surge1_t2,surge1_t3,surge1_t4,surge1_t5,surge1_t6,surge1_t7,surge1_t8,surge1_t9,surge2_t0,surge2_t1,surge2_t2,surge2_t3,surge2_t4,surge2_t5,surge2_t6,surge2_t7,surge2_t8,surge2_t9
4441,-0.167196,-0.234230,-0.110217,-0.029776,0.047313,-0.046535,-0.354891,-0.405166,-0.314670,-0.690061,-0.104180,-0.162856,-0.104180,-0.548441,-0.783146,-0.481383,-0.774763,-0.422707,-0.296972,0.398758
2472,0.238359,2.105255,0.888589,1.062877,1.103097,-1.638591,-1.145891,-0.321374,-0.579455,-0.666599,-1.336377,-0.967556,-0.892115,-0.011974,-0.095797,0.105378,-0.095797,-0.162856,0.113760,0.331700
3317,-0.096810,-0.100162,-0.103514,0.265173,-0.244285,-0.321374,-0.552641,-0.579455,-0.646488,-0.110217,1.002283,0.960372,0.893313,0.943607,0.700521,0.457434,0.365229,0.608315,0.289788,0.390376
81,1.331013,0.553419,1.220407,0.637211,-0.013018,-0.150437,0.141160,0.198139,0.060720,-0.371649,-0.925645,-1.403435,-1.252554,-1.420200,-1.026232,-0.833439,-0.045504,-0.028739,-0.347266,-0.975938
4806,1.776788,3.053785,3.516319,2.557734,0.922106,2.068386,1.713106,1.153373,2.406907,1.947725,0.901696,1.236988,0.717285,0.650227,0.708903,0.356847,0.474199,0.222730,0.247877,0.055084
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1392,-1.440841,-1.135836,-0.760446,-0.747039,-0.271098,-0.499014,-0.539234,-0.696764,-0.874404,-0.901217,1.295664,1.589044,1.228605,0.273024,-0.305355,-0.355649,-0.716087,-1.051379,-0.489765,-0.188003
4351,-0.116920,-0.328077,-0.371649,-0.455442,-0.455442,-0.499014,-0.143734,-0.009666,-0.096810,-0.485607,-0.364031,-0.322119,-0.489765,-0.791528,-0.556824,-0.690940,-0.422707,-0.674176,-0.196385,-0.313737
221,-1.420730,-1.561502,-1.702273,-1.826286,-1.779362,-1.283311,-1.249794,-0.921328,-0.991713,-0.777205,-0.380795,0.843020,-0.028739,-0.120944,-0.540059,-0.313737,-1.219025,-1.235790,-1.587846,-1.638140
1746,-0.163844,-0.458793,-0.006314,-0.090107,-0.250988,-0.368298,-0.214120,0.208194,0.074126,-0.056590,0.423905,0.524492,0.750814,0.532875,1.060959,0.641845,0.708903,0.775961,0.616698,0.474199


In [25]:
pd.DataFrame(X_train["id_sequence"])

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5
...,...
5594,5595
5595,5596
5596,5597
5597,5598


In [26]:
# Y_train is already a DataFrame (returned by pd.read_csv); display it directly
Y_train

Unnamed: 0,id_sequence,surge1_t0,surge1_t1,surge1_t2,surge1_t3,surge1_t4,surge1_t5,surge1_t6,surge1_t7,surge1_t8,...,surge2_t0,surge2_t1,surge2_t2,surge2_t3,surge2_t4,surge2_t5,surge2_t6,surge2_t7,surge2_t8,surge2_t9
0,1,0.586936,1.069580,0.767928,-0.100162,0.070775,-0.244285,-0.354891,-0.928031,-0.773853,...,-0.053886,0.356847,0.348464,0.264641,0.901696,0.449052,0.113760,-0.422707,-0.456236,-0.825057
1,2,0.767928,-0.100162,0.070775,-0.244285,-0.354891,-0.928031,-0.773853,-0.375001,-0.361594,...,0.348464,0.264641,0.901696,0.449052,0.113760,-0.422707,-0.456236,-0.825057,-0.992703,-0.992703
2,3,0.070775,-0.244285,-0.354891,-0.928031,-0.773853,-0.375001,-0.361594,-0.210768,0.288635,...,0.901696,0.449052,0.113760,-0.422707,-0.456236,-0.825057,-0.992703,-0.992703,-0.322119,-0.883733
3,4,-0.354891,-0.928031,-0.773853,-0.375001,-0.361594,-0.210768,0.288635,-0.726929,-0.576103,...,0.113760,-0.422707,-0.456236,-0.825057,-0.992703,-0.992703,-0.322119,-0.883733,-0.473001,-0.422707
4,5,-0.773853,-0.375001,-0.361594,-0.210768,0.288635,-0.726929,-0.576103,1.160076,0.442813,...,-0.456236,-0.825057,-0.992703,-0.992703,-0.322119,-0.883733,-0.473001,-0.422707,-0.473001,-0.531677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5594,5595,1.451674,1.220407,0.556771,1.709754,0.483033,0.533309,1.448322,1.820360,1.662831,...,0.759197,0.884931,0.298171,-0.397560,-0.540059,-0.498148,-0.498148,-0.405942,0.164054,0.331700
5595,5596,0.556771,1.709754,0.483033,0.533309,1.448322,1.820360,1.662831,1.528763,1.284089,...,0.298171,-0.397560,-0.540059,-0.498148,-0.498148,-0.405942,0.164054,0.331700,1.052577,1.220223
5596,5597,0.483033,0.533309,1.448322,1.820360,1.662831,1.528763,1.284089,0.070775,0.888589,...,-0.540059,-0.498148,-0.498148,-0.405942,0.164054,0.331700,1.052577,1.220223,1.446545,2.670360
5597,5598,1.448322,1.820360,1.662831,1.528763,1.284089,0.070775,0.888589,1.545521,1.166779,...,-0.498148,-0.405942,0.164054,0.331700,1.052577,1.220223,1.446545,2.670360,2.620066,2.678742


In [27]:
X_train_surge.shape

(3919, 3362)

In [28]:
y_train_surge.shape

(3919, 20)

#### Saving the validation targets

In [29]:
# Write the validation targets to disk; the frame's index is written under
# the column name "id_sequence".
# to_csv does not create missing folders, so make sure the output directory
# exists first (no-op when it is already there).
import os
os.makedirs(output_path, exist_ok=True)
y_val_surge.to_csv(output_path + "Y_val_true.csv", index_label='id_sequence', sep=',')

In [31]:
y_val_surge.shape

(1680, 20)

###  End of notebook

In [30]:
# Stop the clock started in the configuration cell and report the elapsed
# wall-clock time, first in seconds and then as HH:MM:SS.
t1 = time()
elapsed = t1 - t0
print("computing time : {:8.6f} sec".format(elapsed))
print("computing time : " + strftime('%H:%M:%S', gmtime(elapsed)))

computing time : 6.063065 sec
computing time : 00:00:06
