In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras import backend
from scipy import stats

In [None]:
# Seed NumPy's global RNG so the np.random.shuffle call on the training windows is repeatable.
# NOTE(review): only NumPy is seeded here; the Keras/TensorFlow RNG used for weight
# initialisation is not seeded anywhere in this notebook — confirm whether that matters.
np.random.seed(42)

# Prepare data

In [None]:
# Load the two Excel workbooks. Despite the names, both frames are concatenated
# and re-split later, so "train"/"test" here only identify the source file.
# NOTE(review): absolute Kaggle input paths — the notebook only runs where this dataset is mounted.
train_data = pd.read_excel('/kaggle/input/dataset1/data_match9.xlsx', engine='openpyxl')
test_data = pd.read_excel('/kaggle/input/dataset1/data_match10.xlsx', engine='openpyxl')

In [None]:
# Stack both workbooks into one frame with a fresh 0..n-1 index; check() looks rows up
# by integer label, so labels must match row positions.
data = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# Keep only the columns used downstream. Column order matters: once 'datetime' is dropped
# in select(), IMERG is column 0 and value is column 1 of every window.
data = data[["IMERG", "datetime", "value", "CAPE", "TCWV", "TCC", "TCW", "IRB", "B10B", "B16B", "B11B", "I2B", "WVB", "B09B"]]

In [None]:
def process(dataset):
  """Impute missing values and derive the band-difference features.

  Steps, in order:
    1. NaNs are replaced by the per-column mean of ``dataset``.
    2. ``datetime`` is replaced by its hour component.
    3. Five difference columns T1..T5 are appended.
    4. The source columns B10B, B16B, B11B, I2B, WVB and B09B are removed
       (IRB is kept).

  The input frame is not modified; a new DataFrame is returned.
  """
  filled = dataset.fillna(dataset.mean())
  engineered = filled.assign(
      datetime=filled['datetime'].dt.hour,
      T1=filled['B10B'] - filled['B16B'],
      T2=filled['B11B'] - filled['IRB'],
      T3=filled['IRB'] - filled['I2B'],
      T4=filled['WVB'] - filled['B09B'],
      T5=filled['B09B'] - filled['B10B'],
  )
  consumed_bands = ['B10B', 'B16B', 'B11B', 'I2B', 'WVB', 'B09B']
  return engineered.drop(columns=consumed_bands)

In [None]:
# Impute, convert datetime to hour-of-day and build the T1..T5 difference features.
data = process(data)

In [None]:
# Rows per sliding window: all rows but the last are model input, the last row supplies the target.
time_steps = 6

In [None]:
def check(start, arr):
  """Return 1 if the hours in ``arr`` form an unbroken hourly run, else 0.

  Parameters
  ----------
  start : int
      Index label of the first row of ``arr``. Rows are looked up by the
      labels ``start, start + 1, ..., start + len(arr) - 1``.
  arr : pandas.DataFrame
      Window whose ``'datetime'`` column holds hour-of-day values.

  Returns
  -------
  int
      1 when every pair of neighbouring rows is exactly one hour apart
      (forwards or backwards), 0 otherwise.

  The hour difference is compared modulo 24 so that a step across midnight
  (23 -> 0, or 0 -> 23 going backwards) counts as adjacent; a plain
  subtraction would see -23 / +23 and reject the window.
  """
  hours = arr['datetime']
  previous = hours.loc[start]
  for label in range(start + 1, start + len(arr)):
    current = hours.loc[label]
    # +1 hour -> 1, -1 hour -> 23 once wrapped onto the 24-hour clock.
    step = (current - previous) % 24
    if step != 1 and step != 23:
      return 0
    previous = current
  return 1

In [None]:
def select(dataset):
  """Collect every ``time_steps``-row window whose hours are consecutive.

  Slides a window of ``time_steps`` rows (module-level constant) over
  ``dataset`` one row at a time. A window is kept when ``check`` returns 1
  for it; kept windows have their ``'datetime'`` column removed and are
  converted to NumPy arrays.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Frame with a ``'datetime'`` column of hour values. Its index labels
      must equal the row positions, because ``check`` looks rows up by label.

  Returns
  -------
  list of numpy.ndarray
      One ``(time_steps, n_columns - 1)`` array per accepted window; empty
      when ``dataset`` has fewer than ``time_steps`` rows.
  """
  result = []
  # "+ 1" so the window that ends on the very last row is also considered.
  for start in range(len(dataset) - time_steps + 1):
    window = dataset.iloc[start:start + time_steps]
    if check(start, window) == 1:
      result.append(window.drop(columns='datetime').to_numpy())
  return result

In [None]:
# Replace the frame with the list of accepted windows (one 2-D array per window).
data = select(data)

In [None]:
# Stack the windows into a single array; indexed below as [window, row, column].
data = np.array(data)

In [None]:
# First 80% of the windows (in their current order) go to training, the rest to test.
# NOTE(review): windows are built with stride 1, so windows on either side of the
# cut share rows — confirm this overlap is acceptable for evaluation.
train_size = int(len(data) * 0.8)
train = data[:train_size, :]
test = data[train_size:, :]

In [None]:
# Shuffle the training windows in place along the first axis (uses the NumPy RNG seeded above).
# train is a basic slice of data, so the leading part of data is reordered as well.
np.random.shuffle(train)

In [None]:
# Split every training window into model input, target and IMERG baseline.
# Column 0 of a window is IMERG and column 1 is the label ("value").
# The last row supplies the target/baseline and all earlier rows form the
# input, so this stays correct if time_steps is changed (no hard-coded 5).
X_train = []
Y_train = []
IMERG_train = []
for window in train:
  target_row = window[-1]
  Y_train.append(target_row[1])
  IMERG_train.append(target_row[0])
  # Drop the IMERG column from the features, then keep all rows but the last.
  X_train.append(np.delete(window, 0, 1)[:-1])

In [None]:
# Convert the collected lists to arrays.
# Y_train and IMERG_train are deliberately kept 1-D (n_samples,): ndarray.reshape
# returns a new array rather than modifying in place, so calling it without
# assignment has no effect, and the metric/plot cells (stats.pearsonr,
# np.polyfit) need 1-D inputs anyway.
X_train = np.array(X_train)
Y_train = np.array(Y_train)
IMERG_train = np.array(IMERG_train)

In [None]:
# Sanity check: X_train's trailing dimensions must match the model input shape.
print(X_train.shape, Y_train.shape, IMERG_train.shape)

In [None]:
# Split every test window into model input, target and IMERG baseline.
# Column 0 of a window is IMERG and column 1 is the label ("value").
# The last row supplies the target/baseline and all earlier rows form the
# input, so this stays correct if time_steps is changed (no hard-coded 5).
X_test = []
Y_test = []
IMERG_test = []
for window in test:
  target_row = window[-1]
  Y_test.append(target_row[1])
  IMERG_test.append(target_row[0])
  # Drop the IMERG column from the features, then keep all rows but the last.
  X_test.append(np.delete(window, 0, 1)[:-1])

In [None]:
# Convert the collected lists to arrays.
# Y_test and IMERG_test are deliberately kept 1-D (n_samples,): ndarray.reshape
# returns a new array rather than modifying in place, so calling it without
# assignment has no effect, and the metric/plot cells (stats.pearsonr,
# np.polyfit) need 1-D inputs anyway.
X_test = np.array(X_test)
Y_test = np.array(Y_test)
IMERG_test = np.array(IMERG_test)

# Model

In [None]:
from keras.layers import (
    Input,
    Dense,
    LSTM,
    GlobalAveragePooling1D,
    AveragePooling1D,
    TimeDistributed,
    Flatten,
    Bidirectional,
    Dropout,
    Masking,
    Layer,
    BatchNormalization,
    CuDNNLSTM
)
from keras.models import Model
from keras.optimizers import Adam,Nadam
import tensorflow as tf
from keras.optimizers import SGD

In [None]:
def get_model_deep(shape=(5, 11)):
    """Build the (uncompiled) bidirectional-LSTM regression network.

    Architecture: Input(shape) -> Bidirectional(CuDNNLSTM(64), sequences kept)
    -> per-timestep Dense(10) -> Flatten -> Dense(1).

    Parameters
    ----------
    shape : tuple
        Input shape passed to ``Input``; the default is ``(5, 11)``.

    Returns
    -------
    keras.models.Model
        Model mapping the input tensor to a single output unit.
    """
    # NOTE(review): CuDNNLSTM needs a GPU and is not exported by keras.layers in
    # newer Keras releases — confirm the pinned Keras version.
    inputs = Input(shape)
    sequence = Bidirectional(CuDNNLSTM(64, return_sequences=True))(inputs)
    per_step = TimeDistributed(Dense(10))(sequence)
    flattened = Flatten()(per_step)
    output = Dense(1)(flattened)
    return Model(inputs, output)

In [None]:
# Build the network for inputs of shape (5, 11), then compile it
# with the Adam optimizer and mean-squared-error loss.
model = get_model_deep((5,11))
model.compile(optimizer='adam', loss='mse')
model.summary()

In [None]:
# Train for 50 epochs with batches of 64, holding out 14% of the training windows for validation.
# history.history['loss'] / ['val_loss'] are plotted in the Result section.
history = model.fit(X_train, Y_train, epochs=50, batch_size=64, validation_split=0.14)

# Result

**Plot training and validation loss**

In [None]:
import matplotlib.pyplot as plt
# Training vs. validation loss per epoch, drawn on an explicit Axes.
fig, ax = plt.subplots()
for curve_key in ('loss', 'val_loss'):
    ax.plot(history.history[curve_key])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
import math
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy import stats

In [None]:
def compute_metric(name, Y_true, Y_predict):
    """Print R2, RMSE, MAE and Pearson r for one set of single-target predictions.

    Parameters
    ----------
    name : str
        Prefix for every printed line (e.g. ``'Train'``).
    Y_true : array-like
        Reference values, shape ``(n,)`` or ``(n, 1)``.
    Y_predict : array-like
        Predicted values, shape ``(n,)`` or ``(n, 1)``.

    Both inputs are flattened to 1-D first: ``model.predict`` returns
    ``(n, 1)`` while the labels are ``(n,)``, and ``stats.pearsonr`` expects
    two 1-D sequences of equal length.
    """
    y_true = np.ravel(Y_true)
    y_pred = np.ravel(Y_predict)
    print(f'{name}-R2:', r2_score(y_true, y_pred))
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` argument is deprecated/removed in recent scikit-learn releases.
    print(f'{name}-RMSE:', np.sqrt(mean_squared_error(y_true, y_pred)))
    print(f'{name}-MAE:', mean_absolute_error(y_true, y_pred))
    print(f'{name}-Pearson r:', stats.pearsonr(y_true, y_pred)[0])

In [None]:
# Model output for the training windows; reused by the metric and scatter-plot cells.
predict_train = model.predict(X_train)

In [None]:
# Model output for the held-out test windows.
predict_test = model.predict(X_test)

**Compute metrics for the model output on the training set**

In [None]:
# Model predictions vs. labels on the training windows.
compute_metric('Train', Y_train, predict_train)

**Compute metrics for the IMERG baseline on the training set**

In [None]:
# Baseline: raw IMERG values vs. labels on the training windows.
# NOTE(review): printed with the same 'Train' prefix as the model metrics above.
compute_metric('Train', Y_train, IMERG_train)

**Compute metrics for the model output on the test set**

In [None]:
# Model predictions vs. labels on the test windows.
compute_metric('Test', Y_test, predict_test)

**Compute metrics for the IMERG baseline on the test set**

In [None]:
# Baseline: raw IMERG values vs. labels on the test windows.
# NOTE(review): printed with the same 'Test' prefix as the model metrics above.
compute_metric('Test', Y_test, IMERG_test)

**Compare training predictions with the ground truth**

In [None]:
# Scatter of training predictions against labels with a least-squares line.
plt.ylabel('Predicted')
plt.xlabel('Label')
plt.title('Train Scatter Plot')
plt.plot(Y_train, predict_train, 'o')
# Fit against the predictions already computed in predict_train (the same
# points drawn above) rather than running the model a second time; ravel
# turns the (n, 1) output into 1-D so a and b are plain scalars.
a, b = np.polyfit(Y_train, np.ravel(predict_train), 1)
plt.plot(Y_train, a * Y_train + b, 'r')
plt.show()

**Compare test predictions with the ground truth**

In [None]:
# Scatter of test predictions against labels, plus the degree-1 least-squares fit.
fig, ax = plt.subplots()
ax.plot(Y_test, predict_test, 'o')
a, b = np.polyfit(Y_test, predict_test, 1)
ax.plot(Y_test, a * Y_test + b, 'r')
ax.set_title('Test Scatter Plot')
ax.set_xlabel('Label')
ax.set_ylabel('Predicted')
plt.show()

**Compare Y_train with IMERG**

In [None]:
# Scatter of the IMERG baseline against training labels, plus the degree-1 least-squares fit.
fig, ax = plt.subplots()
ax.plot(Y_train, IMERG_train, 'o')
a, b = np.polyfit(Y_train, IMERG_train, 1)
ax.plot(Y_train, a * Y_train + b, 'r')
ax.set_title('Train Scatter Plot')
ax.set_xlabel('Label')
ax.set_ylabel('IMERG')
plt.show()

**Compare Y_test with IMERG**

In [None]:
# Scatter of the IMERG baseline against test labels, plus the degree-1 least-squares fit.
fig, ax = plt.subplots()
ax.plot(Y_test, IMERG_test, 'o')
a, b = np.polyfit(Y_test, IMERG_test, 1)
ax.plot(Y_test, a * Y_test + b, 'r')
ax.set_title('Test Scatter Plot')
ax.set_xlabel('Label')
ax.set_ylabel('IMERG')
plt.show()