In [37]:
# importing libraries
import numpy as np  # For numerical computation
import pandas as pd  # Data manipulation
import seaborn as sns  # plotting
import scipy.io  # reading matlab files in python
from scipy import signal  #signal processing
from scipy.fftpack import fft, dct  #signal processing

from sklearn.linear_model import LinearRegression  #linear regression model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, train_test_split  # cross validation split
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings('ignore')

from matplotlib import pyplot as plt  # For plotting graphs(Visualization)

import os  # system-wide functions

os.listdir('./kaggle_data')

['part_1.mat',
 'part_10.mat',
 'part_11.mat',
 'part_12.mat',
 'part_2.mat',
 'part_3.mat',
 'part_4.mat',
 'part_5.mat',
 'part_6.mat',
 'part_7.mat',
 'part_8.mat',
 'part_9.mat',
 'Samples']

### Data Preparation

In [38]:
# defining our evaluation error function
def rmse(y_true, y_pred):
    """Computes the Root Mean Squared Error (RMSE)."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [39]:
sample_file = scipy.io.loadmat(f'./kaggle_data/part_{1}.mat')
print(f'sample_file Data type: {type(sample_file)}')
print(f'sample_file keys:\n{sample_file.keys()}')


sample_file Data type: <class 'dict'>
sample_file keys:
dict_keys(['__header__', '__version__', '__globals__', 'p'])


In [40]:
# Loading a sample .mat file to understand the data dimensions
test_sample = scipy.io.loadmat(f'./kaggle_data/part_{1}.mat')['p']
print(f'test_sample Data type: {type(test_sample)}')
print(f'test_sample shape/dimensions: {test_sample.shape}')

test_sample Data type: <class 'numpy.ndarray'>
test_sample shape/dimensions: (1, 1000)


In [41]:
print(f"Total Samples: {len(test_sample[0])}")
print(f"Number of readings in each sample(column): {len(test_sample[0][0])}")
print(f"Number of samples in each reading(ECG): {len(test_sample[0][0][2])}")

temp_mat = test_sample[0, 999]
temp_length = temp_mat.shape[1]
sample_size = 125

print(temp_length)
print((int)(temp_length / sample_size))


Total Samples: 1000
Number of readings in each sample(column): 3
Number of samples in each reading(ECG): 61000
9000
72


In [42]:
sample_size = 125
ppg = []
for i in range(1000):
    temp_mat = test_sample[0, i]
    temp_length = temp_mat.shape[1]
    for j in range((int)(temp_length / sample_size)):
        temp_ppg = temp_mat[0, j * sample_size:(j + 1) * sample_size]
        ppg.append(temp_ppg)


In [43]:
ecg = []
bp = []
sbp = []  #Systolic Blood Pressure
dbp = []  #Diastolic Blood Pressue
size = 125  #sample size

for i in range(1000):
    temp_mat = test_sample[0, i]
    temp_length = temp_mat.shape[1]
    for j in range((int)(temp_length / sample_size)):
        temp_ecg = temp_mat[2, j * size:(j + 1) * size]
        temp_bp = temp_mat[1, j * size:(j + 1) * size]

        max_value = max(temp_bp)
        min_value = min(temp_bp)

        sbp.append(max_value)
        dbp.append(min_value)
        ecg.append(temp_ecg)
        bp.append(temp_bp)


In [44]:
# Reshaping the ecg, ppg and bp signal data into column vectors
ppg, ecg, bp = np.array(ppg).reshape(-1,1), np.array(ecg).reshape(-1,1), np.array(bp).reshape(-1,1)
sbp, dbp = np.array(sbp).reshape(-1,1), np.array(dbp).reshape(-1,1)
print(f'PPG_shape: {ppg.shape}\n ECG_shape: {ecg.shape}\n BP_shape: {bp.shape}')
print(f'Systolic-BP_shape: {sbp.shape},\n Diastolic-BP_shape: {dbp.shape}')


PPG_shape: (32061000, 1)
 ECG_shape: (32061000, 1)
 BP_shape: (32061000, 1)
Systolic-BP_shape: (256488, 1),
 Diastolic-BP_shape: (256488, 1)


## Pycaret Time Series

In [45]:
data = pd.DataFrame(np.concatenate((ppg, bp), axis=1))
data.columns = ['PPG', 'BP']

In [46]:
data

Unnamed: 0,PPG,BP
0,1.759531,67.062955
1,1.718475,69.358628
2,1.684262,75.366453
3,1.657869,85.037586
4,1.637341,96.222885
...,...,...
32060995,1.413490,123.135987
32060996,1.400782,120.889158
32060997,1.386119,118.495797
32060998,1.368524,116.053592


In [47]:
# split data into train-test set of 70 and 30 percent respectively
train = data[data.index < 1000]
test = data[data.index >= 32060900]

# check shape
train.shape, test.shape

((1000, 2), (100, 2))

In [48]:
import pycaret
import sklearn
import pandas as pd
import numpy as np

print(pycaret.__version__)
print(sklearn.__version__)

3.0.0.rc3
1.1.2


In [49]:
from pycaret.time_series import TSForecastingExperiment
exp = TSForecastingExperiment()


In [50]:
# Global Figure Settings for notebook ----
global_fig_settings = {"renderer": "notebook", "width": 1000, "height": 600}

### Compare Models

In [51]:
exp.setup(data=train,
          target='BP',
          seasonal_period=1,
          use_gpu=True,
          session_id=42)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,BP
2,Approach,Univariate
3,Exogenous Variables,Present
4,Original data shape,"(1000, 2)"
5,Transformed data shape,"(1000, 2)"
6,Transformed train set shape,"(999, 2)"
7,Transformed test set shape,"(1, 2)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter


<pycaret.time_series.forecasting.oop.TSForecastingExperiment at 0x1da43826280>

In [52]:
best = exp.compare_models()


In [53]:
# Save the results from the experiment
try:
    best.to_csv('best.csv')
except:
    try:
        with open('best.csv', 'w') as f:
            f.write(best)
    except:
        print('Failed to save the results')


Failed to save the results


In [55]:
print(best)

[]


In [56]:
exp.plot_model(plot = 'auc', save=True)


TypeError: 'NoneType' object is not subscriptable

In [None]:
exp.plot_model(plot = 'confusion_matrix', save=True)

In [None]:
exp.plot_model(plot='vc', save=True)

In [None]:
# interpret model
exp.interpret_model(plot = 'correlation', save=True)

In [None]:
# Predict on test / hold-out Sample
predict_holdout = exp.predict_model(best, data=test)

# Save the predictions in a csv file
predict_holdout.to_csv('predictions.csv', index=False)

In [None]:
# Save the model with name from exp.compare_models() top row
exp.save_model(best, 'best_model')

### AutoML for Time Series

In [None]:
# setup the experiment
FH = 48
metric = "mase"
exclude = ["auto_arima", "bats", "tbats", "lar_cds_dt", "par_cds_dt"]


In [None]:
exp_auto = TSForecastingExperiment()

# enforce_exogenous=False --> Use multivariate forecasting when model supports it, else use univariate forecasting
exp_auto.setup(data=train,
               target='BP',
               fh=FH,
               seasonal_period=125,
               use_gpu=True,
               enforce_exogenous=False,
               numeric_imputation_target="ffill",
               numeric_imputation_exogenous="ffill",
               fig_kwargs=global_fig_settings,
               session_id=42)


In [None]:
# Include slower models like Prophet (turbo=False), but exclude some specific models ----
best_auto = exp_auto.compare_models(sort=metric, turbo=False, exclude=exclude)

In [None]:
# Save the results from the experiment
try:
    best_auto.to_csv('best.csv')
except:
    try:
        with open('best.csv', 'w') as f:
            f.write(best_auto)
    except:
        print('Failed to save the results')

In [None]:
final_auto_model = exp_auto.finalize_model(best_auto)


In [None]:
def safe_predict(exp, model):
    """Prediction wrapper for demo purposes."""
    try:
        future_preds = exp.predict_model(model)
    except ValueError as exception:
        print(exception)
        exog_vars = exp.exogenous_variables
        print(
            f"{len(exog_vars)} exogenous variables (X) needed in order to make future predictions:\n{exog_vars}"
        )

        exog_exps = []
        exog_models = []
        for exog_var in exog_vars:
            exog_exp = TSForecastingExperiment()
            exog_exp.setup(data=data[exog_var],
                           fh=FH,
                           numeric_imputation_target="ffill",
                           numeric_imputation_exogenous="ffill",
                           fig_kwargs=global_fig_settings,
                           session_id=42)

            # Users can customize how to model future exogenous variables i.e. add
            # more steps and models to potentially get better models at the expense
            # of higher modeling time.
            best = exog_exp.compare_models(sort=metric,
                                           include=[
                                               "arima",
                                               "ets",
                                               "exp_smooth",
                                               "theta",
                                               "lightgbm_cds_dt",
                                           ])
            final_exog_model = exog_exp.finalize_model(best)

            exog_exps.append(exog_exp)
            exog_models.append(final_exog_model)

        # Step 2: Get future predictions for exog variables ----
        future_exog = [
            exog_exp.predict_model(exog_model)
            for exog_exp, exog_model in zip(exog_exps, exog_models)
        ]
        future_exog = pd.concat(future_exog, axis=1)
        future_exog.columns = exog_vars

        future_preds = exp.predict_model(model, X=future_exog)

    return future_preds


In [None]:
future_preds = safe_predict(exp_auto, final_auto_model)
future_preds.plot()


In [None]:
exp.plot_model(plot = 'auc', save=True)
exp.plot_model(plot = 'confusion_matrix', save=True)
exp.plot_model(plot = 'vc', save=True)

In [None]:
# Save final_auto_model
exp_auto.save_model(final_auto_model, "final_auto_model")