# Univariate Modelling

In [1]:
import os
import pandas as pd

# all_train.csv sits in data/interim, four directory levels above the working directory.
parent_hops = ['..'] * 4
data_file = os.path.join(*parent_hops, 'data', 'interim', 'all_train.csv')
df = pd.read_csv(data_file)
df.head()

Unnamed: 0.1,Unnamed: 0,p_num,days_since_start,time,initial_resolution,bg,insulin,carbs,hr,steps,cals,activity,bg+1:00
0,2020-01-01 00:15:00,p01,0,00:15:00,15min,,0.0083,,,,,,
1,2020-01-01 00:20:00,p01,0,00:20:00,15min,,0.0083,,,,,,
2,2020-01-01 00:25:00,p01,0,00:25:00,15min,9.6,0.0083,,,,,,
3,2020-01-01 00:30:00,p01,0,00:30:00,15min,,0.0083,,,,,,
4,2020-01-01 00:35:00,p01,0,00:35:00,15min,,0.0083,,,,,,


# Data Preprocessing

## 1. Select only the bg (feature) and bg+1:00 (target) columns from the dataframe

In [2]:
# Restrict the frame to the model input (bg) and the prediction target (bg+1:00).
selected_columns = ['bg', 'bg+1:00']
df = df[selected_columns]
df.head()

Unnamed: 0,bg,bg+1:00
0,,
1,,
2,9.6,
3,,
4,,


# Clean Data

## Interpolate missing values in the bg column, then drop rows that still have missing values

In [3]:
# Fill gaps in the bg feature (linear interpolation, then forward/backward fill
# for whatever remains), and drop every row that still contains a NaN, which,
# once bg is filled, means rows lacking a bg+1:00 value.
#
# .assign() builds a new frame instead of writing a column into the frame that
# came out of the earlier column selection, so this cell does not mutate state
# in place and cannot trigger pandas' SettingWithCopyWarning.
#
# NOTE(review): the interpolation runs over the whole column; the raw file has a
# p_num column, so if rows from several participants are stacked here, values
# may be interpolated across participant boundaries. Confirm this is intended.
df = (
    df
    .assign(bg=lambda frame: frame['bg'].interpolate(method='linear').ffill().bfill())
    .dropna()
)
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 177024 entries, 71 to 235126
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   bg       177024 non-null  float64
 1   bg+1:00  177024 non-null  float64
dtypes: float64(2)
memory usage: 4.1 MB


# Model selection

## 1. Split the data into train and test sets

In [4]:
from sklearn.model_selection import train_test_split

# Single-feature setup: the current bg reading is the only predictor, bg+1:00 is the target.
X, y = df[['bg']], df['bg+1:00']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): train_test_split shuffles rows by default. Confirm that a random
# split is intended for these time-ordered readings.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 2. Use LazyPredict

In [None]:
from notebooks.helpers.LazyPredict import get_lazy_regressor

# Build the regressor through the project helper (its definition is not shown in this notebook).
# NOTE(review): presumably this wraps LazyPredict's LazyRegressor, whose
# fit(X_train, X_test, y_train, y_test) trains many models and returns a
# scores table plus predictions. Confirm against notebooks/helpers/LazyPredict.
reg = get_lazy_regressor()
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
# Leave `models` as the last expression so the notebook renders it as the cell output.
models

 69%|██████▉   | 27/39 [00:27<00:07,  1.58it/s]

We choose: 

* LGBMRegressor
* XGBRegressor
* LassoCV

## Save the train data for later

In [15]:
# Write df to train_data.csv in the current working directory, leaving out the index.
output_file = 'train_data.csv'
df.to_csv(output_file, index=False)