In [18]:
# importing library
import pandas as pd
import numpy as np
import pandas as pd
from tensorflow import keras
import tensorflow as tf
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import RobustScaler, normalize
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from IPython.display import FileLink

In [2]:
# read csv file and view fist five rows
df = pd.read_csv(r"C:\Users\ma086\Downloads\ventilator-pressure-prediction\train.csv")
df.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.0,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.35585,0,12.234987


## Preparing Data

In [3]:
# calculate rolling window statistics
df['u_in_mean_5'] = df['u_in'].rolling(window=5, min_periods=1).mean()
df['u_in_std_5'] = df['u_in'].rolling(window=5, min_periods=1).std()
df['u_in_max_5'] = df['u_in'].rolling(window=5, min_periods=1).max()
df['u_in_min_5'] = df['u_in'].rolling(window=5, min_periods=1).min()

# calculate time difference statistics
df['time_since_start'] = df['time_step'].cumsum()
df['time_since_breath_start'] = df.groupby('breath_id')['time_step'].cumsum()
df['time_since_last_breath'] = df['time_since_start'] - df.groupby('breath_id')['time_since_start'].shift(1)
df['time_until_next_breath'] = df.groupby('breath_id')['time_since_start'].shift(-1) - df['time_since_start']
df['time_until_end_of_breath'] = df.groupby('breath_id')['time_since_start'].transform(max) - df['time_since_breath_start']

# create lag features
df['u_out_lag1'] = df.groupby('breath_id')['u_out'].shift(1)
df['u_out_lag_back1'] = df.groupby('breath_id')['u_out'].shift(-1)
df['u_out_lag2'] = df.groupby('breath_id')['u_out'].shift(2)
df['u_out_lag_back2'] = df.groupby('breath_id')['u_out'].shift(-2)
df['u_out_lag3'] = df.groupby('breath_id')['u_out'].shift(3)
df['u_out_lag_back3'] = df.groupby('breath_id')['u_out'].shift(-3)
df['u_out_lag4'] = df.groupby('breath_id')['u_out'].shift(4)
df['u_out_lag_back4'] = df.groupby('breath_id')['u_out'].shift(-4)

# fill missing values
df = df.fillna(0)

# convert categorical features to integers
df['R'] = df['R'].astype('category').cat.codes
df['C'] = df['C'].astype('category').cat.codes

In [4]:
#droping some features that will not need 
df.drop(['pressure', 'id', 'breath_id'], axis=1, inplace=True)

In [5]:
df = pd.get_dummies(df)
df.head()

Unnamed: 0,R,C,time_step,u_in,u_out,u_in_mean_5,u_in_std_5,u_in_max_5,u_in_min_5,time_since_start,...,time_until_next_breath,time_until_end_of_breath,u_out_lag1,u_out_lag_back1,u_out_lag2,u_out_lag_back2,u_out_lag3,u_out_lag_back3,u_out_lag4,u_out_lag_back4
0,1,2,0.0,0.083334,0,0.083334,0.0,0.083334,0.083334,0.0,...,0.033652,107.568486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2,0.033652,18.383041,0,9.233188,12.939847,18.383041,0.083334,0.033652,...,0.067514,107.534833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2,0.067514,22.509278,0,13.658551,11.936136,22.509278,0.083334,0.101167,...,0.101542,107.467319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,2,0.101542,22.808822,0,15.946119,10.766279,22.808822,0.083334,0.202709,...,0.135756,107.365777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,2,0.135756,25.35585,0,17.828065,10.229525,25.35585,0.083334,0.338464,...,0.169698,107.230021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# transforming data 
rs = RobustScaler()
df = rs.fit_transform(df)

In [7]:
# read data again to make test and train for the model
train = pd.read_csv(r"C:\Users\ma086\Downloads\ventilator-pressure-prediction\train.csv")
X = df
y = train.pressure

In [8]:
# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [9]:
# Add a constant column to the design matrix
X_train = sm.add_constant(X_train)

In [10]:
# create an OLS linear regression model with the training data
model = sm.OLS(y_train, X_train).fit()

In [11]:
#  model summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               pressure   R-squared:                       0.685
Model:                            OLS   Adj. R-squared:                  0.685
Method:                 Least Squares   F-statistic:                 5.009e+05
Date:                Tue, 06 Jun 2023   Prob (F-statistic):               0.00
Time:                        14:44:11   Log-Likelihood:            -1.4167e+07
No. Observations:             4828800   AIC:                         2.833e+07
Df Residuals:                 4828778   BIC:                         2.833e+07
Df Model:                          21                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         17.8751      0.032    550.806      0.0

In [12]:
# calculate the mean absolute error (MAE) between the predicted values and the true values on the test set
pred = model.predict(sm.add_constant(X_test))
mae = mean_absolute_error(pred, y_test)

In [13]:
# print the MAE
print(f"mean absolute error: {mae}")

mean absolute error: 2.9368688747512297


In [28]:
# Create a dataframe from the predicted values and add an ID column
df_pred = pd.DataFrame({'id': range(1, len(pred)+1), 'pressure': pred})

# Save the dataframe as a CSV file
df_pred.to_csv('Submission ', index=False)

In [29]:
# Create a download link for the CSV file
display(FileLink('Submission'))