In [None]:
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
from scipy import stats

import os

import numpy as np

In [None]:
# Directory that holds the CSV exports, expressed relative to this notebook.
data_location = os.path.normpath('../data')

# Dataset under study. The alternative file is kept commented out so the
# analysis can be switched by swapping which assignment is active.
# data_filename = "hackathon_low_cardiac_output.csv"
data_filename = "hackathon_low_mixed_venous_oximetry.csv"

# Full path to the selected CSV; later cells read from `file`.
file = os.path.join(data_location, data_filename)

In [None]:
# Columns that are loaded regardless of which dataset is selected.
feature_columns = [
    'Heart_Rate',
    'Ejection_Fraction',
    'Radial_Artery_Mean_Pressure',
    'Radial_Artery_Systolic_Pressure',
    'Radial_Artery_Diastolic_Pressure',
    'Radial_Artery_DpDt',
    'Pulmonary_Artery_Mean_Pressure',
    'Central_Venous_Pressure',
    'gender',
    'age_years',
    'bsa',
    'height',
    'weight',
]

# The selected file path decides which extra column is added to the features
# and which label string is recorded in `regression_label`.
if 'cardiac_output' not in file:
    feature_columns.append('Cardiac_Output')
    regression_label = "RegressionLabel-SvO2"
else:
    feature_columns.append('SvO2')
    regression_label = "RegressionLabel-CardiacIndex"

In [None]:
# Read only the feature columns from the selected CSV, then drop every row
# that has a missing value in any of them.
df = pd.read_csv(file, usecols=feature_columns).dropna()

# Number of complete rows left after dropping missing values.
print("data size:", len(df))

In [None]:
# One 20-bin histogram per feature; the printed column name acts as a caption
# and plt.show() flushes each figure so the plots do not overlap.
for feature in feature_columns:
    print(feature)
    feature_values = df[feature]
    feature_values.hist(bins=20)
    plt.show()


# Pressure visualization

In [None]:
# Select every feature whose name contains the substring "Pressure".
pressure_columns = []
for name in feature_columns:
    if "Pressure" in name:
        pressure_columns.append(name)
print("Columns: ", pressure_columns)

# Frame restricted to the pressure columns, in the order listed above.
pressure_df = pd.DataFrame(df, columns=pressure_columns)
print("samples:", len(pressure_df))

# Line plot of each pressure column against the row index.
pressure_df.plot()

# Pairwise Pearson correlation between the pressure columns (cell output).
pressure_df.corr(method='pearson')

# Outliers

In [None]:
# Outlier removal on the pressure columns.
#
# A row is kept only when every one of its pressure values has an absolute
# z-score below 3, i.e. lies within three standard deviations of its column
# mean. The z-scores are computed per column by scipy.stats.zscore.
pressure_zscores = np.abs(stats.zscore(pressure_df))
inlier_mask = (pressure_zscores < 3).all(axis=1)

# Use the boolean mask to SELECT rows of pressure_df. Wrapping the mask itself
# in a DataFrame would plot and correlate booleans instead of pressure data.
filtered = pressure_df.loc[inlier_mask]

# Line plot of the outlier-free pressure columns against the row index.
filtered.plot()

# Pairwise Pearson correlation of the outlier-free pressure columns (cell output).
filtered.corr(method='pearson')

# Correlation matrix

In [None]:
# Pairwise Pearson correlation across every loaded column.
# NOTE(review): if any column (e.g. `gender`) is stored as text, pandas >= 2.0
# raises here unless numeric_only=True is passed — confirm the column dtypes.
feature_correlations = df.corr(method='pearson')
feature_correlations