In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.pipeline
import sklearn.preprocessing

In [None]:
import os

In [None]:
# Directory holding the input CSV files: a sibling "data" folder one level up.
data_location = os.path.normpath(os.path.join(os.pardir, 'data'))

In [None]:
# Dataset to analyse; swap the file name to switch to the other dataset.
dataset_name = "hackathon_low_mixed_venous_oximetry.csv"
# dataset_name = "hackathon_low_cardiac_output.csv"
file = os.path.join(data_location, dataset_name)

In [None]:
# Columns that are identifiers or targets rather than model inputs.
non_feature_columns = {"event_count", "ClassificationLabel", "subject_id"}

# Read only the CSV header (nrows=0) so this cell does not rely on a `df`
# created further down the notebook and therefore works on a fresh kernel.
all_columns = pd.read_csv(file, nrows=0).columns

# Features = every column that is neither an identifier/label nor a
# regression target (regression targets contain "Regression" in their name).
feature_columns = [
    col for col in all_columns
    if col not in non_feature_columns and "Regression" not in col
]

# The regression target depends on which dataset file was selected.
if 'cardiac_output' in file:
    regression_label = "RegressionLabel-CardiacIndex"
else:
    regression_label = "RegressionLabel-SvO2"

# Investigations

In [None]:
# Load the raw dataset for a quick look at its structure.
df = pd.read_csv(file)

# Non-null `event_count` entries per subject, then how many subjects share
# each count. Printed explicitly: only the last expression of a cell is
# displayed automatically, so this result would otherwise be lost.
print(df.groupby("subject_id").count().event_count.value_counts())

# Number of missing values per row, then how many rows share each number.
df.isna().sum(axis=1).value_counts()

# Preprocessing

In [None]:
# Reload the raw data so this cell gives the same result every time it runs.
df = pd.read_csv(file)

# Encode the class label and gender as booleans, then keep only the rows
# whose gender is not "F". `.copy()` gives an independent frame so the
# column assignment below does not write into a view.
df = (
    df
    .assign(ClassificationLabel=df.ClassificationLabel == "Positive")
    .assign(gender=df.gender == "F")
    .loc[lambda frame: ~frame.gender]
    .copy()
)

# Impute missing feature values with the mean of the same feature for the
# same subject. `transform("mean")` returns a frame aligned row-by-row with
# `df`, so `fillna` only touches the cells that are actually missing.
# Only feature columns are imputed; identifier and target columns are left
# untouched.
subject_means = df.groupby("subject_id")[feature_columns].transform("mean")
df[feature_columns] = df[feature_columns].fillna(subject_means)

# A value is still missing when the subject has no observation at all for
# that feature (its per-subject mean is NaN); such rows cannot be imputed
# and are dropped.
df = df.dropna(subset=feature_columns)

In [None]:
import sklearn.covariance
# Fit a robust Gaussian envelope on the features and flag points outside it.
# A fixed random_state makes the fit, and hence the set of removed rows,
# reproducible between runs.
env = sklearn.covariance.EllipticEnvelope(random_state=0)

feature_matrix = df[feature_columns].values
env.fit(feature_matrix)

# The code below treats -1 as "outlier" and 1 as "inlier".
outliers = env.predict(feature_matrix)
print("rows flagged as outliers:", sum(outliers == -1))

# NOTE: this rebinds `df`; re-run the preprocessing cell before re-running
# this one, otherwise a further batch of rows is removed each time.
df = df.loc[outliers == 1]

# Number of rows remaining after outlier removal.
len(df)

In [None]:
# Print every feature name and draw a 20-bin histogram for each one,
# except 'gender', whose name is printed but which is not plotted.
for col in feature_columns:
    print(col)
    if col == 'gender':
        continue
    df[col].hist(bins=20)
    plt.show()

# Train test split

In [None]:
# Split by subject (not by row) so that no subject contributes rows to both
# the training and the test set.
# Sorting the unique IDs gives a stable order; iterating a `set` does not
# guarantee the same order from one kernel session to the next, which would
# silently change the split.
subjects = sorted(df.subject_id.unique())

# First two thirds of the subjects train the models, the rest evaluate them.
N = len(subjects) * 2 // 3
train_subjects = subjects[:N]
test_subjects = subjects[N:]

df_train = df.loc[df.subject_id.isin(train_subjects)]
df_test = df.loc[df.subject_id.isin(test_subjects)]

print(f"test rows:  {len(df_test)}")
print(f"train rows: {len(df_train)}")

# Classification

In [None]:
from sklearn.model_selection import train_test_split
# Training inputs (NumPy array) and classification target.
X = df_train.loc[:, feature_columns].values
y = df_train["ClassificationLabel"]

# Same layout for the held-out subjects.
X_test = df_test.loc[:, feature_columns].values
y_test = df_test["ClassificationLabel"]

In [None]:
from sklearn import tree
# Gradient-boosted trees are the classifier actually used. A depth-2
# DecisionTreeClassifier was assigned first and immediately overwritten,
# so that dead assignment has been dropped.
# random_state is fixed so repeated runs produce the same model.
clf = sklearn.ensemble.GradientBoostingClassifier(random_state=0)
clf = clf.fit(X, y)

In [None]:
# fig = plt.figure(figsize=(25,20))
# _ = tree.plot_tree(clf)

In [None]:
# Predicted class for every row of the held-out feature matrix.
y_test_predict = clf.predict(X_test)

In [None]:
import sklearn.metrics as metrics
# The classification label is boolean, so both classes are listed explicitly.
# This keeps the matrix 2x2 (and the row/column names valid) even when the
# test set or the predictions happen to contain only one of the classes.
class_labels = [False, True]
conf_matrix = metrics.confusion_matrix(y_test, y_test_predict, labels=class_labels)

# Rows are the true labels, columns the predicted labels (0 = False, 1 = True).
pd.DataFrame(conf_matrix,
             index=[f"label_{int(label)}" for label in class_labels],
             columns=[f"pred_{int(label)}" for label in class_labels])

# Regression

In [None]:
from sklearn.model_selection import train_test_split
# Training inputs (NumPy array) and the regression target column selected
# by `regression_label`.
X = df_train.loc[:, feature_columns].values
y = df_train.loc[:, regression_label]

# Same layout for the held-out subjects.
X_test = df_test.loc[:, feature_columns].values
y_test = df_test.loc[:, regression_label]

In [None]:
# Lasso regression on scaled features.
#
# The `normalize=` argument of Lasso no longer exists in current scikit-learn
# releases, so passing it raises a TypeError. The replacement recommended by
# scikit-learn is a pipeline with StandardScaler(with_mean=False) in front of
# the estimator, with alpha multiplied by sqrt(n_samples) to keep the same
# effective penalty as the former normalize=True behaviour.
# `random_state` is omitted: it is only used with selection='random'.
n_train_samples = len(X)
clf = sklearn.pipeline.make_pipeline(
    sklearn.preprocessing.StandardScaler(with_mean=False),
    sklearn.linear_model.Lasso(alpha=0.1 * n_train_samples ** 0.5, fit_intercept=True,
                               precompute=False, copy_X=True,
                               max_iter=1000, tol=0.0001, warm_start=False,
                               positive=False, selection='cyclic'),
)

# clf = sklearn.linear_model.LinearRegression()
# from sklearn.ensemble import GradientBoostingRegressor
# clf = GradientBoostingRegressor()


# clf = sklearn.svm.SVR()

In [None]:
# Fit on the training data and predict on that same data (in-sample
# predictions). `y_pred` is reassigned by the following cell, which predicts
# on X_test instead.
clf.fit(X,y)
y_pred = clf.predict(X)

In [None]:
# Fit on the training data and predict the regression target for the
# held-out feature matrix.
clf.fit(X,y)
y_pred = clf.predict(X_test)

In [None]:
# Mean squared error between the held-out targets and the current `y_pred`.
sklearn.metrics.mean_squared_error(y_test, y_pred)

In [None]:
# Observed vs. predicted regression target; labelled so the figure can be
# read on its own.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set(xlabel=f"observed {regression_label}",
       ylabel=f"predicted {regression_label}",
       title="Regression: observed vs. predicted")
plt.show()

# Dimensionality Reduction

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import sklearn.manifold

In [None]:
# 3-dimensional t-SNE embedding. The variable keeps the name `pca` because
# the following cells call `pca.fit_transform`; a PCA instance was assigned
# first and immediately overwritten, so that dead assignment is dropped.
# random_state is fixed because t-SNE is stochastic.
pca = sklearn.manifold.TSNE(n_components=3, random_state=0)

# Rescale every feature to [0, 1] before embedding.
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# No separate `pca.fit(...)` here: the next cell calls `fit_transform`, which
# fits again from scratch, so fitting in this cell would only double the cost.

In [None]:
# import numpy as np
# np.argmax(pca.components_[0])

In [None]:
# Compute the low-dimensional embedding of the normalised features.
X_pca = pca.fit_transform(X_normalized)

# Plot embedding columns 1 and 2 (zero-based) against each other.
xx = X_pca[:,1]
yy = X_pca[:,2]
fig, ax = plt.subplots()
ax.scatter(xx, yy)
ax.set(xlabel="embedding axis 1", ylabel="embedding axis 2",
       title="Embedding: axis 1 vs. axis 2")
plt.show()

In [None]:
# Plot embedding columns 0 and 1 (zero-based) against each other.
xx = X_pca[:,0]
yy = X_pca[:,1]
fig, ax = plt.subplots()
ax.scatter(xx, yy)
ax.set(xlabel="embedding axis 0", ylabel="embedding axis 1",
       title="Embedding: axis 0 vs. axis 1")
plt.show()