# IML assignment 2
### Import libraries and data:

In [2]:
import math

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.experimental import enable_iterative_imputer  # must be imported before IterativeImputer
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.linear_model import Lasso, Ridge, LinearRegression, LassoCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def read_from_file(file):
    """Load a CSV file and return its numeric columns as a NumPy array.

    Parameters
    ----------
    file : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``. The first row is treated
        as the header.

    Returns
    -------
    numpy.ndarray
        One row per CSV record, containing only the numeric (and boolean)
        columns in their original order. Non-numeric columns are dropped.
    """
    df = pd.read_csv(file, header=0)
    # Use the public select_dtypes API instead of the private
    # DataFrame._get_numeric_data(); "bool" is included so the same set of
    # columns is kept as before.
    return df.select_dtypes(include=["number", "bool"]).to_numpy()

# File locations used throughout the notebook.
train_features_file = "train_features.csv"
train_labels_file = "train_labels.csv"
output_file = "submission.txt"

# Numeric contents of the training tables.
train_features = read_from_file(train_features_file)
labels = read_from_file(train_labels_file)

m = 12  # m = number of hourly rows per patient

# Optional: shrink the data for faster experiments (keep commented out for the full data set).
#k = 100
#train_features = train_features[:k * m, :]
#labels = labels[:k, :]

n = len(labels)  # n = number of patients (one label row per patient)

# View the flat feature table as an (n, m, columns) array and drop the first two
# columns of every row (presumably pid and Time -- confirm against the CSV header).
X_2d = train_features.reshape(n, m, -1)[:, :, 2:]
d_x = X_2d.shape[-1]  # d_x = number of features per patient per hour

# Flatten each patient into a single row. Column 0 (the age feature) is taken once,
# from the first hour, so that it is not repeated 12 times; the remaining d_x - 1
# columns of all m hours are laid out side by side after it.
X = np.concatenate(
    (X_2d[:, 0, :1], X_2d[:, :, 1:].reshape(n, m * (d_x - 1))),
    axis=1,
)
Y = labels[:, 1:]  # every label column except the first one
d_y = Y.shape[1]


### Data imputation and feature engineering:

In [3]:
# Data imputation:
# Replace every NaN with the most frequent value of its column. This is a crude
# strategy, kept because KNN imputation appeared to be too slow. Alternatives
# left here for reference:
#   imputer = KNNImputer(n_neighbors=2, weights="uniform")  # weights="distance" also possible
#   imputer = IterativeImputer(max_iter=2, random_state=0)
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
print(X.shape)
X_imp = imputer.fit_transform(X)
print(X_imp.shape)

(18995, 409)
(18995, 409)


In [4]:
# Outlier removal:
# Fit an elliptic envelope on the imputed training features and keep only the
# rows it labels as inliers. The matching label rows are filtered with the same mask.
outliers_fraction = 0.2  # contamination passed to the envelope (expected share of outliers)
# A fixed random_state makes the fit -- and therefore the retained training
# subset -- reproducible across kernel restarts.
clf = EllipticEnvelope(contamination=outliers_fraction, random_state=0)
clf.fit(X_imp)
#scores_pred = clf.decision_function(X_imp)
y_pred = clf.predict(X_imp)
inlier_mask = y_pred == 1  # rows predicted as 1 are treated as inliers
X_inliers = X_imp[inlier_mask]
Y_inliers = Y[inlier_mask]
print(np.shape(X_inliers))



(15196, 409)


In [5]:
# Normalization:



In [6]:
# Fix imbalanced data? SVC only compensates for class imbalance when class_weight='balanced' is passed

# Feature engineering????

In [7]:
# Load the test features and bring them into the same per-patient layout as the
# training matrix X.
test_features_file = "test_features.csv"
test_features = read_from_file(test_features_file)
n_test = len(test_features) // m  # number of test patients (m rows each)

# (n_test, m, columns) view without the first two columns of every row.
X_2d_test = test_features.reshape(n_test, m, -1)[:, :, 2:]

# One row per patient: column 0 of the first hour once, followed by the
# remaining d_x - 1 columns of all m hours.
X_test = np.concatenate(
    (X_2d_test[:, 0, :1], X_2d_test[:, :, 1:].reshape(n_test, m * (d_x - 1))),
    axis=1,
)

In [8]:
# Fill the missing test values with the already-fitted `imputer` (transform only,
# no refit). Re-creating an imputer here, e.g.
#   SimpleImputer(missing_values=np.nan, strategy='most_frequent')
#   KNNImputer(n_neighbors=2, weights="uniform")
# is intentionally left disabled.
X_test_imp = imputer.transform(X_test)

print(X_test_imp.shape)


(12664, 409)




### Subtask 1:
Predict whether medical tests are ordered by a clinician in the remainder of the hospital stay: 0 means that no further tests of this kind will be ordered, 1 means that at least one test of that kind will be ordered. In the submission file, you are asked to submit predictions in the interval [0, 1], i.e., the predictions are not restricted to binary values. 0.0 indicates you are certain this test will not be ordered, 1.0 indicates you are sure it will be ordered. The corresponding columns containing the binary ground truth in train_labels.csv are: LABEL_BaseExcess, LABEL_Fibrinogen, LABEL_AST, LABEL_Alkalinephos, LABEL_Bilirubin_total, LABEL_Lactate, LABEL_TroponinI, LABEL_SaO2, LABEL_Bilirubin_direct, LABEL_EtCO2.


In [24]:

# Issue: some patients do not start at hour 1, but 2, 3, 4 etc???

# metrics.classification_report for classification metrics

C = 1.0  # Regularization parameter (inversely proportional, not lambda)

# SVMs are not scale invariant, so every feature is standardised before it
# reaches the classifier. Putting the scaler inside the pipeline means it is
# fitted on the training inliers only and the same transformation is applied
# to the test features at prediction time.
# class_weight='balanced' uses the values of y to automatically adjust weights
# inversely proportional to class frequencies in the input data.
clf = make_pipeline(
    StandardScaler(),
    svm.SVC(C=C, kernel='rbf', gamma='scale', probability=False, class_weight='balanced'),
)

# TODO: only column 8 of Y is modelled so far (presumably LABEL_Bilirubin_direct,
# if the label columns follow the order listed for this subtask -- confirm);
# the other Subtask 1 labels still need their own models.
clf.fit(X_inliers, Y_inliers[:, 8])

# Squash the signed decision scores into (0, 1) with a logistic sigmoid so they
# fit the required prediction interval.
solution = clf.decision_function(X_test_imp)
solution = (1 + np.exp(-solution))**-1
np.savetxt(output_file, solution, fmt='%.10f')

### Subtask 2:
Predict whether sepsis will occur in the remaining stay: 0 means that no sepsis will occur, 1 otherwise. Similar to Subtask 1, you are asked to produce predictions in the interval [0, 1] for this task. The corresponding column containing the binary ground truth in train_labels.csv is LABEL_Sepsis.

### Subtask 3:
Predict future mean values of key vital signs. The corresponding columns containing the real-valued ground truth in train_labels.csv are LABEL_RRate, LABEL_ABPm, LABEL_SpO2, LABEL_Heartrate.