In [1]:
import pandas as pd
import datetime as dt
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from scipy.stats import zscore
from scipy import stats
import plotly.express as px
import numpy as np
import operator
from sklearn.preprocessing import normalize
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import umap.umap_ as umap
import random
import itertools
import os 
from scipy import interpolate

In [14]:
# Load the 2014 COPRA CSV (semicolon separated); malformed lines are skipped
# without a warning and 'Zeitstempel' is parsed as a datetime column.
df_test = pd.read_csv(
    './data/Daten_SAB_COPRA_2014.csv',
    sep=';',
    parse_dates=['Zeitstempel'],
    error_bad_lines=False,
    warn_bad_lines=False,
)
# Number of rows read (displayed as the cell output).
len(df_test)

454378

In [13]:
def dataframe_znorm_differenz(dataframe):
    """Add z-normalised values and the intake/output difference to a measurement frame.

    Steps:
      1. Rows with identifier 'T' in category 'Labor' are relabelled
         'Temperatur', then every 'Temperatur' row is dropped.
      2. ``Wert_norm`` receives the per-identifier z-score (population standard
         deviation, ``ddof=0``) of ``Wert`` for category 'Vitalwert' and of
         ``Laborwert`` for category 'Labor'. Identifier 'RR' is normalised on
         its ``Mean`` column instead.
      3. ``Differenz`` is Einfuhr - Ausfuhr on the 'Einfuhr' rows and the
         negated value on the 'Ausfuhr' rows. The two kinds of rows are paired
         purely by position (k-th 'Einfuhr' with k-th 'Ausfuhr').

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs the columns Wertbezeichner, Kategorie, Wert, Laborwert and Mean.
        The frame passed in is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame without the 'Temperatur' rows and with the float columns
        ``Wert_norm`` and ``Differenz`` added.

    Raises
    ------
    ValueError
        If the number of 'Einfuhr' and 'Ausfuhr' rows differs, because the
        positional pairing would then be meaningless.
    """
    # Work on a private copy: the relabelling below would otherwise leak into
    # the caller's frame.
    dataframe = dataframe.copy()

    # identifier 'T' is used for Vitalwert and Labor --> rename the Labor one to
    # 'Temperatur', then delete these values (they don't make sense)
    lab_temperature = (dataframe.Wertbezeichner == 'T') & (dataframe.Kategorie == 'Labor')
    dataframe.loc[lab_temperature, 'Wertbezeichner'] = 'Temperatur'
    # .copy() so the column assignments below hit a real frame, not a slice
    # (avoids pandas' SettingWithCopyWarning).
    dataframe = dataframe[dataframe.Wertbezeichner != 'Temperatur'].copy()

    # apply z-normalization per identifier, using the value column of each category
    dataframe['Wert_norm'] = None
    for category, value_column in (('Vitalwert', 'Wert'), ('Labor', 'Laborwert')):
        in_category = dataframe.Kategorie == category
        dataframe.loc[in_category, 'Wert_norm'] = (
            dataframe[in_category]
            .groupby(['Wertbezeichner'])[value_column]
            .transform(lambda x: zscore(x, ddof=0))
        )
    # 'RR' is normalised on its 'Mean' column; this overrides the value written above.
    is_rr = dataframe.Wertbezeichner == 'RR'
    rr_mean = dataframe.loc[is_rr, 'Mean']
    dataframe.loc[is_rr, 'Wert_norm'] = (rr_mean - rr_mean.mean()) / rr_mean.std(ddof=0)

    # get difference of 'Einfuhr' and 'Ausfuhr'
    is_einfuhr = dataframe.Wertbezeichner == 'Einfuhr'
    is_ausfuhr = dataframe.Wertbezeichner == 'Ausfuhr'
    df_ein = dataframe.loc[is_einfuhr, 'Wert'].astype(int)
    df_aus = dataframe.loc[is_ausfuhr, 'Wert'].astype(int)
    if len(df_ein) != len(df_aus):
        raise ValueError(
            "Cannot pair 'Einfuhr' and 'Ausfuhr' rows: found "
            + str(len(df_ein)) + " 'Einfuhr' and " + str(len(df_aus)) + " 'Ausfuhr' rows"
        )

    df_diff = df_ein.values - df_aus.values
    dataframe['Differenz'] = None
    dataframe.loc[is_einfuhr, 'Differenz'] = df_diff
    dataframe.loc[is_ausfuhr, 'Differenz'] = -df_diff
    dataframe['Differenz'] = dataframe['Differenz'].astype(float)
    dataframe['Wert_norm'] = dataframe['Wert_norm'].astype(float)

    return dataframe

In [14]:
# there might be missing values (e.g. 2018)
def check_bilanz(dataframe_list):
    """Remove 'Einfuhr' rows that break the Ausfuhr/Einfuhr row pairing.

    Within each frame the k-th 'Einfuhr' row is paired with the k-th 'Ausfuhr'
    row, and the 'Einfuhr' index label is expected to be exactly one larger
    than the 'Ausfuhr' label. When a pair violates this, the 'Einfuhr' row of
    the first offending pair is dropped and the frame is scanned again, until
    every pair fits.

    The list is updated in place (cleaned frames replace the originals) and is
    also returned.
    """
    for position in range(len(dataframe_list)):
        while True:
            frame = dataframe_list[position]
            # The int cast is kept from the original although only the index is
            # used afterwards: non-integer 'Wert' entries still fail here.
            ein_labels = list(frame[frame['Wertbezeichner'] == 'Einfuhr']['Wert'].astype(int).index)
            aus_labels = list(frame[frame['Wertbezeichner'] == 'Ausfuhr']['Wert'].index)
            # First 'Einfuhr' label whose partner is not the directly preceding row.
            offender = next(
                (ein for ein, aus in zip(ein_labels, aus_labels) if ein - aus != 1),
                None,
            )
            if offender is None:
                break
            dataframe_list[position] = frame.drop(offender)
    return dataframe_list

In [15]:
def get_cluster_matrix(df, cat='Vitalwert', days_interval=5, test_pat=34035669, hour=False, nan_to_zero=True):
    """Build one fixed-length feature row per patient from the 'Wert_norm' values.

    For every patient (FallNr) the rows of category ``cat`` are averaged per
    identifier and calendar day (additionally per hour when ``cat == 'Vitalwert'``
    and ``hour`` is True). Identifiers, days and hours that are absent for a
    patient are added as filler rows, so each identifier contributes
    ``interval_length`` values taken from the window that starts at the date of
    the patient's first listed row. The 'Wert_norm' values are concatenated in
    the order of ``ident_list``; NaN gaps inside each identifier segment are
    filled with ``np.interp`` over the segment positions, unless the whole
    segment is NaN.

    Progress is printed for every 50th patient.

    Parameters
    ----------
    df : pandas.DataFrame
        Uses the columns FallNr, Kategorie, Wertbezeichner, Zeitstempel
        (datetime) and Wert_norm.
    cat : str
        Category (value of ``Kategorie``) to extract.
    days_interval : int
        Number of consecutive days per patient, counted from the first date.
    test_pat
        FallNr whose grouped frame is handed back as the second return value.
    hour : bool
        If True, the matrix is allocated with 24 slots per day.
    nan_to_zero : bool
        If True, NaNs remaining after interpolation are replaced by 0.

    Returns
    -------
    X : numpy.ndarray
        Shape (number of patients, number of identifiers * slots per identifier),
        rows ordered by sorted FallNr.
    x : pandas.DataFrame or None
        Grouped frame of ``test_pat`` as assigned right after the groupby, or
        None if ``test_pat`` is not among the patients.
    ident_list : list
        Sorted identifiers of category ``cat``.
    """
    x = None
    # list of all the patient numbers in the given file df
    patient_list = sorted(df.FallNr.unique())
    # list of all the identifiers of category 'cat' in the given file df (sorted alphabetically)
    ident_list = sorted(df[df.Kategorie==cat].Wertbezeichner.unique())
    # NOTE(review): the matrix width depends on `hour` alone, while the hourly
    # branch below additionally requires cat == 'Vitalwert'. With hour=True and
    # another category the row built in the else-branch has days_interval
    # values per identifier and will not fit into X.
    if hour == True:
        X = np.zeros((len(patient_list),len(ident_list)*days_interval*24))
        hour_list = list(range(0,24))
    else:
        X = np.zeros((len(patient_list),len(ident_list)*days_interval))
    for pat in patient_list:
        # progress output for every 50th patient
        if list(patient_list).index(pat) % 50 == 0:
            print(list(patient_list).index(pat), ' of ', len(patient_list))
        df_pat = df[(df.FallNr==pat)&(df.Kategorie==cat)]
        # get dataframe with date and identifier as multiindex and mean over values of corresponding column values
        if cat=='Vitalwert' and hour==True:
            interval_length = days_interval*24
            # grouped frame (identifier, date, hour) -> column means; this is the data we actually want
            df_pat_temp = df_pat.groupby([df_pat['Wertbezeichner'], df_pat['Zeitstempel'].dt.date, df_pat['Zeitstempel'].dt.hour]).mean()

            # keep the grouped frame of the test patient for the spot check
            if pat == test_pat:
                x = df_pat_temp
            # date of first measurement
            # intended as the earliest date; this takes the first row, so it assumes rows are time-ordered - TODO confirm
            first_date = list(df_pat['Zeitstempel'].dt.date)[0]
            # define 'maximal date' of interval of days we want to look at
            last_date = first_date + dt.timedelta(days=days_interval-1)
            # get list with all dates between first_date and last_date (even if not in df!)
            # date_list = [first_date, first_date+1, first_date+2, first_date+3, last_date]
            # even if e.g. at first_date+1 nothing was measured
            date_list = [first_date + dt.timedelta(days=x) for x in range(days_interval)]
            pat_ident = df_pat_temp.index.get_level_values(0).unique()
            # all identifiers: add filler rows for identifiers this patient has no data for
            if sorted(pat_ident) != sorted(ident_list):
                diff = list(set(pat_ident).symmetric_difference(set(ident_list)))
                for elem in diff:
                    # every date repeated 24 times, to be paired with the hours 0..23
                    temp = list(itertools.chain.from_iterable(itertools.repeat(x, 24) for x in date_list))

                    index = pd.MultiIndex.from_tuples(list(zip([elem]*days_interval*24, temp, hour_list*len(date_list))), names=["Wertbezeichner", "Zeitstempel", "Stunden"])
                    s = pd.DataFrame(np.zeros((days_interval*24, df_pat_temp.shape[1])), index=index)
                    s[0] = np.array([pat]*days_interval*24)
                    s = s.replace({0:np.nan})
                    # NOTE(review): `s` is a whole frame with integer column labels that is
                    # assigned to a single row; what ends up in the row depends on pandas'
                    # alignment rules - presumably NaN, confirm.
                    for da in date_list:
                        for h in hour_list:
                            df_pat_temp.loc[(elem, da, h), :] = s
                df_pat_temp = df_pat_temp.sort_index()
            value_list = []
            for i in ident_list:
                df_temp = df_pat_temp.loc[i].loc[first_date:last_date]
                index_list_dates = []

                # collect the distinct dates present for this identifier
                for j in range(len(df_temp.index)):
                    index_list_dates.append(df_temp.index[j][0])
                index_list_dates = list(set(index_list_dates))

                # all dates: add rows for days of the window without any measurement
                # NOTE(review): unlike the hour filler below and the daily branch, the
                # zeros are NOT turned into NaN here, so a missing day yields
                # Wert_norm == 0 and is skipped by the interpolation - confirm this is intended.
                if list(sorted(index_list_dates)) != sorted(date_list):
                    temp = np.zeros(df_temp.shape[1])
                    temp[0] = np.array([pat])
                    diff = list(set(list(index_list_dates)).symmetric_difference(set(date_list)))
                    for elem in diff:
                        for h in hour_list:
                            df_pat_temp.loc[(i, elem, h), :] = temp
                    df_pat_temp = df_pat_temp.sort_index()
                # all hours: add NaN rows for the hours of a day without any measurement
                for day in sorted(date_list):
                    index_list_hours = list(df_pat_temp.loc[i].loc[day].index)
                    if list(sorted(index_list_hours)) != sorted(hour_list):
                        temp = np.zeros(df_temp.shape[1])
                        # first column gets the patient number, every remaining zero becomes NaN
                        temp[0] = np.array([pat])
                        temp[temp==0] = np.nan
                        diff = list(set(list(index_list_hours)).symmetric_difference(set(hour_list)))
                        for h in diff:
                            df_pat_temp.loc[(i, day, h), :] = temp
                        df_pat_temp = df_pat_temp.sort_index()
                # normalised values of this identifier inside the window, appended to the patient row
                df_temp = df_pat_temp.loc[i][first_date:last_date]['Wert_norm']
                value_list.extend(df_temp)
        else:
            interval_length = days_interval

            # grouped frame (identifier, date) -> column means; this is the data we actually want
            df_pat_temp = df_pat.groupby([df_pat['Wertbezeichner'], df_pat['Zeitstempel'].dt.date]).mean()
            # keep the grouped frame of the test patient for the spot check
            if pat == test_pat:
                x = df_pat_temp
            # date of first measurement
            # intended as the earliest date; this takes the first row, so it assumes rows are time-ordered - TODO confirm
            first_date = list(df_pat['Zeitstempel'].dt.date)[0]
            # define 'maximal date' of interval of days we want to look at
            last_date = first_date + dt.timedelta(days=days_interval-1)
            # get list with all dates between first_date and last_date (even if not in df!)
            # date_list = [first_date, first_date+1, first_date+2, first_date+3, last_date]
            # even if e.g. at first_date+1 nothing was measured
            date_list = [first_date + dt.timedelta(days=x) for x in range(days_interval)]
            pat_ident = df_pat_temp.index.get_level_values(0).unique()

            # all identifiers: add filler rows for identifiers this patient has no data for
            if sorted(pat_ident) != sorted(ident_list):
                diff = list(set(pat_ident).symmetric_difference(set(ident_list)))
                for elem in diff:
                    index = pd.MultiIndex.from_tuples(list(zip([elem]*days_interval, date_list)), names=["Wertbezeichner", "Zeitstempel"])
                    s = pd.DataFrame(np.zeros((days_interval, df_pat_temp.shape[1])), index=index)
                    s[0] = np.array([pat]*days_interval)
                    s = s.replace({0:np.nan})
                    # NOTE(review): a whole frame is assigned to a single row; the result
                    # depends on pandas' alignment rules - presumably NaN, confirm.
                    for da in date_list:
                        df_pat_temp.loc[(elem, da), :] = s
                df_pat_temp = df_pat_temp.sort_index()
            value_list = []
            for i in ident_list:
                df_temp = df_pat_temp.loc[i].loc[first_date:last_date]
                # all dates: add NaN rows for days of the window without any measurement
                if list(sorted(df_temp.index)) != sorted(date_list):
                    temp = np.zeros(df_temp.shape[1])
                    # first column gets the patient number, every remaining zero becomes NaN
                    temp[0] = np.array([pat])
                    temp[temp==0] = np.nan
                    diff = list(set(list(df_temp.index)).symmetric_difference(set(date_list)))
                    for elem in diff:
                        df_pat_temp.loc[(i, elem), :] = temp
                    df_pat_temp = df_pat_temp.sort_index()
                    # second sort of an already sorted index (redundant)
                    df_pat_temp = df_pat_temp.sort_index()

                # normalised values of this identifier inside the window, appended to the patient row
                df_temp = df_pat_temp.loc[i][first_date:last_date]['Wert_norm']
                value_list.extend(df_temp)

        # write the patient row, then interpolate NaN gaps identifier by identifier
        idx = list(patient_list).index(pat)
        X[idx] = np.array(value_list)
        for j in range(len(ident_list)):
            # y is a view into X, so the assignment below updates X in place
            y = X[idx][j*interval_length:j*interval_length+interval_length]
            if not np.isnan(y).all():
                # `helper` is defined in a later notebook cell; it returns the NaN mask
                # and a function giving the positions of the True entries of a mask
                nans, y_lambda = helper(y)
                y[nans] = np.interp(y_lambda(nans), y_lambda(~nans), y[~nans])

    if nan_to_zero == True:
        # replace all remaining NaNs (identifier segments without any value) with zero
        X[np.isnan(X)] = 0


    return X, x, ident_list

In [16]:
def remove_outliers(df):
    """Drop implausible temperature and RR rows and return the reduced frame.

    The frame shape is printed before the first step and after each of the two
    drop steps. The frame passed in is not modified.
    """
    print(df.shape)

    # Temperature: at most 50 °C is accepted, the data went up to 3582 °C
    too_hot = (df.Wertbezeichner == 'T') & (df.Wert > 50)
    df = df.drop(df.loc[too_hot].index)
    print(df.shape)

    # RR: Mean has min=0 and max=755, everything else is < 300,
    # so only that single extreme outlier is dropped
    extreme_rr_mean = df.Mean > 750
    df = df.drop(df.loc[extreme_rr_mean].index)
    print(df.shape)

    # Left untouched for lack of domain knowledge:
    # AF: min=0, max=155, but 30 already counts as elevated and there are values of 100+
    # HF: min=0, max=270
    # HF_Pulsoxy: min=22, max=300
    # ICP: min=-40, max=159
    # SPO2: min=5, max=100
    return df

In [17]:
def save_file(file, filename):
    """Write a 2-D array to '<filename>_<rows>_<cols>.txt'.

    The array is written row by row with ``np.savetxt``; since each row is
    one-dimensional, every value ends up on its own line, and the original
    shape can be recovered from the two numbers in the file name.

    Parameters
    ----------
    file : numpy.ndarray
        Two-dimensional array to store (``shape[0]`` and ``shape[1]`` are read).
    filename : str
        Path prefix of the output file, without the shape suffix and extension.
    """
    name = filename + '_' + str(file.shape[0]) + '_'  + str(file.shape[1]) + '.txt'
    # The context manager closes the handle even when savetxt raises part-way through.
    with open(name, 'w') as a_file:
        for row in file:
            np.savetxt(a_file, row)

In [18]:
def helper(arr):
    """Return the NaN mask of ``arr`` together with a position lookup.

    The second element is a function that maps a boolean array to the indices
    of its True entries (first axis of ``nonzero``).
    """
    def true_positions(a):
        return a.nonzero()[0]

    nan_mask = np.isnan(arr)
    return nan_mask, true_positions

In [19]:
# Collect every '*_modified_80*' CSV from the data directory, skipping names
# that contain '#' or 'subset', and print the number of distinct FallNr per file.
# NOTE(review): error_bad_lines / warn_bad_lines were removed in pandas 2.0
# (replacement: on_bad_lines='skip'); kept here for the pandas version this
# notebook was run with.
path='./data/'
df_list = []
for file in os.listdir(path):
    if '_modified_80' in file and '#' not in file and 'subset' not in file:
        print(file)
        # The files are only read, so open them read-only ('r+' needs write
        # permission) and release the handle right after parsing.
        with open(os.path.join(path, file), 'r') as f:
            df_temp = pd.read_csv(f, error_bad_lines=False, warn_bad_lines=False, sep = ';', parse_dates=['Zeitstempel'])
        print(len(df_temp.FallNr.unique()))
        df_list.append(df_temp)

print(len(df_list))

Daten_SAB_COPRA_2016_modified_80.csv
60
Daten_SAB_COPRA_2015_modified_80.csv
50
Daten_SAB_COPRA_2018_modified_80.csv
45
Daten_SAB_COPRA_2017_modified_80.csv
48
Daten_SAB_COPRA_2014_modified_80.csv
50
Daten_SAB_COPRA_2019_modified_80.csv
41
6


In [20]:
# Repair the Einfuhr/Ausfuhr pairing per file, then stack all files into one
# frame with a fresh 0..n-1 index, remove outliers and add the normalised columns.
df_list = check_bilanz(df_list)
df = (
    pd.concat(df_list)
    .reset_index(drop=True)
    .pipe(remove_outliers)
    .pipe(dataframe_znorm_differenz)
)

(1625833, 10)
(1625827, 10)
(1625826, 10)


In [21]:
# Store the combined, normalised frame as a semicolon-separated CSV in the data directory.
export_name = 'Daten_COPRA_all_subset_80_interpolation.csv'
df.to_csv(path + export_name, sep=';')

In [22]:
# Collect every '*_modified_20*' CSV from the data directory, skipping names
# that contain '#' or '80', and print the number of distinct FallNr per file.
# NOTE(review): this rebinds df_list only; `df` is not rebuilt from these files
# in the cells that follow - confirm that is intended.
path='./data/'
df_list = []
for file in os.listdir(path):
    if ('_modified_20' in file) and ('#' not in file) and ('80' not in file):
        print(file)
        # The files are only read, so open them read-only ('r+' needs write
        # permission) and release the handle right after parsing.
        with open(os.path.join(path, file), 'r') as f:
            df_temp = pd.read_csv(f, error_bad_lines=False, warn_bad_lines=False, sep = ';', parse_dates=['Zeitstempel'])
        print(len(df_temp.FallNr.unique()))
        df_list.append(df_temp)

Daten_SAB_COPRA_2015_modified_20.csv
67
Daten_SAB_COPRA_2019_modified_20.csv
59
Daten_SAB_COPRA_2018_modified_20.csv
67
Daten_SAB_COPRA_2014_modified_20.csv
68
Daten_SAB_COPRA_2017_modified_20.csv
66
Daten_SAB_COPRA_2016_modified_20.csv
90


In [23]:
# Rows left in the combined frame after preprocessing.
df.shape[0]

1607002

In [24]:
# Distinct case numbers (FallNr) in the combined frame, and how many there are.
pat_list = df['FallNr'].unique()
len(pat_list)

294

In [25]:
# Length of the observation window per patient, in days.
days_interval = 5
# Patient used for the spot check further down. A private, seeded generator
# makes the choice identical on every "Restart & Run All" without touching the
# global `random` state.
test_pat = random.Random(42).choice(sorted(df.FallNr.unique()))

# Vitalwerte

All patients, mean over each day

In [26]:
# Vital signs, averaged per day: one feature row per patient, stored as text file.
X_vital_all_inter, x_vital_all_inter, vital_list_all_inter = get_cluster_matrix(
    df,
    cat='Vitalwert',
    days_interval=days_interval,
    test_pat=test_pat,
    hour=False,
)
save_file(X_vital_all_inter, 'X_vital_all_inter')

0  of  294
50  of  294
100  of  294
150  of  294
200  of  294
250  of  294


In [27]:
# Compare the expected matrix dimensions with the actual ones.
n_vital_identifiers = len(df[df.Kategorie=='Vitalwert']['Wertbezeichner'].unique())
print('first dimension should be:', len(pat_list), '(= number of feasible patients)')
print(
    'second dimension should be:',
    days_interval * n_vital_identifiers,
    '(= days_interval * number of identifiers for category Vitalwert)',
)
print('actual dimensuons are', X_vital_all_inter.shape)

first dimension should be: 294 (= number of feasible patients)
second dimension should be: 35 (= days_interval * number of identifiers for category Vitalwert)
actual dimensuons are (294, 35)


All patients, mean over each hour

In [28]:
# Vital signs, averaged per hour: 24 values per day and identifier, stored as text file.
X_vital_all_hour_inter, x_vital_all_hour_inter, vital_list_all_hour_inter = get_cluster_matrix(
    df,
    cat='Vitalwert',
    days_interval=days_interval,
    test_pat=test_pat,
    hour=True,
)
save_file(X_vital_all_hour_inter, 'X_vital_all_hour_inter')

0  of  294
50  of  294
100  of  294
150  of  294
200  of  294
250  of  294


In [29]:
# Compare the expected matrix dimensions with the actual ones (24 slots per day).
n_vital_identifiers = len(df[df.Kategorie=='Vitalwert']['Wertbezeichner'].unique())
print('first dimension should be:', len(pat_list), '(= number of feasible patients)')
print(
    'second dimension should be:',
    days_interval * n_vital_identifiers * 24,
    '(= days_interval * number of identifiers for category Vitalwert * hours in a day)',
)
print('actual dimensuons are', X_vital_all_hour_inter.shape)

first dimension should be: 294 (= number of feasible patients)
second dimension should be: 840 (= days_interval * number of identifiers for category Vitalwert * hours in a day)
actual dimensuons are (294, 840)


# Laborwerte

all patients, mean over day

In [30]:
# Lab values, averaged per day: one feature row per patient, stored as text file.
X_labor_all_inter, x_labor_all_inter, labor_list_all_inter = get_cluster_matrix(
    df,
    cat='Labor',
    days_interval=days_interval,
    test_pat=test_pat,
    hour=False,
)
save_file(X_labor_all_inter, 'X_labor_all_inter')

0  of  294
50  of  294
100  of  294
150  of  294
200  of  294
250  of  294


In [31]:
# Compare the expected matrix dimensions with the actual ones.
n_labor_identifiers = len(df[df.Kategorie=='Labor']['Wertbezeichner'].unique())
print('first dimension should be:', len(pat_list), '(= number of feasible patients)')
print(
    'second dimension should be:',
    days_interval * n_labor_identifiers,
    '(= days_interval * number of identifiers for category Labor)',
)
print('actual dimensuons are', X_labor_all_inter.shape)

first dimension should be: 294 (= number of feasible patients)
second dimension should be: 1910 (= days_interval * number of identifiers for category Labor)
actual dimensuons are (294, 1910)


All patients, mean over each hour: not very meaningful for lab values, so this variant is skipped.

#### Stichprobentest (spot check)

In [None]:
# Spot check on the daily vital-sign data of the test patient.
# The original cell referred to `x_vital_all`, which is never defined in this
# notebook; the grouped frame returned by get_cluster_matrix is `x_vital_all_inter`.
spot_frame = x_vital_all_inter[x_vital_all_inter.FallNr==test_pat]
# identifiers (first index level) that occur for the test patient
index = sorted(set(map(operator.itemgetter(0), spot_frame.index.tolist())))
print(index)
# note: some identifier might be missing (e.g. ICP for patient index)
for i in index:
    # first `days_interval` normalised daily values of this identifier
    df_subset = spot_frame.loc[i]['Wert_norm'][:days_interval] #  = 5
    print(df_subset)

In [None]:
# Matrix row of the test patient; rows follow the sorted FallNr order.
# The original cell referred to `X_vital_all`, which is never defined in this
# notebook; the daily vital-sign matrix is `X_vital_all_inter`.
i = sorted(df.FallNr.unique()).index(test_pat)
X_vital_all_inter[i]