# Current Assumptions
1. The admission time column (`admission_datetime`) is dropped.
2. TODO: double-check that the mean is calculated correctly.

# Setup

https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html

In [1]:
import numpy as np
from decimal import *
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import warnings
warnings.filterwarnings('ignore')  # "error", "ignore", "always", "default", "module" or "once"

# Preprocess


In [2]:
def preprocess(df, filename, out_dir=None):
  """Impute missing values, one-hot encode, and write the results to CSV.

  Columns that contain at least one missing value are separated into two
  groups:
    1. columns whose name starts with 'pmhx_' are filled with their most
       frequent value (0 if the column has no non-null value at all);
    2. every other column is filled with its median, rounded to 2 decimals.

  The 'admission_datetime' column is dropped before imputation. The imputed
  frame is written to `<out_dir><filename>_Filled.csv`; the frame with 'sex'
  and 'ed_diagnosis' one-hot encoded (prefixes 's' and 'd') is written to
  `<out_dir><filename>_Onehot.csv`.

  Args:
    df: Input DataFrame. It is not modified; all work happens on a copy.
    filename: Output file stem. It is concatenated directly onto the
      directory, so callers pass it with a leading separator. '\\Train'
      additionally drops 'PATIENT ID' and 'hospital_outcome' from the
      encoded frame; '\\fixed_test' drops 'PATIENT ID' and returns its values.
    out_dir: Directory prefix for the two output files. Defaults to the
      notebook-level `path` variable, which must exist by the time the
      function is called.

  Returns:
    The list of 'PATIENT ID' values when filename is '\\fixed_test',
    otherwise an empty list.
  """
  if out_dir is None:
    # Fall back to the notebook-level working directory.
    out_dir = path

  df = df.drop(columns=['admission_datetime'])

  columns_with_nulls = [c for c in df.columns if df[c].isnull().values.any()]
  for c in columns_with_nulls:
    if c.startswith('pmhx_'):
      # value_counts() ignores NaN and is sorted by frequency, so idxmax()
      # picks the same value the first-strictly-greater scan would.
      counts = df[c].value_counts()
      fill_value = counts.idxmax() if len(counts) else 0
    else:
      fill_value = round(df[c].median(axis=0, skipna=True), 2)
    df[c] = df[c].fillna(fill_value)

  df.to_csv(out_dir + filename + '_Filled.csv', index=False)

  # One-hot encode sex and ed_diagnosis directly on the in-memory frame;
  # re-reading the CSV that was just written is unnecessary.
  df = pd.get_dummies(df, prefix=['s', 'd'], columns=['sex', 'ed_diagnosis'])

  Patient_id = []
  if filename == '\\Train':
    # Identifier and target must not be part of the feature matrix X.
    df = df.drop(columns=['PATIENT ID', 'hospital_outcome'])
  elif filename == '\\fixed_test':
    Patient_id = df['PATIENT ID'].tolist()
    df = df.drop(columns=['PATIENT ID'])

  df.to_csv(out_dir + filename + '_Onehot.csv', index=False)
  return Patient_id

In [3]:
import os 
# Current working directory of the kernel; the other cells build every CSV
# location by concatenating a '\\'-prefixed file name onto this string.
path = os.getcwd()
print(path)

C:\Users\Mimi\Desktop\107062274_郭冠廷


In [4]:
# Read the training attributes and the outcome export, then join them on the
# patient identifier (pandas' default inner join).
attr = pd.read_csv(path + '\\hm_hospitales_covid_structured_30d_train.csv').merge(
    pd.read_csv(path + '\\split_train_export_30d.csv'),
    on='PATIENT ID',
)

# The outcome column of the joined frame is the training target.
label = attr['hospital_outcome']

# Persist the joined frame, then write the imputed / one-hot-encoded features.
attr.to_csv(path + '\\Train_Joined.csv', index=False)
preprocess(attr, '\\Train')

[]

## This model

In [6]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold, KFold

# Unscaled training features written by preprocess(attr, '\\Train').
features_raw = pd.read_csv(path + '\\Train_Onehot.csv')
train_columns = features_raw.columns

skf = StratifiedKFold(n_splits=10)
svc = SVC(class_weight='balanced', C=7, gamma=0.0001, kernel='sigmoid')
model = Pipeline([('scaler', StandardScaler()), ('svc', svc)])

# Cross-validate the whole pipeline on the *unscaled* features so that the
# scaler is re-fit on the training part of every fold. Scaling the full
# training set first and cross-validating the bare SVC leaks the validation
# folds' statistics into training.
scores = cross_validate(model, features_raw, label, cv=skf, scoring=['recall', 'precision', 'f1'], n_jobs=-1) # n_jobs=-1 means use every core

# Final model: fit the scaler and the SVC on the complete training set.
# cross_validate works on clones, so `svc` is still unfitted at this point.
scaler = StandardScaler()
features = scaler.fit_transform(features_raw)
print(features.shape)
svc.fit(features, label)

# Preprocess the test set and keep its patient IDs for the submission file.
xfinal = pd.read_csv(path + '\\fixed_test.csv')
Patient_id = preprocess(xfinal, '\\fixed_test')
xfinal_final = pd.read_csv(path + '\\fixed_test_Onehot.csv')

# get_dummies runs separately on train and test, so a category present in
# only one of them yields different columns. Align to the training layout:
# columns missing from the test set become 0, columns unseen in training are
# dropped. NOTE(review): 0 is only a sensible fill for one-hot columns.
xfinal_final = xfinal_final.reindex(columns=train_columns, fill_value=0)
xfinal_final = scaler.transform(xfinal_final)
yfinal = svc.predict(xfinal_final)

df = pd.DataFrame({'PATIENT ID': Patient_id, 'hospital_outcome': yfinal})
df.to_csv(path + '\\107062274.csv', index = False)

(1834, 51)
