<a href="https://colab.research.google.com/github/m-pedram/kaggleCovid/blob/main/Report01/Kaggle_Covid_Basic_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ارایه‌ی یک روش بهبود یافته برای پیش‌بینی ضرورت بستری‌شدن بیماران کووید ۱۹
# در بخش مراقبت‌های ویژه با استفاده از تکنیک‌های ترکیبی داده‌کاوی


---


دانشجو:	مهنام پدرام

دانشکده:	مکانیک، برق و کامپیوتر

گروه تخصصی:	مهندسی نرم‌افزار

استاد راهنما:	خانم دکتر مریم رستگارپور     


In [None]:
# imports 
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, \
    balanced_accuracy_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, NuSVC
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.decomposition import PCA

# Reading and preprocessing the Data

The data were cleaned manually beforehand. The cleaned table has been uploaded to a GitHub repository and is read directly from there.

In [None]:
# Load the manually cleaned table straight from the GitHub repository.
# (Optional Colab alternative, kept for reference: mount Google Drive first.)
# from google.colab import drive
# drive.mount('/content/drive')

DATA_URL = 'https://raw.githubusercontent.com/m-pedram/kaggleCovid/main/Report03/rawData.csv'
data = pd.read_csv(DATA_URL)

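Missing values are filled separately within each patient's records by copying neighbouring entries. Forward filling (padding) is applied first; backfilling is used only for the values that are still missing afterwards, i.e. those with no earlier entry to copy from.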

In [None]:
# Fill gaps inside each patient's own rows: forward fill first, then back fill
# for leading gaps that have no earlier value to copy from.
# Grouping by the identifier covers every patient actually present in `data`
# (instead of a hard-coded ID range), and the per-patient frames are
# concatenated once rather than growing `filledData` inside the loop.
# `sort=True` keeps the patients in ascending identifier order.
patient_frames = [
  patient_rows.ffill().bfill()
  for _, patient_rows in data.groupby('PATIENT_VISIT_IDENTIFIER', sort=True)
]
# pd.concat rejects an empty list, so fall back to an (empty) copy of `data`.
filledData = pd.concat(patient_frames) if patient_frames else data.copy()

In [None]:
# Replace whatever is still missing after the per-patient fill with the most
# frequent value of the respective column. The result is a NumPy array.
# NOTE(review): the imputer is fitted on all rows, before the train/test split.
imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
table = imp.fit_transform(filledData)

Remove the time-series structure: the four consecutive rows of each patient are concatenated into a single feature vector, so that every patient is represented by exactly one row.

In [None]:
# Flatten every group of four consecutive rows of `table` into one row.
# Per group the layout is:
#   row 0 -> columns 0..84   (includes column 0, which is dropped later by
#                             the `[:, 1:-1]` feature slice)
#   row 1 -> columns 1..84
#   row 2 -> columns 1..84
#   row 3 -> columns 1..85   (its last column ends up as the final element,
#                             which is used as the prediction target)
# Done with a single reshape instead of growing the array with np.vstack in a
# loop; the result is always 2-D, even when there is only one group.
# Trailing rows that do not complete a group of four are ignored.
n_patients = table.shape[0] // 4
windows = table[:n_patients * 4].reshape(n_patients, 4, table.shape[1])
clean_table = np.concatenate(
  (windows[:, 0, 0:85], windows[:, 1, 1:85], windows[:, 2, 1:85], windows[:, 3, 1:86]),
  axis=1,
)

# Classifiers

In [None]:
#functions

def random_imputation(df, feature):
    """Write random draws from the observed values of a column into ``<feature>_imp``.

    For every row where ``df[feature]`` is null, a value sampled with
    replacement (via ``np.random.choice``) from the non-null entries of that
    column is stored in the column ``feature + '_imp'``. Only the rows with a
    missing value are assigned; ``df[feature]`` itself is left untouched.

    Args:
        df: DataFrame to modify in place.
        feature: name of the column whose gaps are sampled for.

    Returns:
        The same ``df`` object that was passed in.
    """
    missing_mask = df[feature].isnull()
    donors = df.loc[~missing_mask, feature]
    draws = np.random.choice(donors, missing_mask.sum(), replace=True)
    df.loc[missing_mask, feature + '_imp'] = draws
    return df

def knn_imputation(X):
  """Impute missing entries of ``X`` with a k-nearest-neighbours imputer.

  Uses ``KNNImputer`` with two neighbours and uniform weights, fitted on
  ``X`` itself.

  Args:
    X: data containing missing values, passed to ``KNNImputer.fit_transform``.

  Returns:
    The output of ``KNNImputer.fit_transform(X)``.
  """
  return KNNImputer(weights="uniform", n_neighbors=2).fit_transform(X)


def logistic_regression(X_tr, X_ts, y_tr, cWeight):
    """Fit a logistic-regression classifier and predict the test samples.

    Args:
        X_tr: training features.
        X_ts: test features to predict.
        y_tr: training labels; must offer ``.reshape`` (it is flattened to 1-D).
        cWeight: forwarded unchanged as ``class_weight`` to ``LogisticRegression``.

    Returns:
        Predicted labels for ``X_ts``.
    """
    flat_labels = y_tr.reshape(-1)
    model = LogisticRegression(class_weight=cWeight, max_iter=1000, random_state=0)
    model.fit(X_tr, flat_labels)
    return model.predict(X_ts)


def support_vector_machine(X_tr, X_ts, y_tr, cWeight):
    """Fit a standardised RBF-kernel SVM and predict the test samples.

    The features pass through a ``StandardScaler`` before reaching the
    ``SVC``; both steps live in one pipeline, so the scaler is fitted on the
    training features only.

    Args:
        X_tr: training features.
        X_ts: test features to predict.
        y_tr: training labels; must offer ``.reshape`` (it is flattened to 1-D).
        cWeight: forwarded unchanged as ``class_weight`` to ``SVC``.

    Returns:
        Predicted labels for ``X_ts``.
    """
    svc = SVC(kernel='rbf', gamma='auto', class_weight=cWeight)
    model = make_pipeline(StandardScaler(), svc)
    model.fit(X_tr, y_tr.reshape(-1))
    return model.predict(X_ts)


def metrics_report(y_true, y_pred):
  """Score predictions against the supplied ground-truth labels.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted labels, aligned with ``y_true``.

  Returns:
    Tuple ``(accuracy, f1, balanced_accuracy, average_precision)``.
  """
  # Score against the ``y_true`` argument. The previous version read the
  # notebook-level ``y_test`` instead, silently ignoring the parameter.
  accuracy = accuracy_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)
  balancedAccuracy = balanced_accuracy_score(y_true, y_pred)
  # NOTE(review): ``y_pred`` is used as the score argument here; the callers
  # in this notebook pass hard ``predict`` outputs rather than probabilities
  # or decision values.
  precision = average_precision_score(y_true, y_pred)
  return accuracy, f1, balancedAccuracy, precision

# Train and Prediction

In [None]:
# Split the flattened table into features and target: the features are every
# column except the first and the last one; the last column is the target.
inputs, outputs = clean_table[:, 1:-1], clean_table[:, -1]
for array in (inputs, outputs):
  print(array.shape)

(251, 336)
(251,)


In [None]:
# Split into equally sized train and test sets. With shuffle=False the split
# follows the row order of `inputs`: the first part trains, the rest tests.
X_train, X_test, y_train, y_test = train_test_split(
  inputs, outputs, test_size=.5, shuffle=False)
for subset in (X_train, X_test, y_train, y_test):
  print(subset.shape)

(125, 336)
(126, 336)
(125,)
(126,)


In [None]:
# PCA: fit on the training split only, keeping as many components as needed
# to explain 80% of the variance, then project both splits.
pca = PCA(n_components=.8).fit(X_train)
train_low_dim, test_low_dim = pca.transform(X_train), pca.transform(X_test)
train_low_dim.shape

(125, 15)

In [None]:
# Train logistic-regression classifiers with and without balancing class
# weights, each on the raw features and on the PCA-reduced features.
print (' ---- Logistic regression experiments ----')

# (title, training features, test features, class_weight)
# Unweighted runs pass the None object: the string 'None' is not a valid
# `class_weight` and is rejected by current scikit-learn releases.
logistic_experiments = [
  ('No balancing weights, no PCA', X_train, X_test, None),
  ('No balancing weights, with PCA', train_low_dim, test_low_dim, None),
  ('Weighted, no PCA', X_train, X_test, 'balanced'),
  ('Weighted, with PCA', train_low_dim, test_low_dim, 'balanced'),
]

for title, train_features, test_features, class_weighting in logistic_experiments:
  y_predicted = logistic_regression(train_features, test_features, y_train, class_weighting)
  acc, f_1, balancedAcc, pr = metrics_report(y_test, y_predicted)

  print('')
  print(title)
  print ('accuracy:', acc)
  print ('f1_score:', f_1)
  print ('balanced accuracy:', balancedAcc)
  print ('average precision:', pr)

 ---- Logistic regression experiments ----

No balancing weights, no PCA
accuracy: 0.7142857142857143
f1_score: 0.14285714285714285
balanced accuracy: 0.5096409574468085
average precision: 0.25828373015873013

No balancing weights, with PCA
accuracy: 0.7222222222222222
f1_score: 0.22222222222222224
balanced accuracy: 0.5355718085106382
average precision: 0.2743818681318681

Weighted, no PCA
accuracy: 0.6904761904761905
f1_score: 0.2909090909090909
balanced accuracy: 0.5452127659574468
average precision: 0.2774327122153209

Weighted, with PCA
accuracy: 0.6746031746031746
f1_score: 0.3278688524590164
balanced accuracy: 0.555186170212766
average precision: 0.2823617952928298


In [None]:
# Train the standardised RBF-kernel SVM (see `support_vector_machine`) with
# and without balancing class weights, on raw and on PCA-reduced features.
print (' ---- SVM with RBF kernel experiments ----')

# (title, training features, test features, class_weight)
svm_experiments = [
  ('No balancing weights, no PCA', X_train, X_test, None),
  ('No balancing weights, with PCA', train_low_dim, test_low_dim, None),
  ('Weighted, no PCA', X_train, X_test, 'balanced'),
  ('weighted, with PCA', train_low_dim, test_low_dim, 'balanced'),
]

for title, train_features, test_features, class_weighting in svm_experiments:
  y_predicted = support_vector_machine(train_features, test_features, y_train, class_weighting)
  acc, f_1, balancedAcc, pr = metrics_report(y_test, y_predicted)

  print('')
  print(title)
  print ('accuracy:', acc)
  print ('f1_score:', f_1)
  print ('balanced accuracy:', balancedAcc)
  print ('average precision:', pr)

 ---- SVM with RBF kernel experiments ----

No balancing weights, no PCA
accuracy: 0.746031746031746
f1_score: 0.0
balanced accuracy: 0.5
average precision: 0.25396825396825395

No balancing weights, with PCA
accuracy: 0.7380952380952381
f1_score: 0.05714285714285714
balanced accuracy: 0.5049867021276595
average precision: 0.2564484126984127

Weighted, no PCA
accuracy: 0.6984126984126984
f1_score: 0.3666666666666667
balanced accuracy: 0.5814494680851063
average precision: 0.30171130952380953

weighted, with PCA
accuracy: 0.6746031746031746
f1_score: 0.3492063492063492
balanced accuracy: 0.5654920212765957
average precision: 0.28864247311827956


In [None]:
# Train Nu-SVM classifiers (NuSVC, nu=0.1, gamma='auto') with and without
# balancing class weights, on raw and on PCA-reduced features.
# Unlike the SVC experiments, the features are not standardised here.
# NOTE(review): the recorded outputs of the weighted and unweighted runs are
# identical; confirm that `class_weight` has the intended effect for NuSVC.

print (' ---- Nonlinear SVM experiments ----')

# (title, training features, test features, class_weight)
nusvc_experiments = [
  ('No balancing weights, no PCA', X_train, X_test, None),
  ('No balancing weights, with PCA', train_low_dim, test_low_dim, None),
  ('weighted, no PCA', X_train, X_test, 'balanced'),
  ('weighted, with PCA', train_low_dim, test_low_dim, 'balanced'),
]

for title, train_features, test_features, class_weighting in nusvc_experiments:
  clf = NuSVC(gamma="auto", nu=0.1, class_weight=class_weighting)
  clf.fit(train_features, y_train)
  y_predicted = clf.predict(test_features)
  acc, f_1, balancedAcc, pr = metrics_report(y_test, y_predicted)

  print('')
  print(title)
  print ('accuracy:', acc)
  print ('f1_score:', f_1)
  print ('balanced accuracy:', balancedAcc)
  print ('average precision:', pr)

 ---- Nonlinear SVM experiments ----

No balancing weights, no PCA
accuracy: 0.7222222222222222
f1_score: 0.3396226415094339
balanced accuracy: 0.5767952127659575
average precision: 0.3030753968253968

No balancing weights, with PCA
accuracy: 0.6904761904761905
f1_score: 0.3157894736842105
balanced accuracy: 0.5555186170212766
average precision: 0.2837896825396825

weighted, no PCA
accuracy: 0.7222222222222222
f1_score: 0.3396226415094339
balanced accuracy: 0.5767952127659575
average precision: 0.3030753968253968

weighted, with PCA
accuracy: 0.6904761904761905
f1_score: 0.3157894736842105
balanced accuracy: 0.5555186170212766
average precision: 0.2837896825396825
