In [1]:
from collections import Counter
from imblearn.over_sampling import ADASYN, BorderlineSMOTE, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from matplotlib import pyplot
import pandas as pd
from numpy import where, mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv('corona_blood.csv', index_col=None)
df.head(5)
df = df.fillna(0)

Unnamed: 0,Patient.age.quantile,SARS.Cov.2.exam.result,Hematocrit,Hemoglobin,Platelets,Mean.platelet.volume,Red.blood.Cells,Lymphocytes,Mean.corpuscular.hemoglobin.concentration..MCHC.,Leukocytes,Basophils,Mean.corpuscular.hemoglobin..MCH.,Eosinophils,Mean.corpuscular.volume..MCV.,Monocytes,Red.blood.cell.distribution.width..RDW.
0,2,1,0.991838,0.792188,-0.341548,1.469188,1.653476,-0.048383,-0.452899,-0.420197,1.303529,-1.442245,-0.498393,-1.396114,1.933339,0.967144
1,15,1,-0.495919,-0.398276,-0.718402,-0.438097,-0.56795,-0.935404,0.244149,-0.820919,-1.140144,0.334989,-0.66695,0.22628,-0.456613,-0.978899
2,14,1,-0.312811,-0.6489,-0.027502,-0.101517,-0.656101,-0.099557,-1.448681,-0.968407,-0.529226,0.021361,0.175837,0.807138,1.513128,0.347948
3,14,1,-0.518807,-0.272964,-0.21593,0.459449,-0.515058,-0.457777,0.941197,-0.57325,-0.223767,0.439533,-0.70909,0.066045,2.537393,-0.801985
4,7,1,0.694287,0.729532,-0.743526,0.235063,0.595655,-0.636887,0.343725,-0.606644,-0.223767,0.125903,-0.119138,-0.014074,0.882811,-0.713529


In [5]:
# Exemplo do Uso de SMOTE para oversampling

X = df.drop(columns=["SARS.Cov.2.exam.result"])
y = df['SARS.Cov.2.exam.result'].values
#X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
#n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
counter = Counter(y)
print(counter)
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
counter = Counter(y)
print(counter)


Counter({0: 517, 1: 81})
Counter({1: 517, 0: 517})


In [7]:
# Exemplo de SMOTE usado para oversampling seguido por subsampling, como proposto no artigo 
# (SMOTE: Synthetic Minority Over-sampling Technique, 2002.)
X = df.drop(columns=["SARS.Cov.2.exam.result"])
y = df['SARS.Cov.2.exam.result'].values
counter = Counter(y)
print(counter)
over = SMOTE(sampling_strategy=0.2) 
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X, y = pipeline.fit_resample(X, y)
counter = Counter(y)
print(counter)


Counter({0: 517, 1: 81})
Counter({0: 206, 1: 103})


In [10]:
# Utilização de Smote para oversampling junto à Classificação por Árvores de Decisão
# define dataset
X = df.drop(columns=["SARS.Cov.2.exam.result"])
y = df['SARS.Cov.2.exam.result'].values
steps = [('over', SMOTE()), ('model', DecisionTreeClassifier())]
pipeline = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.692


In [18]:
# Utilização de Smote para oversampling junto à Classificação por Árvores de Decisão com subsampling

X = df.drop(columns=["SARS.Cov.2.exam.result"])
y = df['SARS.Cov.2.exam.result'].values
model = DecisionTreeClassifier()
over = SMOTE(sampling_strategy=0.4)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('over', over), ('under', under), ('model', model)]
pipeline = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.679


In [20]:
#Análise de KNN e influência na curva ROC

X = df.drop(columns=["SARS.Cov.2.exam.result"])
y = df['SARS.Cov.2.exam.result'].values

k_values = [1, 2, 3, 4, 5, 6, 7]
for k in k_values:
    model = DecisionTreeClassifier()
    over = SMOTE(sampling_strategy=0.5, k_neighbors=k)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('over', over), ('under', under), ('model', model)]
    pipeline = Pipeline(steps=steps)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    score = mean(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

> k=1, Mean ROC AUC: 0.657
> k=2, Mean ROC AUC: 0.687
> k=3, Mean ROC AUC: 0.662
> k=4, Mean ROC AUC: 0.641
> k=5, Mean ROC AUC: 0.702
> k=6, Mean ROC AUC: 0.683
> k=7, Mean ROC AUC: 0.689


In [27]:
X = df.drop(columns=["SARS.Cov.2.exam.result"])
y = df['SARS.Cov.2.exam.result'].values
model = DecisionTreeClassifier()
over = BorderlineSMOTE(sampling_strategy=0.4)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('over', over), ('under', under), ('model', model)]
pipeline = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))


Mean ROC AUC: 0.699


In [25]:
# Análise com Adaptive Synthetic Sampling para criação de novos dados

X = df.drop(columns=["SARS.Cov.2.exam.result"])
y = df['SARS.Cov.2.exam.result'].values
model = DecisionTreeClassifier()
over = ADASYN(sampling_strategy=0.4)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('over', over), ('under', under), ('model', model)]
pipeline = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.675
