In [None]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import shuffle
import pandas as pd

In [None]:
xlsx_path = '/content/drive/MyDrive/blood_data/cbc-dataset-refined.xlsx'

dataset = pd.read_excel(xlsx_path)

In [None]:
dataset

Unnamed: 0,ID,WBC,LYMp,MIDp,NEUTp,LYMn,MIDn,NEUTn,RBC,HGB,...,MCHC,RDWSD,RDWCV,PLT,MPV,PDW,PCT,PLCR,deviation,pneumonia
0,1,10.0,43.2,6.7,50.1,4.3,0.7,5.0,2.77,7.3,...,30.1,35.3,11.4,189.0,9.2,12.5,0.17,22.30,1,0
1,2,10.0,42.4,5.3,52.3,4.2,0.5,5.3,2.84,7.3,...,20.2,35.3,11.4,180.0,8.9,12.5,0.16,19.50,1,0
2,3,7.2,30.7,8.6,60.7,2.2,0.6,4.4,3.97,9.0,...,29.5,37.2,13.7,148.0,10.1,14.3,0.14,30.50,1,0
3,4,6.0,30.2,6.3,63.5,1.8,0.4,3.8,4.22,3.8,...,29.8,46.5,17.0,143.0,8.6,11.3,0.12,16.40,1,0
4,5,4.2,39.1,7.2,53.7,1.6,0.3,2.3,3.93,0.4,...,29.7,42.7,15.1,236.0,19.5,12.8,0.22,24.80,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,2.7,43.4,7.1,49.5,1.2,0.2,1.3,4.77,13.2,...,31.7,37.2,12.2,169.0,10.1,14.3,0.17,28.60,1,0
496,497,6.2,35.0,6.9,57.6,2.2,0.4,3.6,4.82,11.6,...,31.4,37.2,13.8,177.0,9.9,13.6,0.18,29.00,1,0
497,498,8.4,29.2,7.3,63.5,2.0,0.5,4.3,4.40,9.9,...,33.0,38.0,10.6,133.1,9.6,12.8,0.12,24.60,1,0
498,499,7.4,19.0,8.5,72.5,0.8,0.3,2.9,3.34,7.4,...,30.9,36.2,11.0,125.0,10.7,15.9,0.13,33.60,1,0


In [None]:
data = shuffle(dataset)
X = data.drop('pneumonia', axis='columns')
y = data['pneumonia']

In [None]:
svc_scores = cross_val_score(SVC(), X, y)
print(svc_scores, svc_scores.mean())

[0.86 0.86 0.86 0.85 0.85] 0.8560000000000001


In [None]:
log_scores = cross_val_score(LogisticRegression(max_iter=10000), X, y)
print(log_scores, log_scores.mean())

[0.91 0.94 0.93 0.9  0.89] 0.914


In [None]:
sgd_scores = cross_val_score(SGDClassifier(), X, y)
print(sgd_scores, sgd_scores.mean())

[0.87 0.9  0.91 0.87 0.85] 0.8800000000000001


In [None]:
rf_scores = cross_val_score(RandomForestClassifier(), X, y)
print(rf_scores, rf_scores.mean())

[0.97 0.96 0.99 0.96 0.97] 0.97


In [None]:
print('SVC score: ' + str(svc_scores.mean()))
print('Log score: ' + str(log_scores.mean()))
print('SGD score: ' + str(sgd_scores.mean()))
print('RF score: ' + str(rf_scores.mean()))

SVC score: 0.8560000000000001
Log score: 0.914
SGD score: 0.8800000000000001
RF score: 0.97


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [None]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_model.score(X_test, y_test)

0.98

In [None]:
log_model = LogisticRegression(max_iter=10000)
log_model.fit(X_train, y_train)
log_model.score(X_test, y_test)

0.93

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)
knn_model.score(X_test, y_test)

0.9

In [None]:
sgd_model = SGDClassifier()
sgd_model.fit(X_train, y_train)
sgd_model.score(X_test, y_test)

0.85

In [None]:
dt_model = DecisionTreeClassifier()

dt_model.fit(X_train, y_train)
dt_model.score(X_test, y_test)

0.98

In [None]:
y_pred_rf = rf_model.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)

In [None]:
print("Random forest Accuracy:", rf_accuracy)
print("Random forest Precision:", rf_precision)
print("Random forest Recall:", rf_recall)
print("Random forest F1 Score:", rf_f1)

Random forest Accuracy: 0.98
Random forest Precision: 0.9285714285714286
Random forest Recall: 0.9285714285714286
Random forest F1 Score: 0.9285714285714286


In [None]:
y_pred_log = log_model.predict(X_test)

log_accuracy = accuracy_score(y_test, y_pred_log)
log_precision = precision_score(y_test, y_pred_log)
log_recall = recall_score(y_test, y_pred_log)
log_f1 = f1_score(y_test, y_pred_log)

In [None]:
print("Logistic Regression Accuracy:", log_accuracy)
print("Logistic Regression Precision:", log_precision)
print("Logistic Regression Recall:", log_recall)
print("Logistic Regression F1 Score:", log_f1)

Logistic Regression Accuracy: 0.93
Logistic Regression Precision: 0.7692307692307693
Logistic Regression Recall: 0.7142857142857143
Logistic Regression F1 Score: 0.7407407407407408


In [None]:
y_pred_sgd = sgd_model.predict(X_test)

sgd_accuracy = accuracy_score(y_test, y_pred_sgd)
sgd_precision = precision_score(y_test, y_pred_sgd)
sgd_recall = recall_score(y_test, y_pred_sgd)
sgd_f1 = f1_score(y_test, y_pred_sgd)

In [None]:
print("SGD Accuracy:", sgd_accuracy)
print("SGD Precision:", sgd_precision)
print("SGD Recall:", sgd_recall)
print("SGD F1 Score:", sgd_f1)

SGD Accuracy: 0.85
SGD Precision: 0.4444444444444444
SGD Recall: 0.2857142857142857
SGD F1 Score: 0.34782608695652173


In [None]:
y_pred_knn = knn_model.predict(X_test)

knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_precision = precision_score(y_test, y_pred_knn)
knn_recall = recall_score(y_test, y_pred_knn)
knn_f1 = f1_score(y_test, y_pred_knn)

In [None]:
print("KNN Accuracy:", knn_accuracy)
print("KNN Precision:", knn_precision)
print("KNN Recall:", knn_recall)
print("KNN F1 Score:", knn_f1)

KNN Accuracy: 0.9
KNN Precision: 0.8333333333333334
KNN Recall: 0.35714285714285715
KNN F1 Score: 0.5


In [None]:
y_pred_dt = dt_model.predict(X_test)

dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_precision = precision_score(y_test, y_pred_dt)
dt_recall = recall_score(y_test, y_pred_dt)
dt_f1 = f1_score(y_test, y_pred_dt)

In [None]:
print("Decision Tree Accuracy:", dt_accuracy)
print("Decision Tree Precision:", dt_precision)
print("Decision Tree Recall:", dt_recall)
print("Decision Tree F1 Score:", dt_f1)

Decision Tree Accuracy: 0.98
Decision Tree Precision: 0.875
Decision Tree Recall: 1.0
Decision Tree F1 Score: 0.9333333333333333


In [None]:
y_pred_dt

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0])

In [None]:
import pickle

with open('/content/drive/MyDrive/decision-tree.pkl', "wb") as file:
    pickle.dump(dt_model, file)