In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score, f1_score

import warnings
warnings.filterwarnings(action='ignore')

- HAEMATOCRIT /Continuous /35.1 / Patient laboratory test result of haematocrit
- HAEMOGLOBINS/Continuous/11.8 / Patient laboratory test result of haemoglobins
- ERYTHROCYTE/Continuous/4.65 /  Patient laboratory test result of erythrocyte
- LEUCOCYTE/Continuous/6.3/ Patient laboratory test result of leucocyte
- THROMBOCYTE/Continuous/310/ Patient laboratory test result of thrombocyte
- MCH/Continuous /25.4/ Patient laboratory test result of MCH
- MCHC/Continuous/33.6/ Patient laboratory test result of MCHC
- MCV/Continuous /75.5/ Patient laboratory test result of MCV
- AGE/Continuous/12/ Patient age
- SEX/Nominal – Binary/F/ Patient gender
- SOURCE/Nominal/{in, out}/ The class target: in = in-care patient, out = out-care patient


In [2]:
# Load the patient laboratory results from a CSV file located relative to the working directory.
data = pd.read_csv('data-ori.csv')

In [3]:
# Preview the raw frame as the cell's rich output.
data

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX,SOURCE
0,35.1,11.8,4.65,6.3,310,25.4,33.6,75.5,1,F,out
1,43.5,14.8,5.39,12.7,334,27.5,34.0,80.7,1,F,out
2,33.5,11.3,4.74,13.2,305,23.8,33.7,70.7,1,F,out
3,39.1,13.7,4.98,10.5,366,27.5,35.0,78.5,1,F,out
4,30.9,9.9,4.23,22.1,333,23.4,32.0,73.0,1,M,out
...,...,...,...,...,...,...,...,...,...,...,...
4407,32.8,10.4,3.49,8.1,72,29.8,31.7,94.0,92,F,in
4408,33.7,10.8,3.67,6.7,70,29.4,32.0,91.8,92,F,in
4409,33.2,11.2,3.47,7.2,235,32.3,33.7,95.7,93,F,out
4410,31.5,10.4,3.15,9.1,187,33.0,33.0,100.0,98,F,in


In [4]:
# Summarize column dtypes and non-null counts to spot missing values and non-numeric columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4412 entries, 0 to 4411
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HAEMATOCRIT   4412 non-null   float64
 1   HAEMOGLOBINS  4412 non-null   float64
 2   ERYTHROCYTE   4412 non-null   float64
 3   LEUCOCYTE     4412 non-null   float64
 4   THROMBOCYTE   4412 non-null   int64  
 5   MCH           4412 non-null   float64
 6   MCHC          4412 non-null   float64
 7   MCV           4412 non-null   float64
 8   AGE           4412 non-null   int64  
 9   SEX           4412 non-null   object 
 10  SOURCE        4412 non-null   object 
dtypes: float64(7), int64(2), object(2)
memory usage: 379.3+ KB


## 1. Preprocessing

In [6]:
def preprocess_inputs(df, train_size=0.75, random_state=1):
    """Encode, split, and standardize the raw frame for modelling.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with a 'SEX' column coded 'F'/'M' and a 'SOURCE' target
        column. Every other column is used as a feature and passed to
        ``StandardScaler``. The input frame is not modified.
    train_size : float or int, default 0.75
        Forwarded to ``train_test_split`` as ``train_size``.
    random_state : int, default 1
        Seed forwarded to ``train_test_split`` so the shuffle is repeatable.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized features; original row index and column names are kept.
    y_train, y_test : pandas.Series
        The matching 'SOURCE' labels, unencoded.

    Raises
    ------
    ValueError
        If 'SEX' contains anything other than 'F' or 'M'.
    """
    df = df.copy()
    
    # Binary encoding. An explicit map + cast is used instead of
    # Series.replace, whose implicit object -> int downcasting is deprecated
    # in recent pandas; unmapped values surface here rather than as an opaque
    # conversion error inside the scaler.
    sex_codes = df['SEX'].map({'F': 0, 'M': 1})
    unmapped = sex_codes.isna()
    if unmapped.any():
        bad_values = sorted(df.loc[unmapped, 'SEX'].astype(str).unique())
        raise ValueError("Unexpected SEX value(s): {}".format(bad_values))
    df['SEX'] = sex_codes.astype(int)
    
    # Split df into X and y
    y = df['SOURCE']
    X = df.drop('SOURCE', axis=1)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )
    
    # Scale X. The scaler is fitted on the training rows only so that no
    # statistics from the test rows leak into the transformation.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [7]:
# Build the standardized train/test feature frames and their matching SOURCE labels.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [8]:
# Inspect the standardized training features.
X_train

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX
3512,-0.351149,-0.368628,-0.594901,-0.902950,-1.037460,0.411407,-0.180468,0.574485,0.953749,0.960001
2388,1.599311,1.504205,1.584296,0.581091,1.625958,-0.296269,-0.018062,-0.311836,0.212817,-1.041665
1720,-1.191865,-1.040927,-1.334044,-0.824843,0.220506,0.746621,0.550362,0.632604,-0.342883,-1.041665
1466,-0.166192,-0.272585,-0.021428,0.424876,0.819124,-0.482500,-0.505282,-0.340896,-0.528116,0.960001
1546,-0.687435,-0.560714,-0.684108,-0.102349,-0.343410,0.262422,0.469159,0.080470,-0.481807,-1.041665
...,...,...,...,...,...,...,...,...,...,...
2895,0.035580,0.063564,-0.238074,-0.785789,0.524153,0.485899,0.144345,0.516365,0.583283,-1.041665
2763,0.001951,-0.272585,-0.531182,-0.707682,0.177128,0.485899,-1.154909,1.155679,0.490666,-1.041665
905,1.330282,1.456184,0.755946,-0.492886,0.897205,0.932852,0.712769,0.748843,-0.944890,0.960001
3980,-0.536107,-0.560714,-0.556670,-0.707682,-0.421490,0.001700,-0.261672,0.094999,1.324215,0.960001


In [9]:
# Check the class balance of the training labels.
y_train.value_counts()

out    1957
in     1352
Name: SOURCE, dtype: int64

## 2. Training

In [10]:
# Seed shared by every estimator that draws random numbers while fitting, so
# the reported scores are reproducible on Restart & Run All.
MODEL_SEED = 1

# Keys are left-padded to a common width so the printed report lines align.
# LogisticRegression is left unseeded: its default solver does not use
# random_state.
models = {
    "Logistic Regression": LogisticRegression(),
    "      Decision Tree": DecisionTreeClassifier(random_state=MODEL_SEED),
    "     Neural Network": MLPClassifier(random_state=MODEL_SEED),
    "      Random Forest": RandomForestClassifier(random_state=MODEL_SEED),
    "  Gradient Boosting": GradientBoostingClassifier(random_state=MODEL_SEED)
}

# Fit every candidate on the same standardized training split.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

Logistic Regression trained.
      Decision Tree trained.
     Neural Network trained.
      Random Forest trained.
  Gradient Boosting trained.


## 3. Results

In [11]:
# Report held-out accuracy, as a percentage, for each fitted classifier.
for label, clf in models.items():
    test_accuracy = accuracy_score(y_test, clf.predict(X_test))
    percent = test_accuracy * 100
    print(label + " Accuracy: {:.2f}%".format(percent))

Logistic Regression Accuracy: 71.08%
      Decision Tree Accuracy: 66.46%
     Neural Network Accuracy: 73.16%
      Random Forest Accuracy: 73.62%
  Gradient Boosting Accuracy: 73.16%


In [12]:
# Report the held-out F1 score for each fitted classifier, treating the
# 'in' label as the positive class.
for label, clf in models.items():
    test_f1 = f1_score(y_test, clf.predict(X_test), pos_label='in')
    print(label + " F1-Score: {:.5f}".format(test_f1))

Logistic Regression F1-Score: 0.59466
      Decision Tree F1-Score: 0.58145
     Neural Network F1-Score: 0.64165
      Random Forest F1-Score: 0.65150
  Gradient Boosting F1-Score: 0.63547
