In [1]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import seaborn as sns
import time

os.getcwd()

# Load training and testing datasets
raw_train = pd.read_csv('train.csv')
raw_test = pd.read_csv('test.csv')


In [2]:
raw_train

Unnamed: 0,id,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
0,0,3556.0,2489.0,265.19,77.53,176.55,0.00,4.20,307.91,52,0,7515.0,1
1,1,1906.0,134.0,1442.61,551.90,876.07,112.10,168.15,1735.48,20,1,1756.0,0
2,2,1586.0,71.0,1332.74,684.20,655.26,244.95,216.52,1820.04,28,1,1311.0,0
3,3,683.0,94.0,419.23,255.80,162.17,72.05,44.68,538.22,55,1,1443.0,0
4,4,1032.0,71.0,1102.72,480.27,625.30,188.78,130.77,1427.97,28,1,1542.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,82,626.0,68.0,1771.57,666.99,1117.48,360.21,118.84,2306.82,42,1,1521.0,0
83,83,1237.0,71.0,1348.53,428.09,924.69,120.02,48.67,1524.78,56,0,1345.0,0
84,84,634.0,1002.0,1300.00,558.00,724.00,67.00,105.00,1484.26,34,0,2926.0,1
85,85,112.0,884.0,942.83,378.49,567.06,116.77,31.81,1104.59,33,1,2352.0,1


In [3]:
# Check whether there are missing value(s) in datasets

print(raw_train.isnull().values.any())
print(raw_test.isnull().values.any())


True
False


In [4]:
# Handle the missing value in training dataset by dropping it

train_no_empty = raw_train.dropna()

In [5]:
train_no_empty

Unnamed: 0,id,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
0,0,3556.0,2489.0,265.19,77.53,176.55,0.00,4.20,307.91,52,0,7515.0,1
1,1,1906.0,134.0,1442.61,551.90,876.07,112.10,168.15,1735.48,20,1,1756.0,0
2,2,1586.0,71.0,1332.74,684.20,655.26,244.95,216.52,1820.04,28,1,1311.0,0
3,3,683.0,94.0,419.23,255.80,162.17,72.05,44.68,538.22,55,1,1443.0,0
4,4,1032.0,71.0,1102.72,480.27,625.30,188.78,130.77,1427.97,28,1,1542.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,82,626.0,68.0,1771.57,666.99,1117.48,360.21,118.84,2306.82,42,1,1521.0,0
83,83,1237.0,71.0,1348.53,428.09,924.69,120.02,48.67,1524.78,56,0,1345.0,0
84,84,634.0,1002.0,1300.00,558.00,724.00,67.00,105.00,1484.26,34,0,2926.0,1
85,85,112.0,884.0,942.83,378.49,567.06,116.77,31.81,1104.59,33,1,2352.0,1


In [6]:
# Check correlation between variables in the dataset
train_no_empty.corr()

Unnamed: 0,id,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
id,1.0,-0.254977,-0.104417,0.044808,0.029387,0.042884,-0.028807,-0.015051,0.024569,-0.015034,-0.173122,-0.081282,-0.010079
MO HLADR+ MFI (cells/ul),-0.254977,1.0,0.135928,0.119233,0.011627,0.204047,0.106839,0.157219,0.129647,0.11376,-0.016148,0.316647,-0.279773
Neu CD64+MFI (cells/ul),-0.104417,0.135928,1.0,-0.252522,-0.242675,-0.229616,-0.31909,-0.250854,-0.297098,0.074813,0.068765,0.692778,0.534729
CD3+T (cells/ul),0.044808,0.119233,-0.252522,1.0,0.922647,0.909429,0.499472,0.682187,0.976536,-0.421796,0.177676,-0.30109,-0.418312
CD8+T (cells/ul),0.029387,0.011627,-0.242675,0.922647,1.0,0.695437,0.445731,0.571847,0.893223,-0.381978,0.199878,-0.288176,-0.371941
CD4+T (cells/ul),0.042884,0.204047,-0.229616,0.909429,0.695437,1.0,0.490756,0.681692,0.902574,-0.35534,0.120464,-0.262928,-0.384135
NK (cells/ul),-0.028807,0.106839,-0.31909,0.499472,0.445731,0.490756,1.0,0.372678,0.657402,0.10282,-0.132514,-0.289564,-0.476298
CD19+ (cells/ul),-0.015051,0.157219,-0.250854,0.682187,0.571847,0.681692,0.372678,1.0,0.734878,-0.339316,0.16421,-0.329614,-0.365903
CD45+ (cells/ul),0.024569,0.129647,-0.297098,0.976536,0.893223,0.902574,0.657402,0.734878,1.0,-0.347545,0.127458,-0.338671,-0.477547
Age,-0.015034,0.11376,0.074813,-0.421796,-0.381978,-0.35534,0.10282,-0.339316,-0.347545,1.0,-0.207745,0.112061,0.026836


In [7]:
# Split attributes for model fitting and label for training dataset

train_no_empty_x = train_no_empty.drop(columns=['label','id','Sex 0M1F','Age'])
train_y = pd.DataFrame(train_no_empty['label'])

print(train_no_empty_x.shape)
print(train_y.shape)

(86, 9)
(86, 1)


In [8]:
# Normalize selected variables with the use of min-max normalization
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()

cols_to_norm = ['MO HLADR+ MFI (cells/ul)','Neu CD64+MFI (cells/ul)', 'CD3+T (cells/ul)', 'CD8+T (cells/ul)',
               'CD4+T (cells/ul)','NK (cells/ul)','CD19+ (cells/ul)', 'CD45+ (cells/ul)', 
                'Mono CD64+MFI (cells/ul)']

train_no_empty_x[cols_to_norm] = min_max_scaler.fit_transform(train_no_empty_x[cols_to_norm])


In [9]:
train_no_empty_x

Unnamed: 0,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Mono CD64+MFI (cells/ul)
0,0.853955,0.794764,0.051331,0.016293,0.092652,0.000000,0.000000,0.021693,1.000000
1,0.444830,0.033613,0.368112,0.205173,0.565870,0.127671,0.340385,0.335580,0.226253
2,0.365485,0.013251,0.338552,0.257851,0.416494,0.278974,0.440809,0.354173,0.166465
3,0.141582,0.020685,0.092775,0.087275,0.082924,0.082058,0.084043,0.072332,0.184200
4,0.228118,0.013251,0.276666,0.176652,0.396227,0.215002,0.262779,0.267967,0.197501
...,...,...,...,...,...,...,...,...,...
82,0.127449,0.012282,0.456618,0.250998,0.729181,0.410243,0.238010,0.461204,0.194680
83,0.278949,0.013251,0.342800,0.155876,0.598761,0.136691,0.092327,0.289253,0.171033
84,0.129432,0.314156,0.329743,0.207602,0.462996,0.076306,0.209276,0.280343,0.383448
85,0.000000,0.276018,0.233648,0.136126,0.356828,0.132989,0.057323,0.196863,0.306328


In [10]:
# Model fitting

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier

n_estimators = [1,2,3,4,5,6,7,8,9,10]

# To build Random Forest Classifiers
for n in n_estimators:
    # To mark start time for calculating training time.  Also build Random Forest Classifier.
    ran_forest_clf = RandomForestClassifier(n_estimators=n, criterion='entropy', bootstrap=True, random_state=0)
    ran_forest_clf.fit(train_no_empty_x, np.array(train_y).ravel())
    
    # To get the predicted value of label
    y_predict_rf =ran_forest_clf.predict(train_no_empty_x)
    
    ## To calculate selected metrics, using average = marco (as appropriate) for getting unweighted mean
    accuracy = metrics.accuracy_score(train_y, y_predict_rf)
    precision = metrics.precision_score(train_y, y_predict_rf, average="macro", zero_division=0)
    recall = metrics.recall_score(train_y, y_predict_rf, average="macro", zero_division=0)
    f1_score = metrics.f1_score(train_y, y_predict_rf, average="macro")
    
    ## To show the selected metrics and training time with appropriate format
    print("Random Forest classifier with n=", n)
    print("Accuracy:", '%.4f' %accuracy,"; Precision:", '%.4f' %precision, "; Recall:", '%.4f' %recall)
    print("F1 score:",'%.4f' %f1_score)
    print("=========================")

Random Forest classifier with n= 1
Accuracy: 0.9302 ; Precision: 0.9291 ; Recall: 0.9135
F1 score: 0.9206
Random Forest classifier with n= 2
Accuracy: 0.9186 ; Precision: 0.9308 ; Recall: 0.8878
F1 score: 0.9046
Random Forest classifier with n= 3
Accuracy: 0.9419 ; Precision: 0.9384 ; Recall: 0.9307
F1 score: 0.9344
Random Forest classifier with n= 4
Accuracy: 0.9535 ; Precision: 0.9672 ; Recall: 0.9310
F1 score: 0.9460
Random Forest classifier with n= 5
Accuracy: 0.9651 ; Precision: 0.9649 ; Recall: 0.9567
F1 score: 0.9606
Random Forest classifier with n= 6
Accuracy: 0.9767 ; Precision: 0.9831 ; Recall: 0.9655
F1 score: 0.9735
Random Forest classifier with n= 7
Accuracy: 0.9767 ; Precision: 0.9831 ; Recall: 0.9655
F1 score: 0.9735
Random Forest classifier with n= 8
Accuracy: 0.9767 ; Precision: 0.9831 ; Recall: 0.9655
F1 score: 0.9735
Random Forest classifier with n= 9
Accuracy: 0.9767 ; Precision: 0.9831 ; Recall: 0.9655
F1 score: 0.9735
Random Forest classifier with n= 10
Accuracy: 

In [11]:
## Carry our similar data cleaning for testing dataset

test_x = raw_test.drop(columns=['id','Sex 0M1F','Age'])
print(test_x.shape)

# Normalization
test_x[cols_to_norm] = min_max_scaler.transform(test_x[cols_to_norm])

(59, 9)


In [12]:
test_x.fillna(raw_train.median())

Unnamed: 0,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Mono CD64+MFI (cells/ul)
0,0.677163,0.040724,0.345488,0.276398,0.404716,0.144709,0.188141,0.30329,0.427784
1,0.080585,0.034583,0.117043,0.092152,0.137652,0.445151,0.195241,0.174473,0.056295
2,0.177039,0.016807,0.311507,0.181888,0.47658,0.587991,0.655961,0.437845,0.17587
3,0.185718,0.006464,0.387013,0.353933,0.372394,0.433067,0.044845,0.378153,0.175333
4,0.259856,0.041047,0.219547,0.146248,0.30438,0.303995,0.173213,0.22917,0.238076
5,0.187206,0.017776,0.427168,0.330039,0.51721,0.251333,0.18349,0.407618,0.1228
6,0.302008,0.027149,0.331882,0.268466,0.384496,0.308653,0.436594,0.361871,0.269381
7,0.094719,0.005818,0.389577,0.231568,0.589831,0.385916,0.208134,0.396611,0.179632
8,0.497644,0.013898,0.308128,0.276939,0.290139,0.081477,0.163767,0.25705,0.31318
9,0.481031,0.002909,0.309202,0.241267,0.355698,0.368628,0.31026,0.330391,0.218863


In [13]:
# Prediction by making reference to model fitting results, then n=6 has been chosen

n_estimators = [6]

# To build Random Forest Classifiers
for n in n_estimators:
    # To mark start time for calculating training time.  Also build Random Forest Classifier.
    ran_forest_clf = RandomForestClassifier(n_estimators=n, criterion='gini', bootstrap=True, random_state=0)
    ran_forest_clf.fit(train_no_empty_x, np.array(train_y).ravel())
    
    # To get the predicted value of label
    y_predict_rf_test =ran_forest_clf.predict(test_x)

In [14]:
# Append prediction results to id
pred_rf = pd.DataFrame(y_predict_rf_test)
results_rf = raw_test.join(pred_rf).rename(columns={0:'label'})
final_rf = results_rf[['id','label']]

In [15]:
# Export results

final_rf.to_csv('prediction_submit_final.csv', index = None)
