## Link to drive, import libraries and load datasets

In [23]:
# Mount Google Drive at /content/drive (Colab only); the cells below read their
# CSV files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Folder on the mounted Drive that holds the project's CSV files. Keeping it in
# one place means the notebook can be pointed at another location with a
# single edit instead of changing every path.
DATA_DIR = '/content/drive/MyDrive/HKUST/5001/Individual'

# Raw, unprocessed frames: `train` carries the 'label' column, `test` is the
# set predictions are made for.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [26]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,id,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
0,0,3556.0,2489.0,265.19,77.53,176.55,0.0,4.2,307.91,52,0,7515.0,1
1,1,1906.0,134.0,1442.61,551.9,876.07,112.1,168.15,1735.48,20,1,1756.0,0
2,2,1586.0,71.0,1332.74,684.2,655.26,244.95,216.52,1820.04,28,1,1311.0,0
3,3,683.0,94.0,419.23,255.8,162.17,72.05,44.68,538.22,55,1,1443.0,0
4,4,1032.0,71.0,1102.72,480.27,625.3,188.78,130.77,1427.97,28,1,1542.0,0


In [27]:
# (number of rows, number of columns) of the raw training frame.
train.shape

(87, 13)

## Preprocessing

In [28]:
def preprocess(data, type, thres, scaler=None):
  """Impute, filter and standardise a raw data frame for the classifier.

  Processing steps:
    1. Missing values are replaced by the mean of their column (rows are not
       dropped because the training set is small).
    2. The non-feature 'id' column is removed.
    3. ('train' only) Rows whose value in any standardised column lies
       `thres` or more standard deviations from that column's mean are
       discarded. Columns are filtered one after another, so the mean and
       standard deviation of each column are computed on the rows that
       survived the previous columns.
    4. Every column except 'Age', 'Sex 0M1F' and 'label' is standardised.

  Parameters
  ----------
  data : pandas.DataFrame
      Raw frame containing an 'id' column, plus a 'label' column in 'train'
      mode. The frame is never modified; a processed copy is returned.
  type : str
      'train' or 'test'.
  thres : float
      z-score cut-off for the outlier filter. Ignored in 'test' mode.
  scaler : object, optional
      Scaler exposing `fit_transform` / `transform` (e.g. a
      `StandardScaler`). In 'train' mode it is fitted on the filtered
      training rows, so the caller keeps a handle on the fitted scaler. In
      'test' mode an already fitted scaler must be passed and is applied
      with `transform`, which puts the test features on the same scale the
      model was trained on. When omitted, a fresh `StandardScaler` is
      fitted on whatever frame is passed in (the historical behaviour; in
      'test' mode this standardises the test set with its own statistics).

  Returns
  -------
  (x, y) in 'train' mode, x alone in 'test' mode, where x is the feature
  frame and y the 'label' Series. For any other `type` a warning is printed
  and None is returned.
  """
  # Reject unknown modes before doing any work.
  if type not in ('train', 'test'):
    print('Warning: Unidentified operation')
    return

  # fillna/drop both return new frames, so the caller's data stays intact.
  # numeric_only keeps the mean computation from failing on text columns.
  df = data.fillna(data.mean(numeric_only=True)).drop(columns=['id'])

  # Columns to standardise: everything except the demographic columns and
  # the target.
  cols = [c for c in df.columns if c not in ('Age', 'Sex 0M1F', 'label')]

  if type == 'train':
    # Drop rows that are z-score outliers in any standardised column.
    for col in cols:
      center = df[col].mean()
      spread = df[col].std()
      keep = (df[col] < center + thres * spread) & (df[col] > center - thres * spread)
      df = df[keep]

    # Work on an explicit copy: assigning into the filtered slice would
    # otherwise raise SettingWithCopyWarning.
    df = df.copy()
    if scaler is None:
      scaler = StandardScaler()
    df[cols] = scaler.fit_transform(df[cols])
    return df.drop(columns=['label']), df['label']

  # type == 'test'
  if scaler is None:
    # Legacy path: no fitted scaler supplied, standardise with the
    # statistics of this frame.
    df[cols] = StandardScaler().fit_transform(df[cols])
  else:
    # Reuse the statistics learned on the training data.
    df[cols] = scaler.transform(df[cols])
  return df.drop(columns=['label'], errors='ignore')

## Train the model

In [29]:
# z-score cut-off: training rows lying this many standard deviations (or more)
# from a column mean are treated as outliers and removed.
thres = 3

# Split the cleaned training frame into the feature matrix and the labels.
x_train, y_train = preprocess(data=train, type='train', thres=thres)
x_train.head()

Unnamed: 0,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul)
1,1.229228,-0.346061,1.303035,0.527603,1.741695,-0.574306,1.036606,1.017448,20,1,-0.235424
2,0.708771,-0.572773,1.047013,1.115698,0.84974,0.485995,1.796231,1.172502,28,1,-0.713635
3,-0.759895,-0.490005,-1.081675,-0.788608,-1.14208,-0.893953,-0.902425,-1.177918,55,1,-0.571784
4,-0.192271,-0.572773,0.511014,0.209197,0.728718,0.037691,0.449573,0.45358,28,1,-0.465395
5,-1.065663,-0.313673,1.947833,2.012463,1.607867,0.697257,0.271955,1.765911,42,1,-0.159125


In [30]:
# Random forest settings: 200 gini-split trees, out-of-bag scoring enabled,
# and a fixed seed so repeated runs build the same forest.
rf_params = dict(
    n_estimators=200,
    criterion='gini',
    oob_score=True,
    random_state=5001,
)

# fit() returns the estimator itself, so construction and training can be chained.
model_RF = RandomForestClassifier(**rf_params).fit(x_train, y_train)
model_RF

RandomForestClassifier(n_estimators=200, oob_score=True, random_state=5001)

## Test the model

In [31]:
# Apply the same preprocessing to the test set and predict one label per row.
x_test = preprocess(test, 'test', thres)
y_pred = model_RF.predict(x_test)

# Build the submission from the ids that came with the test file rather than
# assuming they are exactly 0..n-1 in file order; preprocess keeps row order,
# so prediction i belongs to test row i.
output = pd.DataFrame({
    'id': test['id'].to_numpy(),
    'label': y_pred.astype(np.int32),
})
output.to_csv('/content/drive/MyDrive/HKUST/5001/Individual/submission.csv', index=False)

In [32]:
# Display the submission table (columns: id, label).
output

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,0
