# Importing required packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import preprocessing
from sklearn import svm
from sklearn import utils
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import shuffle

# Loading data

In [2]:
# Location of the labelled training set (CSV fetched over HTTPS).
TRAINING_SET_URL = "https://raw.githubusercontent.com/dphi-official/Datasets/master/patient_treat_class/training_set_label.csv"

# Read the whole training set into a DataFrame.
patient_data = pd.read_csv(TRAINING_SET_URL)

# Visualizing the dataset

In [3]:
# Preview the first five rows of the training data.
patient_data.head(n=5)

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX,SOURCE
0,33.8,11.1,4.18,4.6,150,26.6,32.8,80.9,33,F,1
1,44.6,14.0,6.86,6.3,232,20.4,31.4,65.0,36,M,0
2,42.9,14.0,4.57,6.2,336,30.6,32.6,93.9,70,F,0
3,41.9,14.4,4.67,3.5,276,30.8,34.4,89.7,18,F,0
4,40.6,13.3,4.85,14.9,711,27.4,32.8,83.7,36,M,0


In [4]:
# (number of rows, number of columns) of the training frame.
patient_data.shape

(3309, 11)

Does the dataset contain null values?

In [5]:
# Count missing values per column (`isna` is the same check as `isnull`).
patient_data.isna().sum()

HAEMATOCRIT     0
HAEMOGLOBINS    0
ERYTHROCYTE     0
LEUCOCYTE       0
THROMBOCYTE     0
MCH             0
MCHC            0
MCV             0
AGE             0
SEX             0
SOURCE          0
dtype: int64

Such a relief, no null values

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
patient_data.describe()

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SOURCE
count,3309.0,3309.0,3309.0,3309.0,3309.0,3309.0,3309.0,3309.0,3309.0,3309.0
mean,38.226111,12.74935,4.544802,8.715533,258.893019,28.230039,33.336476,84.611333,46.644303,0.398005
std,5.971943,2.084325,0.78451,4.991299,112.676139,2.69652,1.247055,6.916079,21.874106,0.489561
min,13.7,3.8,1.48,1.1,10.0,14.9,26.0,54.0,1.0,0.0
25%,34.3,11.4,4.04,5.7,191.0,27.2,32.7,81.5,29.0,0.0
50%,38.7,12.9,4.58,7.6,257.0,28.7,33.4,85.3,48.0,0.0
75%,42.5,14.2,5.06,10.3,322.0,29.8,34.1,88.8,64.0,1.0
max,69.0,18.9,7.86,76.6,1121.0,40.8,38.4,115.6,99.0,1.0


# Shuffle the dataset

In [7]:
# Shuffle the rows with a fixed seed. Without it the row order changes on
# every run, so the later train/test split (and every reported score) would
# not be reproducible even though the split itself uses random_state=42.
patient_data = shuffle(patient_data, random_state=42)

What are the names of the columns? We need them to split the data into features and target.

In [8]:
# List the column labels; SOURCE is used as the target further down.
patient_data.columns

Index(['HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE',
       'THROMBOCYTE', 'MCH', 'MCHC', 'MCV', 'AGE', 'SEX', 'SOURCE'],
      dtype='object')

what are the types of the columns

In [56]:
# Show the dtype of every column.
# NOTE(review): the saved output lists SEX as int64 although the raw column
# holds 'F'/'M' strings -- this cell appears to have been last executed after
# the label-encoding cell below (its execution count is out of order).
patient_data.dtypes

HAEMATOCRIT     float64
HAEMOGLOBINS    float64
ERYTHROCYTE     float64
LEUCOCYTE       float64
THROMBOCYTE       int64
MCH             float64
MCHC            float64
MCV             float64
AGE               int64
SEX               int64
SOURCE            int64
dtype: object

# Preprocessing the dataset

The column SEX contains string values, we need to encode the column. First, let's check the unique values of the column SEX

In [45]:
# Distinct values present in the SEX column (bracket access instead of attribute access).
patient_data['SEX'].unique()

array(['F', 'M'], dtype=object)

We have only two unique values, F and M. If the column had contained inconsistent spellings such as F, f, M and m, we would have needed to normalise them in a preprocessing step first.

In [9]:
# Learn the string -> integer mapping for SEX from the training data, then
# overwrite the column with the integer codes. The fitted encoder is kept in
# `le` so the learned classes remain available afterwards.
le = LabelEncoder()
le.fit(patient_data['SEX'])
patient_data['SEX'] = le.transform(patient_data['SEX'])

review the dataset

In [47]:
# Preview the first five rows again to check the encoded SEX column.
patient_data.head(n=5)

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX,SOURCE
90,43.2,13.7,5.08,7.2,239,27.0,31.7,85.0,69,0,1
341,39.8,13.8,4.47,4.7,207,30.9,34.7,89.0,37,0,0
2130,43.0,13.8,5.27,4.3,277,26.2,32.1,81.6,3,0,0
3251,51.9,17.4,5.94,7.9,245,29.3,33.5,87.4,51,1,1
3085,38.4,13.6,4.39,10.4,313,31.0,35.4,87.5,29,0,0


# Split the dataset into train and test 

the target column is SOURCE and the others are the inputs/features

In [10]:
# Target vector: the SOURCE column.
target = patient_data['SOURCE']
# Feature matrix: every column except SOURCE.
# NOTE(review): the name `input` shadows Python's built-in input(); it is kept
# because later cells refer to it.
input = patient_data.drop(columns='SOURCE')

display the input

In [11]:
# Display the feature frame (all columns except SOURCE).
input

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX
2951,49.7,16.7,6.47,4.4,227,25.8,33.6,76.8,31,1
1893,27.8,9.0,3.14,5.5,185,28.7,32.4,88.5,76,0
2364,36.6,13.1,4.03,13.9,150,32.5,35.8,90.8,72,0
2008,35.2,11.8,4.12,8.2,250,28.6,33.5,85.4,37,0
1064,36.2,12.0,4.65,5.5,191,25.8,33.1,77.8,6,1
...,...,...,...,...,...,...,...,...,...,...
1247,43.5,14.7,5.81,12.1,342,25.3,33.8,74.9,38,0
832,47.2,16.2,5.52,17.0,275,29.3,34.3,85.5,36,1
2737,37.4,12.7,4.82,9.3,409,26.3,34.0,77.6,2,0
229,37.1,11.7,5.01,6.9,391,23.4,31.5,74.1,23,0


display the target

In [50]:
# Display the target series (the SOURCE column).
target

90      1
341     0
2130    0
3251    1
3085    0
       ..
795     0
3185    0
828     1
1890    0
2259    0
Name: SOURCE, Length: 3309, dtype: int64

split the dataset 80% train and 20% test

In [12]:
# Hold out 20% of the rows for testing; the split itself is seeded with 42.
x_train, x_test, y_train, y_test = train_test_split(
    input,
    target,
    test_size=0.2,
    random_state=42,
)

DecisionTreeClassifier algorithm

In [55]:
# Fit a decision tree on the training split and report accuracy (in %) on the
# held-out split. random_state is fixed because the tree permutes features
# randomly when searching for splits, so an unseeded tree can give a
# different score on every run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred) * 100)

Accuracy: 67.37160120845923


What if we use the RMSE metric, will anything change?

In [57]:
# Same classifier as before, scored with root-mean-squared error instead of
# accuracy. RMSE is an error measure (lower is better), so it is labelled as
# such rather than as "Accuracy".
clf = DecisionTreeClassifier(random_state=42)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in recent scikit-learn releases.
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
# The x100 scaling of the original cell is kept so the number stays comparable
# with the previously recorded value.
print("RMSE (x100):", rmse * 100)

Accuracy: 56.58990120532994


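RMSE is an error measure (lower is better), so the value above is not an accuracy and cannot be compared directly with the accuracy score. For this binary classification target, accuracy is the more informative metric, so we keep using it.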

what if we use the Regressor algorithm

In [59]:
# Fit a regression tree on the 0/1 target and score it as a classifier.
# A regressor returns floats; accuracy_score rejects continuous predictions,
# so any leaf that averages mixed labels (e.g. 0.5) would make the original
# call fail. Predictions are therefore thresholded at 0.5 into class labels.
clf = DecisionTreeRegressor(random_state=42)
clf.fit(x_train, y_train)
y_pred = (clf.predict(x_test) >= 0.5).astype(int)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred) * 100)

Accuracy: 66.76737160120845


Still the DecisionTreeClassifier is the best, what if we use SVM algorithm?

In [14]:
# Linear-kernel support vector classifier, scored with accuracy (in %) on the
# held-out split. `svm` is imported in the import cell at the top of the
# notebook. The fitted model stays bound to `clf` and is the one used for the
# predictions on the unseen data below.
# NOTE(review): the features are fed in unscaled; SVMs are sensitive to
# feature scale, so standardising them first (StandardScaler is already
# imported) is worth trying.
clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred) * 100)

Accuracy: 71.29909365558912


SVM performs much better, we will keep the result of SVM obviously 

# Predict the unseen data

load the inputs to predict

In [24]:
# Location of the test set (CSV fetched over HTTPS).
TESTING_SET_URL = 'https://raw.githubusercontent.com/dphi-official/Datasets/master/patient_treat_class/testing_set_label.csv'

# Read the whole test set into a DataFrame.
test_data = pd.read_csv(TESTING_SET_URL)

In [25]:
# Preview the first five rows of the test data.
test_data.head(n=5)

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX
0,41.2,14.3,3.99,4.1,296,35.8,34.7,103.3,24,M
1,25.4,9.1,3.17,19.3,304,28.7,35.8,80.1,66,M
2,47.5,15.2,6.23,11.5,385,24.4,32.0,76.2,17,F
3,43.0,14.5,5.35,4.8,134,27.1,33.7,80.4,14,M
4,40.7,13.6,4.77,4.3,199,28.5,33.4,85.3,24,M


In [26]:
# (number of rows, number of columns) of the test frame.
test_data.shape

(1103, 10)

In [27]:
# Encode SEX with the encoder that was fitted on the training data (`le`,
# created in the preprocessing section) instead of fitting a new one here.
# A freshly fitted encoder derives its mapping from the test values alone, so
# it could assign different codes than the model was trained on (for example
# if one category were missing from the test set).
test_data['SEX'] = le.transform(test_data['SEX'])

We need to predict the target for the first 14 inputs.

In [33]:
# Take the first 14 rows of the test set by position.
x = test_data.iloc[:14]

In [29]:
# Display the selected test rows.
x

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX
0,41.2,14.3,3.99,4.1,296,35.8,34.7,103.3,24,1
1,25.4,9.1,3.17,19.3,304,28.7,35.8,80.1,66,1
2,47.5,15.2,6.23,11.5,385,24.4,32.0,76.2,17,0
3,43.0,14.5,5.35,4.8,134,27.1,33.7,80.4,14,1
4,40.7,13.6,4.77,4.3,199,28.5,33.4,85.3,24,1


In [34]:
# (number of rows, number of columns) of the selected test rows.
x.shape

(14, 10)

In [35]:
# Predict with the estimator currently bound to `clf` (the SVM when the cells
# are run top to bottom).
y_pred = clf.predict(x)

In [36]:
# Display the predicted labels for the selected rows.
y_pred

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1])

we have three wrong predictions

load the real target to compare

In [38]:
# Reference labels to compare against the 14 predictions above, entered by
# hand. numpy is already imported as `np` in the import cell at the top, so it
# is not re-imported here.
y = np.array([0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1])
y

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1])

In [39]:
# Share of the selected rows predicted correctly, expressed in percent.
subset_accuracy = metrics.accuracy_score(y, y_pred) * 100
print("Accuracy:", subset_accuracy)

Accuracy: 78.57142857142857


Let's now predict the target for the whole test set.

In [40]:
# Predict a label for every row of the test set with the estimator currently
# bound to `clf`; test_data must already have its SEX column encoded.
y_pred = clf.predict(test_data)

In [41]:
# Display the predicted labels (numpy abbreviates long arrays).
y_pred

array([0, 1, 0, ..., 0, 0, 0])

In [43]:
# Write one predicted label per line. fmt="%d" stores the labels as plain
# integers; savetxt's default format would write them in scientific notation
# (e.g. 0.000000000000000000e+00).
np.savetxt("sample_submission.csv", y_pred, fmt="%d", delimiter=",")

Hope you enjoyed my notebook ^^