Machine Learning: Classification (Supervised Learning)

In [30]:
import pandas as pd
import numpy as np

In [31]:
from sklearn.utils import shuffle

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
  

#from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, average_precision_score

In [32]:
pd.read_csv('Dataset/diabetes.csv').head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


Features = { Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age }
Class Type (Outcome) = {0, 1}

In [33]:
D = pd.read_csv('Dataset/diabetes.csv', header=None, skiprows=1)
D = np.array(D)

print(D)
print()
print('Number of sample {}, and number of features {}.'.format(D.shape[0], D.shape[1]-1))

[[  6.    148.     72.    ...   0.627  50.      1.   ]
 [  1.     85.     66.    ...   0.351  31.      0.   ]
 [  8.    183.     64.    ...   0.672  32.      1.   ]
 ...
 [  5.    121.     72.    ...   0.245  30.      0.   ]
 [  1.    126.     60.    ...   0.349  47.      1.   ]
 [  1.     93.     70.    ...   0.315  23.      0.   ]]

Number of sample 768, and number of features 8.


In [34]:
X = D[:,:-1]

Y = D[:,-1]

print(X)
print()
print(X.shape)
print()
print(Y)
print()
print(Y.shape)

[[  6.    148.     72.    ...  33.6     0.627  50.   ]
 [  1.     85.     66.    ...  26.6     0.351  31.   ]
 [  8.    183.     64.    ...  23.3     0.672  32.   ]
 ...
 [  5.    121.     72.    ...  26.2     0.245  30.   ]
 [  1.    126.     60.    ...  30.1     0.349  47.   ]
 [  1.     93.     70.    ...  30.4     0.315  23.   ]]

(768, 8)

[1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1.
 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0.
 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0.
 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0. 0. 0.
 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0.
 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 0.
 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 1.

In [35]:
X[:, 0:X.shape[1]] = SimpleImputer(strategy='mean').fit_transform(X[:, 0:X.shape[1]])

In [36]:
 X, Y = shuffle(X, Y, random_state=0)

In [37]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.75, random_state=111)

In [38]:
print(X_train.shape)
print(X_test.shape)

(576, 8)
(192, 8)


In [39]:
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)

In [40]:
#model = RandomForestClassifier()
model = SVC(probability=True)


model.fit(X_train, Y_train)

Y_artificial = model.predict(X_test) # Predicted
Y_proba = model.predict_proba(X_test)[:, 1] # Probability

In [43]:
print('Accuracy: {:.4f}%'.format(accuracy_score(y_pred=Y_artificial, y_true=Y_test)*100.0))
print('auROC: {:.4f}'.format(roc_auc_score(Y_test, Y_proba)))

Accuracy: 78.6458%
auROC: 0.8479
