### Diabetes data: 3 class classification

In [20]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('diabetic_data.csv')
df = df.drop(['encounter_id', 'age', 'patient_nbr', 
              'weight', 'payer_code', 'medical_specialty', 
              'race', 'diag_1', 'diag_2', 'diag_3'], axis=1)
df = df.loc[:, ~(df == '?').any()]
df_x = df.drop(['readmitted'], axis=1)
df_y = df['readmitted']
y_full = np.array(df_y.values)
y_full[y_full=='NO'] = 0
y_full[y_full=='>30'] = 1
y_full[y_full=='<30'] = 2
y_full = np.array(y_full).astype('int32')
categorical_variables = df_x.select_dtypes(include='object').columns.tolist()
continuous_variables = df_x.select_dtypes(exclude='object').columns.tolist()



from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
ct_ss_ohe = ColumnTransformer(
     [('scaling', StandardScaler(), continuous_variables), 
      ('onehot', OneHotEncoder(sparse=False), categorical_variables)])
x_full = ct_ss_ohe.fit_transform(df_x)
x_train, x_test, y_train, y_test = train_test_split(x_full, y_full, random_state=0)

In [3]:
print (x_train.shape)
print (y_train.shape)
print (x_test.shape)
print (y_test.shape)

(76324, 97)
(76324,)
(25442, 97)
(25442,)


In [4]:
print (x_train.shape)
print (x_train)

(76324, 97)
[[-0.70846069 -0.5143125   0.30648245 ...  1.          1.
   0.        ]
 [ 4.13450367 -0.5143125   0.30648245 ...  0.          0.
   1.        ]
 [-0.70846069 -0.5143125   0.30648245 ...  0.          0.
   1.        ]
 ...
 [ 0.67524341 -0.5143125  -0.43169547 ...  1.          1.
   0.        ]
 [ 0.67524341 -0.5143125  -1.16987339 ...  0.          0.
   1.        ]
 [ 2.75079957  0.6220211   0.30648245 ...  1.          1.
   0.        ]]


In [5]:
print (x_test.shape)
print (x_test)

(25442, 97)
[[ 0.67524341 -0.5143125  -1.16987339 ...  0.          0.
   1.        ]
 [-0.70846069 -0.5143125   0.30648245 ...  0.          0.
   1.        ]
 [-0.01660864 -0.5143125  -1.16987339 ...  0.          0.
   1.        ]
 ...
 [-0.01660864 -0.5143125  -1.16987339 ...  1.          1.
   0.        ]
 [-0.70846069  0.24324324  0.30648245 ...  0.          0.
   1.        ]
 [-0.70846069 -0.5143125   0.30648245 ...  0.          0.
   1.        ]]


### KNN

In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
knn = KNeighborsClassifier(n_neighbors=6)
knn = knn.fit(x_train, y_train)

In [26]:
print (knn.score(x_test, y_test))

0.5382438487540288


In [30]:
y_pred_knn = knn.predict_proba(x_test)
print (y_pred_knn.shape)
print (y_pred_knn)

(25442, 3)
[[0.5        0.5        0.        ]
 [0.5        0.5        0.        ]
 [0.83333333 0.16666667 0.        ]
 ...
 [0.83333333 0.         0.16666667]
 [0.5        0.33333333 0.16666667]
 [0.33333333 0.66666667 0.        ]]


### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
lr = LogisticRegression(solver='lbfgs', max_iter=10000)
lr.fit(x_train, y_train)
lr_scores = cross_val_score(lr, x_test, y_test, cv=10, scoring='accuracy')

In [10]:
print (np.mean(lr_scores))

0.569215844361246


In [28]:
y_pred_lr = lr.decision_function(x_test)
print (y_pred_lr.shape)
print (y_pred_lr)

(25442, 3)
[[ 0.7751157   0.24937677 -1.02449247]
 [ 0.55388775  0.37130111 -0.92518886]
 [ 0.9614593   0.19345379 -1.15491309]
 ...
 [ 0.47054311  0.27261721 -0.74316031]
 [ 1.01334976  0.12645978 -1.13980954]
 [ 0.38637124  0.46450808 -0.85087933]]


### Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
nb = GaussianNB()
nb.fit(x_train, y_train)
nb_scores = cross_val_score(nb, x_test, y_test, cv=10, scoring='accuracy')

In [12]:
print (np.mean(nb_scores))
print (nb.score(x_test, y_test))

0.11685392494841285
0.11587139375835233


In [37]:
y_pred_nb = nb.predict_proba(x_test)
print (y_pred_nb.shape)
print (y_pred_nb)

(25442, 3)
[[7.46156346e-26 4.01335493e-23 1.00000000e+00]
 [4.14391706e-28 6.32727707e-23 1.00000000e+00]
 [1.13613295e-24 5.00211656e-22 1.00000000e+00]
 ...
 [7.10969542e-26 5.93445576e-23 1.00000000e+00]
 [2.12002489e-24 1.69927469e-35 1.00000000e+00]
 [1.99843844e-25 7.76829470e-22 1.00000000e+00]]


### Decision Tree

In [13]:
from sklearn import tree
dt = tree.DecisionTreeClassifier()
dt = dt.fit(x_train, y_train)
dt_scores = cross_val_score(dt,x_test,y_test,cv=10,scoring='accuracy')

In [14]:
print (np.mean(dt_scores))

0.4686738085529648


In [36]:
y_pred_dt = dt.predict_proba(x_test)
print (y_pred_dt.shape)
print (y_pred_dt)

(25442, 3)
[[0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 ...
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


### SVM

In [18]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
lsvm = SVC()
lsvm.C = 10
lsvm = lsvm.fit(x_train, y_train)
print (lsvm.score(x_test, y_test))

0.5773524094017766


In [33]:
y_pred_svm = lsvm.decision_function(x_test)
print (y_pred_svm.shape)
print (y_pred_svm)

(25442, 3)
[[ 2.21371733  1.06291589 -0.22293456]
 [ 2.1852257   1.16431977 -0.22990501]
 [ 2.22871032  0.9383186  -0.22068197]
 ...
 [ 1.0732407   2.21549719 -0.22616534]
 [ 2.21801657  0.91012206 -0.20113231]
 [ 1.12509171  2.19496607 -0.22258225]]


### MLP

In [19]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(random_state=0, max_iter=10000).fit(x_train, y_train)
print (mlp.score(x_test, y_test))

0.5697272227026177


In [34]:
y_pred_mlp = mlp.predict_proba(x_test)
print (y_pred_mlp.shape)
print (y_pred_mlp)

(25442, 3)
[[0.4369223  0.500584   0.0624937 ]
 [0.49267918 0.33066982 0.176651  ]
 [0.68632613 0.13214772 0.18152616]
 ...
 [0.14351615 0.7988755  0.05760836]
 [0.82646051 0.11915431 0.05438518]
 [0.32341545 0.57849808 0.09808647]]


In [38]:
from sklearn.metrics import roc_curve, auc
lr_fpr, lr_tpr, threshold = roc_curve(y_test, y_pred_lr)

ValueError: multiclass format is not supported