In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the balanced (50/50 split) BRFSS 2015 diabetes indicators table.
dataset = pd.read_csv('diabetes_binary_5050split_health_indicators_BRFSS2015.csv')

# Column 0 becomes the label vector; every remaining column is a feature.
y = dataset.iloc[:, 0].values
X = dataset.iloc[:, 1:].values

In [4]:
# Sanity check: show the feature matrix (NumPy abbreviates large arrays with "...").
print(X)

[[ 1.  0.  1. ...  4.  6.  8.]
 [ 1.  1.  1. ... 12.  6.  8.]
 [ 0.  0.  1. ... 13.  6.  8.]
 ...
 [ 1.  1.  1. ... 13.  6.  4.]
 [ 1.  1.  1. ... 11.  2.  4.]
 [ 1.  1.  1. ...  9.  6.  2.]]


In [5]:
# Sanity check: show the label vector taken from column 0 of the dataset.
print(y)

[0. 0. 0. ... 1. 1. 1.]


In [6]:
# Display y as a single-column 2-D array. The result is not assigned,
# so `y` itself stays one-dimensional for the estimator further down.
y.reshape(-1, 1)

array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])

In [7]:
from sklearn.impute import SimpleImputer

# Replace NaNs with the per-column mean across *every* feature column.
# The previous X[:, 1:] slice skipped feature column 0, so a missing value in
# that column would have reached the model un-imputed.
# NOTE(review): the means are computed before the train/test split, so test rows
# contribute to them; if the data really contains NaNs, fit the imputer on the
# training split only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [9]:
# Inspect the training features returned by the split.
print(X_train)

[[ 1.  1.  1. ...  8.  6.  5.]
 [ 1.  1.  1. ... 10.  6.  8.]
 [ 1.  1.  1. ...  8.  3.  1.]
 ...
 [ 0.  1.  1. ... 11.  2.  2.]
 [ 1.  1.  1. ... 12.  6.  6.]
 [ 1.  0.  1. ...  7.  4.  1.]]


In [10]:
# Inspect the training labels returned by the split.
print(y_train)

[1. 1. 1. ... 1. 1. 1.]


In [11]:
# Inspect the held-out test features returned by the split.
print(X_test)

[[ 1.  0.  1. ...  8.  3.  5.]
 [ 0.  0.  1. ... 10.  5.  4.]
 [ 0.  0.  1. ...  5.  5.  8.]
 ...
 [ 1.  1.  1. ... 10.  6.  7.]
 [ 0.  1.  1. ...  7.  5.  4.]
 [ 1.  0.  1. ... 11.  3.  2.]]


In [12]:
# Inspect the held-out test labels returned by the split.
print(y_test)

[0. 0. 1. ... 0. 0. 1.]


In [13]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split only,
# then apply that same transformation to both splits so no test-set statistics
# are used for scaling.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [14]:
# Inspect the training features after standardization.
print(X_train)

[[ 0.88263338  0.9519085   0.15954018 ... -0.20215105  1.04696006
  -0.32406457]
 [ 0.88263338  0.9519085   0.15954018 ...  0.49811006  1.04696006
   1.05483749]
 [ 0.88263338  0.9519085   0.15954018 ... -0.20215105 -1.8717893
  -2.16260064]
 ...
 [-1.13297325  0.9519085   0.15954018 ...  0.84824061 -2.84470575
  -1.70296662]
 [ 0.88263338  0.9519085   0.15954018 ...  1.19837116  1.04696006
   0.13556945]
 [ 0.88263338 -1.05052114  0.15954018 ... -0.5522816  -0.89887285
  -2.16260064]]


In [15]:
# Inspect the test features after applying the scaler fitted on the training split.
print(X_test)

[[ 0.88263338 -1.05052114  0.15954018 ... -0.20215105 -1.8717893
  -0.32406457]
 [-1.13297325 -1.05052114  0.15954018 ...  0.49811006  0.0740436
  -0.78369859]
 [-1.13297325 -1.05052114  0.15954018 ... -1.25254271  0.0740436
   1.05483749]
 ...
 [ 0.88263338  0.9519085   0.15954018 ...  0.49811006  1.04696006
   0.59520347]
 [-1.13297325  0.9519085   0.15954018 ... -0.5522816   0.0740436
  -0.78369859]
 [ 0.88263338 -1.05052114  0.15954018 ...  0.84824061 -1.8717893
  -1.70296662]]


In [24]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier trained on the standardized training split.
# `degree` only affects the polynomial kernel; it is kept here to leave the
# estimator's parameters exactly as before.
classifier = SVC(
    C=1.0,
    kernel='rbf',
    degree=3,
)
classifier.fit(X_train, y_train)

In [27]:
from sklearn.model_selection import cross_val_score, KFold

# Shuffled k-fold splitter; the fixed seed makes the fold assignment repeatable.
num_folds = 5
kf = KFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=0,
)

In [31]:
# Cross-validate on the training split only: the test split must stay unseen so
# that the accuracy measured on it remains an honest hold-out estimate.
# cross_val_score fits clones of `classifier`, so the already-fitted model is
# left untouched. Inspect cross_val_results.mean() / .std() to summarise the folds.
# NOTE(review): X_train was standardized using the whole training split, so the
# folds share scaling statistics; wrap the scaler and SVC in a Pipeline if strict
# per-fold isolation is needed.
cross_val_results = cross_val_score(classifier, X_train, y_train, cv=kf)

In [32]:
# Predict labels for the held-out test features with the fitted classifier.
y_pred = classifier.predict(X_test)

In [33]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label matches the true label, as a percentage.
100 * accuracy_score(y_test, y_pred)

75.40881570757652

In [20]:
# Predict the class of one hand-entered record (21 raw feature values;
# presumably in the dataset's feature-column order — verify against the CSV header).
# The classifier was trained on standardized features, so the raw record must go
# through the same fitted scaler `sc` first; passing raw values straight to the
# model compares it against data on a completely different scale.
# Re-run this cell: the stored output was produced without scaling.
classifier.predict(sc.transform([[1,1,1,40,1,0,0,0,0,1,0,1,0,5,18,15,1,0,9,4,3]]))

array([0.])