In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('diabetes.csv')

In [4]:
col_list = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

for col in col_list:
    data[col] = np.where(data[col] == 0, np.nan, data[col])

In [5]:
def ageRange(age):
    if 21 <= age < 30:
        return 1
    elif 30 <= age < 40:
        return 2
    elif 40 <= age < 50:
        return 3
    elif 50 <= age < 60:
        return 4
    elif 60 <= age < 70:
        return 5
    else:
        return 6

In [6]:
data['Age2'] = data['Age'].apply(ageRange)

In [7]:
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Age2
0,6,148.0,72.0,35.0,,33.6,0.627,50,1,4
1,1,85.0,66.0,29.0,,26.6,0.351,31,0,2
2,8,183.0,64.0,,,23.3,0.672,32,1,2
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0,1
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1,2
...,...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0,5
764,2,122.0,70.0,27.0,,36.8,0.340,27,0,1
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30,0,2
766,1,126.0,60.0,,,30.1,0.349,47,1,3


In [8]:
col_list = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

for col in col_list:
    data[col].fillna(data.groupby('Age2')[col].transform('median'), inplace=True)

In [9]:
data['Pregnancies2']= np.where(data['Pregnancies'] == 0, 0, 1)

In [10]:
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Age2,Pregnancies2
0,6,148.0,72.0,35.0,192.0,33.6,0.627,50,1,4,1
1,1,85.0,66.0,29.0,140.0,26.6,0.351,31,0,2,1
2,8,183.0,64.0,32.0,140.0,23.3,0.672,32,1,2,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0,1,1
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0,5,1
764,2,122.0,70.0,27.0,105.0,36.8,0.340,27,0,1,1
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30,0,2,1
766,1,126.0,60.0,31.0,131.0,30.1,0.349,47,1,3,1


In [22]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [13]:
x = data.drop('Outcome', axis = 1)

In [14]:
y = data['Outcome']

In [15]:
scaler = StandardScaler()

In [16]:
xs = scaler.fit_transform(x)

In [17]:
x_train, x_test, y_train, y_test = train_test_split(xs, y, test_size= 0.2, random_state = 0, stratify = y)

In [18]:
print(x_train.shape, x_test.shape)

(614, 10) (154, 10)


In [19]:
print(y_train.shape, y_test.shape)

(614,) (154,)


In [23]:
model1 = SVC()
model2 = LogisticRegression()
model3 = LogisticRegressionCV()

In [32]:
model_C = []

for i in range(1,10):
    temp = []
    model = SVC(C = i)
    model.fit(x_train, y_train)
    train = model.score(x_train, y_train)
    test = model.score(x_test, y_test)
    temp.append(i)
    temp.append(train)
    temp.append(test)
    
    model_C.append(temp)

In [34]:
model_c_select = pd.DataFrame(model_C, columns= ['C', 'train', 'test'])

In [36]:
model_c_select.sort_values(by=['test', 'train'])

Unnamed: 0,C,train,test
0,1,0.811075,0.779221
1,2,0.832248,0.785714
2,3,0.84202,0.798701
6,7,0.863192,0.805195
7,8,0.869707,0.805195
8,9,0.87785,0.805195
3,4,0.846906,0.811688
4,5,0.85342,0.811688
5,6,0.858306,0.811688


In [38]:
model4 = SVC(C=1)
model5 = SVC(C=1, gamma ='auto')

In [24]:
d = [(x_train, y_train), (x_test, y_test)]

In [39]:
def trainModel(model, d):
    model.fit(x_train, y_train)
    for x, y in d:
        print(model.score(x, y))

In [40]:
trainModel(model1,d)

0.8110749185667753
0.7792207792207793


In [41]:
trainModel(model2,d)

0.7736156351791531
0.7597402597402597


In [42]:
trainModel(model3,d)

0.7736156351791531
0.7662337662337663


In [43]:
trainModel(model4, d)

0.8110749185667753
0.7792207792207793


In [44]:
trainModel(model5, d)

0.8110749185667753
0.7792207792207793


In [45]:
import pickle

In [46]:
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [47]:
with open('model.pkl', 'wb') as f:
    pickle.dump(model1, f)