In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sn

In [None]:
# !pip install ydata-profiling

In [None]:
from ydata_profiling import ProfileReport

In [2]:
df = pd.read_csv('diabetes.csv')

In [None]:
profile = ProfileReport(df)
profile.to_html()

In [3]:
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Since we have different scale value for different columns we use StandardScaler for this 
from sklearn.preprocessing import StandardScaler

In [4]:
x = df.drop(columns='Outcome',axis=1)
y= df['Outcome']

In [None]:
std_model = StandardScaler()
standard_scaler = std_model.fit_transform(x)

In [None]:
scaled_df=pd.DataFrame(standard_scaler,columns=['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'])

In [None]:
scaled_df

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

In [7]:
x_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
60,2,84,0,0,0,0.0,0.304,21
618,9,112,82,24,0,28.2,1.282,50
346,1,139,46,19,83,28.7,0.654,22
294,0,161,50,0,0,21.9,0.254,65
231,6,134,80,37,370,46.2,0.238,46
...,...,...,...,...,...,...,...,...
71,5,139,64,35,140,28.6,0.411,26
106,1,96,122,0,0,22.4,0.207,27
270,10,101,86,37,0,45.6,1.136,38
435,0,141,0,0,0,42.4,0.205,29


In [8]:
y_train

60     0
618    1
346    0
294    0
231    1
      ..
71     0
106    0
270    1
435    1
102    0
Name: Outcome, Length: 614, dtype: int64

In [9]:
from sklearn.svm import SVC

In [10]:
svm = SVC(kernel='linear')

In [11]:
svm

In [12]:
svm.fit(x_train,y_train)

In [13]:
from sklearn.metrics import accuracy_score

In [14]:
x_train_predict = svm.predict(x_train)
x_train_accruracy = accuracy_score(x_train_predict,y_train)

In [15]:
x_test_predict = svm.predict(x_test)
x_test_accuracy = accuracy_score(x_test_predict,y_test)

In [16]:
x_train_accruracy

0.7736156351791531

In [17]:
x_test_accuracy 

0.7532467532467533

In [19]:
param_grid = {'C': [1, 10, 100, 1000,10000], 
              'kernel': ['linear', 'rbf', 'sigmoid']}


In [18]:
from sklearn.model_selection import GridSearchCV

In [20]:
grid_search = GridSearchCV(SVC(),param_grid=param_grid,cv=5,n_jobs=-1,verbose=3)

In [21]:
grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [None]:
input_data = (8,183,64,0,0,23.3,0.672,32)
# converting the the data in numpy array
input_data_np = np.asarray(input_data)
# reshaping the array as we are predicting for one instance
input_data_np_reshape = input_data_np.reshape(1,-1)
# transform the data into standarization
std_data = std_model.transform(input_data_np_reshape)
predict = svm.predict(std_data)
if (predict[0] ==0):
    print("The person is not diabetic")
else:
    print("The person is diabetic")

In [22]:
grid_search.best_params_

{'C': 1, 'kernel': 'linear'}

In [23]:
round(grid_search.best_score_*100,2)

76.55

In [24]:
import pickle

In [25]:
filename = 'training_model.sav'
pickle.dump(svm,open(filename,'wb'))

In [26]:
loader = pickle.load(open(filename,'rb'))

In [27]:
loader

In [29]:
dir(loader)

['C',
 '__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__sklearn_tags__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_check_proba',
 '_compute_kernel',
 '_decision_function',
 '_dense_decision_function',
 '_dense_fit',
 '_dense_predict',
 '_dense_predict_proba',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_dual_coef_',
 '_estimator_type',
 '_gamma',
 '_get_coef',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',

In [33]:
input_data = (8,183,64,0,0,23.3,0.672,32)
# converting the the data in numpy array
input_data_np = np.asarray(input_data)
# reshaping the array as we are predicting for one instance
input_data_np_reshape = input_data_np.reshape(1,-1)
predict = loader.predict(input_data_np_reshape)
if (predict[0] == 0):
    print("The person is not diabetic")
else:
    print("The person is diabetic")

The person is diabetic


