In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [23]:
# Load the diabetes dataset into a DataFrame and preview the first five rows.
# NOTE(review): the path is an absolute, machine-specific Windows location, so this
# cell only runs where that exact folder exists — consider a configurable data directory.
df = pd.read_csv("D:\\Courses HK2 23-24\\Machine Learning\\LAB\\Week 05\\Practise\\diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,4.0,117.0,64.0,27.0,120.0,33.2,0.23,24.0,0.0
1,2.0,91.0,62.0,0.0,0.0,27.3,0.525,22.0,0.0
2,5.0,101.0,68.0,47.0,71.0,30.2,0.364,24.0,0.0
3,2.0,99.0,52.0,15.0,94.0,24.6,0.637,21.0,0.0
4,2.0,130.0,74.0,55.0,100.0,33.6,0.404,23.0,0.0



* Pregnancies: Number of times pregnant
* Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
* BloodPressure: Diastolic blood pressure (mm Hg)
* SkinThickness: Triceps skin fold thickness (mm)
* Insulin: 2-Hour serum insulin (mu U/ml)
* BMI: Body mass index (weight in kg/(height in m)^2)
* DiabetesPedigreeFunction: Diabetes pedigree function
* Age: Age (years)
* Outcome: Class variable (0 or 1)


In [24]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               2666 non-null   float64
 1   Glucose                   2661 non-null   float64
 2   BloodPressure             2666 non-null   float64
 3   SkinThickness             2666 non-null   float64
 4   Insulin                   2666 non-null   float64
 5   BMI                       2655 non-null   float64
 6   DiabetesPedigreeFunction  2666 non-null   float64
 7   Age                       2666 non-null   float64
 8   Outcome                   2666 non-null   float64
dtypes: float64(9)
memory usage: 187.6 KB


In [25]:
# Row count of the raw data, before duplicates are dropped.
len(df)

2666

In [26]:
# Remove exact duplicate rows, renumber the index from 0, then show how many rows remain.
df = df.drop_duplicates().reset_index(drop=True)
len(df)

2191

In [27]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the Outcome column.
X = df.drop(columns="Outcome")
y = df["Outcome"]

# 80/20 train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=10
)

In [28]:
X_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
537,0.0,173.0,78.0,32.0,265.0,46.5,1.159,58.0
437,1.0,117.0,60.0,23.0,106.0,33.8,0.466,27.0
952,2.0,107.0,74.0,40.0,124.0,33.6,0.404,23.0
1409,11.0,120.0,80.0,43.0,231.0,42.3,0.785,48.0
2089,11.0,120.0,80.0,43.0,187.0,44.3,0.785,48.0


In [29]:
# Helper used to persist the fitted imputers, scaler and model to disk.
def save(file, file_name):
    """
    Pickle an object to disk.

    input:
    #file: the object (e.g. a fitted imputer, scaler or model) to be saved
    #file_name: path of the file the object is written to

    The file is opened in binary write mode, so an existing file at
    ``file_name`` is overwritten.
    """
    # The context manager guarantees the handle is flushed and closed,
    # even if pickling raises part-way through.
    with open(file_name, "wb") as out_file:
        pickle.dump(file, out_file)

In [30]:
from sklearn.impute import SimpleImputer
# Step 1: in these columns a value of 0 is treated as missing and replaced using strategy="mean".
# The imputer is fitted on the training split only and then applied unchanged to the test split.
imputer = SimpleImputer(missing_values=0, strategy="mean")
zero_cols = ["BloodPressure", "Insulin", "SkinThickness"]
X_train.loc[:, zero_cols] = imputer.fit_transform(X_train[zero_cols])
X_test.loc[:, zero_cols] = imputer.transform(X_test[zero_cols])
save(imputer, "zero.sav")  # persist this imputer before the name `imputer` is rebound below

# Step 2: replace NaN entries in Glucose and BMI using strategy="mean", again fitted on the training split only.
# NOTE(review): the earlier comment here spoke of turning zeros into NaN, but no such conversion
# happens in this cell — zeros in Glucose/BMI are left as-is. Confirm whether that is intended.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
nan_cols = ["Glucose", "BMI"]
X_train.loc[:, nan_cols] = imputer.fit_transform(X_train[nan_cols])
X_test.loc[:, nan_cols] = imputer.transform(X_test[nan_cols])
save(imputer, "nan.sav")

In [31]:
# Sizes of the training and test partitions.
len(X_train), len(X_test)

(1752, 439)

In [32]:
from sklearn.preprocessing import StandardScaler
# Standardise the features using statistics learned from the training split only.
scaler = StandardScaler()
X_normalized_train = scaler.fit_transform(X_train)
# Apply the training-split mean/std to the test split. Calling fit() here would
# re-estimate the statistics on the test data and return the scaler object itself
# instead of a transformed array.
X_normalized_test = scaler.transform(X_test)

In [33]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
# Baseline model: standardise the features, then fit an SVC with its default hyperparameters.
# Scaling lives inside the pipeline, so it is fitted on X_train here and reused at predict time.
pipe_line = make_pipeline(StandardScaler(), SVC())
pipe_line.fit(X_train, y_train)

In [34]:
# Number of samples held out for evaluation.
len(X_test)

439

In [35]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the baseline pipeline on the held-out test split.
print(classification_report(y_test, pipe_line.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94       254
         1.0       0.95      0.88      0.91       185

    accuracy                           0.93       439
   macro avg       0.93      0.92      0.93       439
weighted avg       0.93      0.93      0.93       439



# Model tuning

In [36]:
# Hyperparameters to fine-tune (the SVC defaults are C=1, gamma="scale", kernel="rbf").
from sklearn.model_selection import GridSearchCV
param_grid = {"C": [0.01, 0.1, 1, 10, 100, 1000],
              "gamma": ["scale", 0.001, 0.005, 0.1]}
# NOTE(review): the StandardScaler sits outside GridSearchCV, so it is fitted once on all of
# X_train before the folds are split; putting the scaler + SVC pipeline inside GridSearchCV
# would scale per fold instead — confirm which is intended.
gridsearch = make_pipeline(StandardScaler(), GridSearchCV(SVC(), param_grid, cv=10, scoring="f1", verbose=1)) # 10-fold cross-validation, candidates scored by F1
# Further reading on SVC:
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [37]:
# Run the search on the training split: the scaler step is fitted first, then the cross-validated SVC search.
gridsearch.fit(X_train, y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


In [38]:
# Show the hyperparameter combination the grid search scored best.
# Index 1 selects the GridSearchCV step of the pipeline (index 0 is the StandardScaler).
gridsearch[1].best_params_

{'C': 100, 'gamma': 0.005}

In [39]:
# Refit the scaler + SVC pipeline with the best hyperparameters found by the grid search.
# They are read from the fitted search rather than retyped, so this cell stays in step
# with the search result if the data or the parameter grid changes.
pipe_line = make_pipeline(StandardScaler(), SVC(**gridsearch[1].best_params_))
pipe_line.fit(X_train, y_train)

In [40]:
# Test-split report for the pipeline refitted with the tuned hyperparameters,
# for comparison with the baseline report.
print(classification_report(y_test, pipe_line.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.91      0.97      0.94       254
         1.0       0.96      0.88      0.92       185

    accuracy                           0.93       439
   macro avg       0.94      0.92      0.93       439
weighted avg       0.93      0.93      0.93       439



In [41]:
import pickle
# Persist the fitted pipeline (scaler + SVC). The imputers are stored separately
# ("zero.sav" and "nan.sav") and must be applied to new data before this model.
file_name = "model.sav"
# The context manager closes the file deterministically, so the pickle is fully
# flushed to disk before the cell finishes.
with open(file_name, "wb") as model_file:
    pickle.dump(pipe_line, model_file)