<a href="https://colab.research.google.com/github/oendnsk675/svm-diabetes/blob/main/UAS_Data_Mining_Prediksi_Diabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Core data-handling libraries used throughout the notebook.
import pandas as pd
import numpy as np
# Colab-only helper: mounts Google Drive so the dataset file can be read from it.
from google.colab import drive
drive.mount('/content/drive')
# Location of the diabetes CSV inside the mounted Drive.
path = '/content/drive/MyDrive/dataset/diabetes.csv'

In [None]:
# Load the CSV at `path` into the working DataFrame `df`.
df = pd.read_csv(path)

In [None]:
# Preview the first rows of the loaded data
# (the class balance is inspected later with df.Outcome.value_counts()).
df.head()

In [None]:
# Select the column labels at positions 1..5 (positional slice, end exclusive);
# these are the columns whose zero values get replaced in the preprocessing step.
# NOTE(review): presumably Glucose, BloodPressure, SkinThickness, Insulin and BMI,
# as listed in the preprocessing notes — confirm against the CSV header.
columns = df.columns[1:6]
columns

In [None]:
# Show the first three rows.
df.head(3)

# Data Preprocessing

### Tangani anomali, seperti:


1. Glucose yang bernilai 0, karena tidak mungkin, diganti dengan median
2. BloodPressure yang bernilai 0, karena tidak mungkin, diganti dengan median
3. SkinThickness yang bernilai 0, karena tidak mungkin, diganti dengan median
4. Insulin yang bernilai 0, karena tidak mungkin, diganti dengan median
5. BMI (body mass index) yang bernilai 0, karena tidak mungkin, diganti dengan median
6. Pregnancies (kolom ini kita hapus karena tidak menjelaskan apakah pasien sedang hamil atau tidak)




#### step 1 - 5

In [None]:
def replace_zero_value(column, frame=None):
  """Replace zeros in ``column`` with the median of its non-zero values.

  Zeros in these measurement columns stand for missing readings, so they must
  not take part in the median: including them drags the median down and, when
  at least half of the column is zero, makes the median itself 0, turning the
  replacement into a no-op.

  Args:
    column: Label of the column to clean.
    frame: DataFrame to modify in place. Defaults to the notebook-level ``df``
      so existing ``replace_zero_value(column)`` calls keep working.

  Returns:
    None. The target DataFrame's column is overwritten.
  """
  target = df if frame is None else frame
  series = target[column]

  # Median over the valid (non-zero) readings only.
  valid = series[series != 0]
  if valid.empty:
    # Nothing valid to derive a median from; leave the column untouched.
    return

  target[column] = series.replace(0, valid.median())

# Apply the zero replacement to every column selected in `columns`.
for column in columns:
  replace_zero_value(column)

In [None]:
df[df['BloodPressure'] == 0] # check whether the zero values have been replaced

#### step 6

In [None]:
# Remove the Pregnancies feature from the working frame.
# errors='ignore' makes the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df.drop(columns='Pregnancies', inplace=True, errors='ignore')

## EDA (EXPLORATORY DATA ANALYSIS)

### Check Outlier

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
fig, ax = plt.subplots(2, 4, figsize=(20,10))

# Arrange the first eight column names on the same 2x4 grid as the axes.
columns_2d = np.array(df.columns[:8]).reshape(2, 4)

# Walk axes and column names in lockstep (row-major order); the label column
# itself is not plotted, so its grid slot stays empty.
for axis, feature in zip(ax.ravel(), columns_2d.ravel()):
  if feature != 'Outcome':
    sns.boxplot(y=df[feature], x=df['Outcome'], hue=df['Outcome'], color='coral', ax=axis)

In [None]:
# Outlier fences for Glucose using the 1.5 * IQR rule.
q1 = df.Glucose.quantile(0.25)
q3 = df.Glucose.quantile(0.75)
iqr = q3 - q1

lower = q1 - 1.5 * iqr
# The upper fence is measured from the third quartile; measuring it from q1
# puts it far too low and flags ordinary readings as outliers.
upper = q3 + 1.5 * iqr

In [None]:
# Show the rows whose Glucose value exceeds the `upper` bound computed in the previous cell.
df.loc[(df['Glucose'] > upper) ]

#### Kesimpulan dari data outlier
Data outlier di atas kita biarkan karena memang demikian bentuk data aslinya; misalnya outlier pada glucose merupakan nilai yang sebenarnya untuk orang yang mengalami diabetes.

### Check correlation between column

In [None]:
# Heatmap of the pairwise column correlations returned by df.corr(),
# with each coefficient annotated inside its cell.
fig, ax = plt.subplots(figsize=(13, 7))
sns.heatmap(df.corr(), annot=True, ax=ax, fmt='g',cmap='BuGn')

#### Kesimpulan correlation
*   Terlihat kolom glucose merupakan feature yang paling berpengaruh dalam menentukan apakah pasien terkena penyakit diabetes atau tidak.
*   Glucose juga sangat erat kaitannya dengan insulin. Hal ini memang benar adanya: jika insulin terganggu, maka glucose seseorang akan mengalami gangguan, karena insulin adalah hormon yang disekresi dari pankreas dan dibutuhkan dalam proses metabolisme glukosa. Saat insulin tidak bekerja sebagaimana fungsinya, terjadi penumpukan glukosa di sirkulasi darah atau hiperglikemia.



# Modeling

In [None]:
# Positional split of the working frame: the first seven columns are the features.
# NOTE(review): this relies on the column order after the preprocessing cells
# have run — confirm the layout if those cells change.
x = df.iloc[:,0:7]
# Column index 7 is kept as a one-column DataFrame (slice 7:8 rather than a
# Series); later cells read it as `y.Outcome`.
y = df.iloc[:,7:8]

## Handle class imbalance
Atasi class yang tidak balance menggunakan metode SMOTEENN

In [None]:
# Count the rows per Outcome class to see how imbalanced the labels are.
df.Outcome.value_counts()

In [None]:
from imblearn.combine import SMOTEENN

# SMOTE-ENN is stochastic; seed it (same seed as the rest of the notebook) so
# the resampled data and every metric derived from it are reproducible across
# Restart & Run All.
SE = SMOTEENN(random_state=0)
x_se, y_se = SE.fit_resample(x, y)

# Compare the class distribution before and after resampling.
print('Data Before Sampling:\n', y.Outcome.value_counts(), "\n")
print('Data After Sampling:\n', y_se.Outcome.value_counts())

## Membagi data training dengan data testing
Membagi dataset menjadi data training dan data testing sangatlah penting supaya data yang digunakan untuk melatih model berbeda dengan data yang digunakan saat menguji model. Hal ini menghindari model yang overfitting, yaitu model yang terlihat bagus padahal sebenarnya tidak, dikarenakan data yang digunakan ketika training sama dengan ketika testing.

In [None]:
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split

# Hold out the test set from the ORIGINAL data first.
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=0)

# Resample only the training portion. Resampling the full dataset before the
# split leaks information: synthetic points interpolated from (future) test
# rows end up in training, and the test set itself becomes synthetic/cleaned,
# which inflates the reported scores.
xse_train, yse_train = SMOTEENN(random_state=0).fit_resample(x_train, y_train)

# Both models are evaluated on the same untouched hold-out rows, so the
# "before" and "after" SMOTE-ENN metrics are directly comparable.
xse_test, yse_test = x_test, y_test

## Training model

In [None]:
from sklearn.svm import SVC

# Baseline: linear-kernel SVM fitted on the original (non-resampled) training split.
# The labels are passed as a 1-D array; Series.to_numpy() replaces
# Series.ravel(), which is deprecated since pandas 2.2.
model = SVC(kernel='linear')
model.fit(x_train, y_train.Outcome.to_numpy())

# Same estimator fitted on the SMOTE-ENN training split, for comparison.
model_se = SVC(kernel='linear')
model_se.fit(xse_train, yse_train.Outcome.to_numpy())

## Model Predict

In [None]:
#Predict the response for test dataset
y_pred = model.predict(x_test)

yse_pred = model_se.predict(xse_test)

## Check accuracy, presisi, recall

### check using confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the baseline model and of the SMOTE-ENN model,
# each computed against its own test labels.
cm = confusion_matrix(y_test, y_pred)
cm_se = confusion_matrix(yse_test, yse_pred)

# Uncomment to print the raw matrices:
# display(cm)
# display(cm_se)

#### visualization confusion matrix

In [None]:
# Side-by-side confusion matrices: baseline model (left) vs. SMOTE-ENN model (right).
fig, ax = plt.subplots(1, 2, figsize=(13, 4))

ax[0].title.set_text("Before SMOTE-ENN")
cm_display = ConfusionMatrixDisplay(cm, display_labels=[False, True]).plot(ax=ax[0])

ax[1].title.set_text("After SMOTE-ENN")
cm_displayse = ConfusionMatrixDisplay(cm_se, display_labels=[False, True]).plot(ax=ax[1])

plt.show()

### check using classification report

In [None]:
from sklearn.metrics import classification_report
from sklearn import metrics
# Per-class precision/recall/F1 table for the SMOTE-ENN model only.
print(classification_report(yse_test, yse_pred))
# SVM results: baseline model (before SMOTE-ENN)
print("Akurasi SVM sebelum SMOTE-ENN :", metrics.accuracy_score(y_test, y_pred))
print("Presisi SVM sebelum SMOTE-ENN :", metrics.precision_score(y_test, y_pred))
print("Recall SVM sebelum SMOTE-ENN :", metrics.recall_score(y_test, y_pred), "\n")

# SVM results: model trained on the SMOTE-ENN split
print("Akurasi SVM sesudah SMOTE-ENN :", metrics.accuracy_score(yse_test, yse_pred))
print("Presisi SVM sesudah SMOTE-ENN :", metrics.precision_score(yse_test, yse_pred))
print("Recall SVM sesudah SMOTE-ENN :", metrics.recall_score(yse_test, yse_pred))
# print(y_pred)

# Hyperparameter Tuning Using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
import multiprocessing

# Search space: 3 kernels x 4 C values x 2 gamma settings = 24 candidates,
# each evaluated with 5-fold cross-validation.
parameters = {
    'kernel': ['rbf', 'poly', 'sigmoid'],
    'C': [0.5, 1, 10, 100],
    'gamma': ['scale', 0.001],
}

# Run one worker per available CPU core.
cores = multiprocessing.cpu_count()


grid_search = GridSearchCV(estimator=SVC(random_state=0),
                           param_grid=parameters,
                           n_jobs= cores,
                           verbose=10,
                           scoring='accuracy',
                           cv=5
                           )

# Fit on the SMOTE-ENN training split. The labels are passed as a 1-D array;
# Series.to_numpy() replaces Series.ravel(), which is deprecated since pandas 2.2.
grid_search.fit(xse_train, yse_train.Outcome.to_numpy())

In [None]:
# Mean cross-validated accuracy of the best candidate.
print(f"Best Score: {grid_search.best_score_}")

# Read the winning values off the refitted best estimator, restricted to the
# parameters that were actually searched.
best_params = grid_search.best_estimator_.get_params()
print("Best paramters: ")
for name in parameters:
  value = best_params[name]
  print(f"\t{name}: {value}")

In [None]:
# Predict the SMOTE-ENN test split with the grid search object
# (grid_search.predict delegates to its refitted best estimator).
y_pred_grid = grid_search.predict(xse_test)
# print(classification_report(y_test, y_pred_grid))

# Accuracy on the same test split: untuned linear SVM vs. the tuned model, as percentages.
print(f"Akurasi SVM Sebelum Tunning Hyperparamter: {metrics.accuracy_score(yse_test, yse_pred) * 100}%")
print(f"Akurasi SVM Sesudah Tunning Hyperparamter: {metrics.accuracy_score(yse_test, y_pred_grid) * 100}%")

In [None]:
# Confusion matrix of the tuned model on the SMOTE-ENN test split.
cm_tun = confusion_matrix(yse_test, y_pred_grid)

# Left: untuned SMOTE-ENN model; right: tuned model.
fig, ax = plt.subplots(1, 2, figsize=(13, 4))

ax[0].title.set_text("Before Tuning")
cm_display = ConfusionMatrixDisplay(cm_se, display_labels=[False, True]).plot(ax=ax[0])

ax[1].title.set_text("After Tuning")
cm_displayse = ConfusionMatrixDisplay(cm_tun, display_labels=[False, True]).plot(ax=ax[1])

plt.show()

In [None]:
# save the model to disk
import pickle

filename = 'svm-diabet.pkl'
# Save the model to disk. The context manager guarantees the file is flushed
# and closed even if pickling fails; a bare open() inside the call leaves the
# handle dangling.
# NOTE(review): `model` is the baseline linear SVM from the training cell, not
# the SMOTE-ENN model or the tuned grid-search estimator — confirm this is the
# model that should be shipped.
with open(filename, 'wb') as model_file:
  pickle.dump(model, model_file)

In [None]:
# Reload the pickled model; this rebinds the notebook-level name `model`.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open("svm-diabet.pkl", "rb") as r:
  model = pickle.load(r)

# Smoke-test prediction on one hand-written sample with seven feature values.
# NOTE(review): presumably in the same column order as `x`, with 150 as the
# first feature — confirm. The zeros are placeholders, not realistic readings.
yy_pred = model.predict([[150, 0, 0, 0, 0, 0, 0]])

yy_pred
# metrics.accuracy_score(y_test, yy_pred)