<a href="https://colab.research.google.com/github/osrswati/Predicting-Diabetes-using-Python/blob/main/Data_Processing_for_Prediction_Diabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preparation

## Importing Libraries

In [None]:
# Importing necessary libraries
import pandas as pd                                   # Library used for working with data sets and perform data analysis.
import numpy as np                                    # To perform mathematical operations and statistics
from sklearn.preprocessing import StandardScaler      # to scale different attributes
from sklearn.model_selection import train_test_split  # for splitting the dataset into training and testing sets
from sklearn.metrics import accuracy_score            # for evaluating model performance
from sklearn.svm import SVC                           # for Support Vector Machine algorithm
import warnings # Disable specific warning category

In [None]:
# Report the versions of pandas and its dependencies for reproducibility
# (plain-text report, which is the default output format).

pd.show_versions()


INSTALLED VERSIONS
------------------
commit           : 2e218d10984e9919f0296931d92ea851c6a6faf5
python           : 3.10.12.final.0
python-bits      : 64
OS               : Linux
OS-release       : 6.1.58+
Version          : #1 SMP PREEMPT_DYNAMIC Sat Nov 18 15:31:17 UTC 2023
machine          : x86_64
processor        : x86_64
byteorder        : little
LC_ALL           : en_US.UTF-8
LANG             : en_US.UTF-8
LOCALE           : en_US.UTF-8

pandas           : 1.5.3
numpy            : 1.25.2
pytz             : 2023.4
dateutil         : 2.8.2
setuptools       : 67.7.2
pip              : 23.1.2
Cython           : 3.0.9
pytest           : 7.4.4
hypothesis       : None
sphinx           : 5.0.2
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : 4.9.4
html5lib         : 1.1
pymysql          : None
psycopg2         : 2.9.9
jinja2           : 3.1.3
IPython          : 7.34.0
pandas_datareader: 0.10.0
bs4              : 4.12.3
bottleneck       : None


## Load Dataset

Since the dataset is a CSV file stored in Google Drive, we will download it using `gdown`.

In [None]:
# Let's download the data files using gdown
# gdown is not part of the notebook's import cell, so import it here;
# without this the cell raises NameError on a fresh kernel.
import gdown

url = "https://drive.google.com/file/d/1K3Mo9C7xCVjSgghdRJoy806_HR86q8Qs/view?usp=sharing"
output = "diabetes_dataset.csv"
# fuzzy=True lets gdown extract the file ID from a Drive "view" sharing link
gdown.download(url, output, quiet=False, fuzzy=True)

Downloading...
From: https://drive.google.com/uc?id=1K3Mo9C7xCVjSgghdRJoy806_HR86q8Qs
To: /content/diabetes_dataset.csv
100%|██████████| 23.9k/23.9k [00:00<00:00, 38.4MB/s]


'diabetes_dataset.csv'

In [None]:
# Read the CSV through the same relative path used for the download and for
# files.download later on, instead of a hard-coded absolute '/content/...' path
# that only exists when the working directory is Colab's default.
df = pd.read_csv('diabetes_dataset.csv')

# Preview the first three rows
df.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


## Data Dictionary

In [None]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


The Pima Indians Diabetes Database contains:

* **Pregnancies**: Number of times pregnant.
* **Glucose**: Plasma glucose concentration at 2 hours in an oral glucose tolerance test.
* **BloodPressure**: Diastolic blood pressure (mm Hg).
* **SkinThickness**: Triceps skin fold thickness (mm).
* **Insulin**: 2-Hour serum insulin (mu U/ml).
* **BMI**: Body mass index (weight in kg/(height in m)^2).
* **DiabetesPedigreeFunction**: Diabetes pedigree function (a function which scores likelihood of diabetes based on family history).
* **Age**: Age in years.
* **Outcome**: Class variable (0 or 1) indicating whether the individual has diabetes (1) or not (0).


In [None]:
# DataFrame.shape is (number of rows, number of columns); unpack in that order
# so each count is reported under the right label.
rows, columns = df.shape

print(f'Total colums: {columns} \nTotal rows: {rows}')

Total colums: 768 
Total rows: 9


# Extract Label and Attributes

In [None]:
# List the column names to identify the feature columns and the label column
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [None]:
# Class balance of the label; normalize=True returns proportions instead of counts
df['Outcome'].value_counts(normalize = True)

0    0.651042
1    0.348958
Name: Outcome, dtype: float64

In [None]:
# Work on an independent (deep) copy so the originally loaded frame stays untouched

df_copy = df.copy(deep=True)

In [None]:
# Feature table: every column except the 'Outcome' label
# (columns= already selects the axis, so no separate axis argument is needed)

df_attributes = df_copy.drop(columns=['Outcome'])

df_attributes.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32


In [None]:
# Label for predictions: the 'Outcome' column (1 = has diabetes, 0 = does not)

df_label = df_copy['Outcome']

df_label.head(3)

0    1
1    0
2    1
Name: Outcome, dtype: int64

# Data Preprocessing (Standard Scaler)

In [None]:
# Names of the feature columns to standardize, plus a working copy of the
# feature table that will receive the scaled values

columns_attributes = list(df_attributes.columns)

df_standarized = df_attributes.copy(deep=True)

In [None]:
# Standardize every feature column with StandardScaler
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test rows contribute to the fitted statistics — consider
# fitting on the training portion only.
ss_scaler = StandardScaler()

scaled_values = ss_scaler.fit_transform(df_standarized[columns_attributes])
df_standarized[columns_attributes] = scaled_values

df_standarized

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.848324,0.149641,0.907270,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.233880,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.907270,0.765836,1.409746,5.484909,-0.020496
...,...,...,...,...,...,...,...,...
763,1.827813,-0.622642,0.356432,1.722735,0.870031,0.115169,-0.908682,2.532136
764,-0.547919,0.034598,0.046245,0.405445,-0.692891,0.610154,-0.398282,-0.531023
765,0.342981,0.003301,0.149641,0.154533,0.279594,-0.735190,-0.685193,-0.275760
766,-0.844885,0.159787,-0.470732,-1.288212,-0.692891,-0.240205,-0.371101,1.170732


# Splitting Dataset (Train and Test)

In [None]:
# Start from the full standardized feature table; the split in the following
# cell rebinds this name to the training portion only.
features_train = df_standarized

# Full label column; likewise rebound to the training labels by the split.
target_train = df_label

In [None]:
# Split from the full tables (df_standarized / df_label) rather than from
# features_train / target_train, which this very statement overwrites.
# Re-running the cell then always yields the same 70/30 partition instead of
# splitting the already-reduced training set again.
features_train, features_test, target_train, target_test = train_test_split(
    df_standarized, df_label, test_size=0.3, random_state=11
)

In [None]:
# Shapes of the full feature table and of each split
rows_columns_original = df_attributes.shape
rows_columns_train = features_train.shape
rows_columns_test = features_test.shape

# DataFrame.shape is (rows, columns), so the labels name the values in that order
print(f'''
  Total rows and columns dataset: {rows_columns_original}
  Total rows and columns train: {rows_columns_train}
  Total rows and columns test: {rows_columns_test}
''')


  Total columns and rows dataset: (768, 8)
  Total colums and rows train: (537, 8)
  Total colums and rows test: (231, 8)



# Build SVM Model

In [None]:
# Linear-kernel support vector classifier trained on the training split
svm_model = SVC(kernel='linear')
svm_model.fit(features_train, target_train)

# fit() returns the estimator itself, so train_model is the same fitted object as svm_model
train_model = svm_model

In [None]:
# Accuracy of the fitted model on the data it was trained on

train_prediction = train_model.predict(features_train)

# accuracy_score's signature is (y_true, y_pred): ground-truth labels go first
data_accuracy_train = accuracy_score(target_train, train_prediction)

print(f'Accuracy data training is {data_accuracy_train * 100:.2f}%')

Accuracy data training is 78.40%


In [None]:
# Accuracy of the fitted model on the held-out test split

test_prediction = train_model.predict(features_test)

# accuracy_score's signature is (y_true, y_pred): ground-truth labels go first
data_accuracy_test = accuracy_score(target_test, test_prediction)

print(f'Accuracy data test is {data_accuracy_test * 100:.2f}%')

Accuracy data test is 77.49%


# Testing Model

In [None]:
# Pick a few rows of the original dataset: selected glucose readings
# combined with an insulin value of at most 30
glucose_values = [89, 183, 126, 93]
filter_glucose = df['Glucose'].isin(glucose_values)
filter_insulin = df['Insulin'].le(30)

df.loc[filter_glucose & filter_insulin].head(4)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
2,8,183,64,0,0,23.3,0.672,32,1
72,13,126,90,0,0,43.4,0.583,42,1
491,2,89,90,30,0,33.5,0.292,42,0
585,1,93,56,11,0,22.5,0.417,22,0


In [None]:
# index,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
# 2,8,183,64,0,0,23.3,0.672,32,1
# 72,13,126,90,0,0,43.4,0.583,42,1
# 491,2,89,90,30,0,33.5,0.292,42,0
# 585,1,93,56,11,0,22.5,0.417,22,0

In [None]:
# One sample to classify; these values match row 0 of the dataset shown by df.head(3)
input_data = (6, 148, 72, 35, 0, 33.6, 0.627, 50) #attributes values outcome = 1, have diabetes

input_data_convert = np.array(input_data)

# A single sample must be 2-D: one row, as many columns as there are features
input_data_reshape = input_data_convert.reshape(1,-1)

# The scaler was fitted on a DataFrame, so hand it a DataFrame with the same
# column names; a bare array makes scikit-learn warn that X has no valid
# feature names.
input_data_frame = pd.DataFrame(input_data_reshape, columns=columns_attributes)

input_data_standarized = ss_scaler.transform(input_data_frame)

input_data_standarized





array([[ 0.63994726,  0.84832379,  0.14964075,  0.90726993, -0.69289057,
         0.20401277,  0.46849198,  1.4259954 ]])

In [None]:
# The model was fitted on a DataFrame, so predict on one carrying the same
# column names. This removes scikit-learn's "X does not have valid feature names"
# UserWarning at its source instead of silencing every UserWarning; the previous
# filter was also installed only after predict() had already run.
input_features = pd.DataFrame(input_data_standarized, columns=columns_attributes)

prediction = svm_model.predict(input_features)

In [None]:
# Compare the model's answer for the sample with its known label

# The sample fed to the model is labelled as diabetic, so class 1 is the expected answer
expected_outcome = 1 #value testing diabetes
message = (
    'Your prediction is right'
    if prediction[0] == expected_outcome
    else 'Your prediction is not yet accurate'
)
print(message)


Your prediction is right


# Save your model

In [None]:
import pickle
from google.colab import files

In [None]:
file_import = 'diabetes_model.sav'

# Use a context manager so the file handle is flushed and closed as soon as
# the model has been written.
# NOTE(review): only the classifier is saved; ss_scaler is not, yet the model
# was trained on standardized features — persist the scaler as well if the
# model is going to be loaded elsewhere.
with open(file_import, 'wb') as model_file:
    pickle.dump(svm_model, model_file)

In [None]:
# Send the dataset file from the Colab runtime to the local machine via the browser
files.download('diabetes_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Send the pickled model file from the Colab runtime to the local machine via the browser
files.download('diabetes_model.sav')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>