# **Implementation of SVM on Pima Indians Diabetes Database**

# **Importing the required libraries**

In [None]:
import numpy as np   # Importing the numpy library for numerical computations
import pandas as pd  # Importing the pandas library for data manipulation and analysis
import matplotlib.pyplot as plt  # Importing the matplotlib library for plotting graphs
import seaborn as sns  # Importing the seaborn library for enhanced data visualization
from sklearn.preprocessing import StandardScaler  # Importing StandardScaler from sklearn.preprocessing module
from sklearn.model_selection import train_test_split  # Importing train_test_split from sklearn.model_selection module
from sklearn.metrics import accuracy_score  # Importing accuracy_score from sklearn.metrics module
from sklearn.svm import SVC  # Importing SVC (Support Vector Classifier) from sklearn.svm module
from sklearn.metrics import confusion_matrix    # Importing the confusion_matrix function for evaluating classification results
from sklearn.metrics import classification_report   # Importing the classification_report function for detailed classification metrics

# **Importing our dataset**

In [None]:
# Load the Pima Indians Diabetes CSV from the Kaggle input directory into the DataFrame `data`
data = pd.read_csv(filepath_or_buffer='/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

In [None]:
# Preview the top of the dataset (head() returns 5 rows by default)
data.head()

In [None]:
# Preview the bottom of the dataset (tail() returns 5 rows by default)
data.tail()

# **Understanding our data**

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple
print("Rows and Columns of the dataset :- ", (len(data.index), len(data.columns)))

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
# to spot data-quality problems such as missing entries or unexpected types
data.info()

In [None]:
# Show the column labels of the dataset (features plus the 'Outcome' target)
data.columns

### **Columns or Features Description**

- 'Pregnancies': Number of times pregnant
- 'Glucose': Plasma glucose concentration in a 2-hour oral glucose tolerance test
- 'BloodPressure': Diastolic blood pressure (mm Hg)
- 'SkinThickness': Triceps skinfold thickness (mm)
- 'Insulin': 2-Hour serum insulin (mu U/ml)
- 'BMI': Body mass index (weight in kg / (height in meters)^2)
- 'DiabetesPedigreeFunction': Diabetes pedigree function (a measure of the genetic influence)
- 'Age': Age (years)
- 'Outcome': Binary variable indicating whether a person has diabetes or not (0 - No, 1 - Yes)

In [None]:
# Summary statistics for every column, regardless of dtype
data.describe(include="all")

# **Checking NULL Values**

In [None]:
# Number of missing (NaN) entries per column; isna() is the canonical name of isnull()
data.isna().sum()

From the output 'data.isnull().sum()' where all the columns show a count of 0, we can infer that there are no missing values (null values) in any of the columns of the DataFrame data.

This is a positive indication, as missing values can lead to issues or inaccuracies in data analysis and modeling. Note, however, that `isnull()` only detects entries stored as NaN: in this dataset, values of 0 in columns such as Glucose, BloodPressure, SkinThickness, Insulin, or BMI are not physiologically plausible and may stand for measurements that were never recorded, so they deserve a separate check before the data is treated as fully complete.

# **Checking distribution of Target Variable**

In [None]:
# Frequency of each class label in the 'Outcome' target column
data.loc[:, "Outcome"].value_counts()

This indicates that in the 'Outcome' column there are 500 occurrences of the value 0 and 268 occurrences of the value 1. As described in the column list above, 0 means the person does not have diabetes and 1 means the person has diabetes, so the classes are imbalanced: roughly 65% non-diabetic versus 35% diabetic.

In [None]:
# Bar chart of how many rows fall into each 'Outcome' class
ax = sns.countplot(x='Outcome', data=data)

# Axis labels and title, applied in one call on the returned Axes
ax.set(
    xlabel='0 -> No diabetes   1 -> Diabetes',
    ylabel='Count',
    title='Distribution of the target variable',
)

# Render the figure
plt.show()

In [None]:
# Per-class average of every feature: one row per 'Outcome' value
data.groupby(by="Outcome").agg("mean")

# **Performing EDA**

In [None]:
# Histogram of the 'Age' column, drawn on an explicit Axes object
fig, ax = plt.subplots()
ax.hist(data['Age'])

# Label both axes
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')

# Render the figure
plt.show()

In [None]:
# Scatter of 'Glucose' (y) against 'BMI' (x), drawn on an explicit Axes object
fig, ax = plt.subplots()
ax.scatter(data['BMI'], data['Glucose'])

# Axis labels and title in a single call
ax.set(xlabel='BMI', ylabel='Glucose', title='Scatter Plot: BMI vs. Glucose')

# Render the figure
plt.show()

In [None]:
# Pairwise correlation between all columns of the dataset
corr_matrix = data.corr()

# Annotated heatmap of the correlation matrix on a 10x8 inch figure
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='YlGnBu', ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Grid of pairwise relationships between all variables, coloured by 'Outcome'
sns.pairplot(data, hue='Outcome')

# pairplot draws a whole grid of subplots, so plt.title() would only label a
# single subplot of that grid. Use a figure-level title instead; y slightly
# above 1 keeps it from overlapping the top row of panels.
plt.suptitle('Pair Plot', y=1.02)

# Display the pair plot
plt.show()

In [None]:
# 8x6 inch canvas with a single Axes for the violin plot
fig, ax = plt.subplots(figsize=(8, 6))

# Distribution of 'Glucose' within each 'Outcome' class
sns.violinplot(x='Outcome', y='Glucose', data=data, ax=ax)

# Axis labels and title in a single call
ax.set(xlabel='Outcome', ylabel='Glucose', title='Violin Plot: Glucose by Outcome')

# Render the figure
plt.show()

In [None]:
# 8x6 inch canvas for the grouped count plot
plt.figure(figsize=(8, 6))

# One group of bars per 'Outcome' class, split by number of pregnancies
ax = sns.countplot(x='Outcome', hue='Pregnancies', data=data)

# Axis labels and title, applied on the returned Axes
ax.set_xlabel('Outcome')
ax.set_ylabel('Count')
ax.set_title('Bar Plot: Pregnancies Counts by Outcome')

# Render the figure
plt.show()

# **Splitting our dataset**

In [None]:
# Feature matrix: every column except the target
# (`columns=` already selects the column axis, so no `axis` argument is needed)
X = data.drop(columns="Outcome")

# Target vector: the 'Outcome' column
Y = data["Outcome"]

In [None]:
# Display the feature variables (all columns of `data` except 'Outcome')
X

In [None]:
# Display the target variable (the 'Outcome' column of `data`)
Y

# **Data Standardization**

In [None]:
# Create the StandardScaler instance; it is fitted and applied in the following
# cells and reused later to scale new input in the predictive system
scaler = StandardScaler()

In [None]:
# Fit the StandardScaler to the feature variables (X) so it learns the
# statistics used for standardization.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later in this notebook, so statistics of the test
# rows leak into preprocessing. Consider fitting on the training split only.
scaler.fit(X)

In [None]:
# Apply the fitted StandardScaler to the feature variables (X) and keep the
# result under a separate name; the last line displays it
standardized_data = scaler.transform(X)
standardized_data

In [None]:
# Rebind X to the standardized feature values (the output of scaler.transform).
# NOTE: after this cell X no longer refers to the raw feature DataFrame, so
# re-running the earlier fit/transform cells would operate on already
# standardized data.
X = standardized_data

# Assigning the 'Outcome' column of the `data` DataFrame to Y
# (same value Y already received when the features and target were separated)
Y = data['Outcome']

In [None]:
# Display the feature values after standardization
X

In [None]:
# Display the target variable (the 'Outcome' column; not affected by scaling)
Y

# **Train Test Split**

In [None]:
# Split the data into training and testing sets
# X_train: training feature variables
# X_test: testing feature variables
# Y_train: training target variable
# Y_test: testing target variable
# The data is split using a test size of 0.2 (20% of the data) and a random state of 2 for reproducibility.
# stratify=Y keeps the class proportions of 'Outcome' the same in both splits;
# the classes are imbalanced (500 vs 268), so an unstratified split can leave
# the test set with a noticeably different class ratio.

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=2
)

In [None]:
# Report the shape of the full feature matrix and of each split
for label, array in (("X", X), ("X_train", X_train), ("X_test", X_test)):
    print(f"Shape of {label}:", array.shape)

# **Training our Model**

In [None]:
# Create the Support Vector Classifier (SVC) used as the model in this notebook.
# kernel='rbf' selects a radial basis function kernel; all other
# hyperparameters are left at their library defaults.
# random_state is fixed to 0 for reproducibility.
# NOTE(review): in scikit-learn's SVC, random_state appears to matter only when
# probability estimates are enabled - confirm whether it has any effect here.
SVM = SVC(kernel='rbf', random_state=0)

In [None]:
# Train the SVM classifier on the training features and training labels
SVM.fit(X_train, Y_train)

# **Model Evaluation**

## **Accuracy Score of Training data**

In [None]:
# Accuracy on the training split: fraction of training rows whose predicted
# label matches the true label
training_data_accuracy = accuracy_score(Y_train, SVM.predict(X_train))

# Report the training accuracy
print('Accuracy score of the training data:', training_data_accuracy)

## **Accuracy Score of Testing data**

In [None]:
# Accuracy on the held-out split: fraction of test rows whose predicted label
# matches the true label
testing_data_accuracy = accuracy_score(Y_test, SVM.predict(X_test))

# Report the testing accuracy
print('Accuracy score of the testing data:', testing_data_accuracy)

# **Confusion Matrix**

In [None]:
# Predicting the target variable for the testing data
y_predict = SVM.predict(X_test)

# Computing the confusion matrix (rows: true class, columns: predicted class).
# It is printed explicitly: a notebook cell only auto-displays its last
# expression, so a bare call here would be computed and silently discarded.
print(confusion_matrix(Y_test, y_predict))

# Creating a cross-tabulation table of true vs. predicted labels,
# with row/column totals (margins); displayed as the cell output
pd.crosstab(Y_test, y_predict, rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
# Confusion matrix of true vs. predicted labels on the test split
cnf_matrix = confusion_matrix(Y_test, y_predict)

# Annotated heatmap of the matrix on an 8x6 inch figure
fig, ax = plt.subplots(figsize=(8, 6))
p = sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g', ax=ax)

# Title (raised slightly above the axes) and axis labels
ax.set_title('Confusion Matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

# Render the figure
plt.show()

# **Classification Report**

In [None]:
# Per-class precision, recall, F1-score and support on the test split
print(classification_report(y_true=Y_test, y_pred=y_predict))

# **Predictive System**

In [None]:
# Feature values for a single person, in the same column order as the
# training features (every column of `data` except 'Outcome')
input_data = (5, 116, 74, 0, 175, 25.8, 0.587, 51)

# Wrap the values in a one-row DataFrame that carries the training feature
# names. The scaler was fitted on a DataFrame, so passing a bare NumPy array
# triggers scikit-learn's "X does not have valid feature names" warning and
# relies on implicit column order.
feature_names = data.columns.drop('Outcome')
input_frame = pd.DataFrame([input_data], columns=feature_names)

# Standardize the input with the same scaler that was applied to the model's features
std_data = scaler.transform(input_frame)
print("Standardized input data:", std_data)

# Make the prediction using the SVM model
prediction = SVM.predict(std_data)
print("Prediction:", prediction)

# Print the result based on the prediction (0 -> not diabetic, anything else -> diabetic)
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')