# Project Name : Diabetes Prediction Using Machine Learning

#                                                                                                          By Arsh Mishra

![](img.png)

# Loading All The Libraries And Dataset

In [1]:
#importing all the required libraries 
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#loading the dataset
dataset = pd.read_csv('datasets/diabetes.csv')

In [3]:
# showing the first 5 rows
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Feature Engineering

In [4]:
# renaming the column names 
dataset.rename(columns ={'Pregnancies':'Preg','BloodPressure':'BP','SkinThickness':'ST','DiabetesPedigreeFunction':'DPF'},inplace=True)

In [5]:
# checking whether the columns are renamed
dataset.head()

Unnamed: 0,Preg,Glucose,BP,ST,Insulin,BMI,DPF,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Now checking for the null values

dataset.isnull().sum()

Preg       0
Glucose    0
BP         0
ST         0
Insulin    0
BMI        0
DPF        0
Age        0
Outcome    0
dtype: int64

In [7]:
# since all of the counts are 0, there are no null values present

In [8]:
# The number of classes present in the Outcome variables are :
dataset.Outcome.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [9]:
# So we have 500 0s and 268 1s -> the classes are unevenly represented, i.e. this is an imbalanced dataset

In [10]:
# 1 -> diabetes

# 0 -> non diabetic

In [11]:
# checking for 0 values: if a column contains a large number of 0s, it will impact the model accuracy

In [None]:
# taking all the rows and columns(except the Pregnancies and Outcome)
dataset.iloc[:,1:-1].hist(bins=25,figsize=(20,15))
plt.show()

In [None]:
# so from the above histograms we can analyze

# There are a lot of 0s present
# Glucose, BMI, BP, Insulin, ST

In [None]:
# now we will see in terms of values
dataset.iloc[:,1:-1].isin([0]).sum()

In [None]:
# displaying the statistical information
dataset.iloc[:,1:-3].describe()

In [None]:
# since the mean and median values are approximately similar, we can replace the 0 (missing) values with the median value

In [None]:
# 0 is treated in this notebook as a placeholder for a missing measurement in
# these columns, so the median is taken over the non-zero entries only;
# including the 0s would pull the replacement value towards 0
for col in ['Glucose','BP','ST','Insulin','BMI']:
    observed_median = dataset.loc[dataset[col] != 0, col].median()
    dataset[col] = dataset[col].replace({0: observed_median})

In [None]:
# now we will see in terms of values
dataset.iloc[:,1:-1].isin([0]).sum()

In [None]:
# now no more 0 values are present

In [None]:
dataset.shape

# Feature Selection

In [None]:
# feature selection using random forest
from sklearn.ensemble import RandomForestClassifier

In [None]:
x =dataset.iloc[:,:-1].values # except the Outcome values

y =dataset.Outcome.values #only the Outcome values

In [None]:
x.shape


In [None]:
y.shape

In [None]:
rf = RandomForestClassifier(n_estimators=20,random_state=11)

In [None]:
# so the object is created

In [None]:
# fitting the data in the object
rf.fit(x,y)

In [None]:
feature_scores = rf.feature_importances_

In [None]:
feature_scores

In [None]:
# show each predictor column next to the importance score the random forest gave it
for column_name, importance in zip(dataset.columns[:-1], feature_scores):
    print(column_name, "<--->", importance)

In [None]:
# so we will keep the features with the highest importance scores
dataset_new = dataset[['Glucose','BMI','Age','DPF','Insulin']]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# splitting the data
# stratify on the Outcome so the imbalanced 0/1 ratio is preserved in both the train and the test part
xtrain,xtest,ytrain,ytest = train_test_split(dataset_new.values,dataset.Outcome.values,test_size=.3,random_state=30,stratify=dataset.Outcome.values)

In [None]:
xtrain.shape

# Scaling The Data

In [None]:
# we will perform feature scaling now 
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
xtrain_transformed = scaler.fit_transform(xtrain)
xtest_transformed  = scaler.transform(xtest)

In [None]:
# creating the logistic regression model
Log_model = LogisticRegression(C=10)

In [None]:
Log_model.fit(xtrain_transformed,ytrain)

In [None]:
train_score = Log_model.score(xtrain_transformed,ytrain)
test_score = Log_model.score(xtest_transformed,ytest)

In [None]:
print("The Training Score is :",train_score)
print("The Testing Score is : ",test_score)

In [None]:
# creating the test cases
dataset.columns

In [None]:
test_case1 = np.array([150,32,35,0.627,0])

In [None]:
test_case1

In [None]:
type(test_case1)

In [None]:
test_case1.shape

In [None]:
test_case1=test_case1.reshape(1,5)

In [None]:
test_case1.shape

In [None]:
type(test_case1)

In [None]:
# converting it into pandas dataframe

In [None]:
test_data = pd.DataFrame(test_case1)

In [None]:
test_data

# Prediction

In [None]:
# Log_model was fitted on StandardScaler-transformed features, so the raw
# test row has to go through the same fitted scaler before predicting
Log_model.predict(scaler.transform(test_data))

In [None]:
def predict_model(feature_list):
    """Predict the diabetes outcome for one person, print it and return it.

    Parameters
    ----------
    feature_list : array-like of 5 numbers
        Raw (unscaled) feature values in the same column order used to build
        ``dataset_new``: Glucose, BMI, Age, DPF, Insulin.

    Returns
    -------
    numpy.ndarray
        The class label(s) predicted by ``Log_model``
        (1 -> diabetes, 0 -> non diabetic).
    """
    # accept plain lists/tuples as well as numpy arrays, and force a single row
    feature_list = np.asarray(feature_list, dtype=float).reshape(1,5)
    # Log_model was fitted on StandardScaler-transformed data, so the raw
    # values must be scaled with the same fitted scaler before predicting
    feature_list = scaler.transform(feature_list)
    output = Log_model.predict(feature_list)
    print("The chances of the person having diabetes are : ",output)
    return output
 
    

In [None]:
test_case2 = np.array([140,22,30,0.600,1])
predict_model(test_case2)

In [None]:
test_case3 = np.array([120,11,10,0.200,0])
predict_model(test_case3)