# Loading required Libraries

In [1]:
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
from pyspark import SparkFiles

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pymongo
from pymongo import MongoClient
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Building SparkSession

In [2]:
spark = SparkSession \
    .builder \
    .appName("diabetes-disease-prediction") \
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.coll") \
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/test.coll") \
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1').getOrCreate()

# Dataset Fetching

### First Dataset Fetching

In [3]:
url= "https://raw.githubusercontent.com/padwalvinay291/Diabetes-Dataset/main/Dataset%20of%20Diabetes.csv"
spark.sparkContext.addFile(url)
SparkFiles.get("Dataset of Diabetes.csv")
df1=spark.read.csv("file:///"+SparkFiles.get("Dataset of Diabetes.csv"), header=True, inferSchema= True)
df1.show()

+---+---------+------+---+----+---+-----+----+---+---+---+----+----+-----+
| ID|No_Pation|Gender|AGE|Urea| Cr|HbA1c|Chol| TG|HDL|LDL|VLDL| BMI|CLASS|
+---+---------+------+---+----+---+-----+----+---+---+---+----+----+-----+
|502|    17975|     F| 50| 4.7| 46|  4.9| 4.2|0.9|2.4|1.4| 0.5|24.0|    N|
|735|    34221|     M| 26| 4.5| 62|  4.9| 3.7|1.4|1.1|2.1| 0.6|23.0|    N|
|420|    47975|     F| 50| 4.7| 46|  4.9| 4.2|0.9|2.4|1.4| 0.5|24.0|    N|
|680|    87656|     F| 50| 4.7| 46|  4.9| 4.2|0.9|2.4|1.4| 0.5|24.0|    N|
|504|    34223|     M| 33| 7.1| 46|  4.9| 4.9|1.0|0.8|2.0| 0.4|21.0|    N|
|634|    34224|     F| 45| 2.3| 24|  4.0| 2.9|1.0|1.0|1.5| 0.4|21.0|    N|
|721|    34225|     F| 50| 2.0| 50|  4.0| 3.6|1.3|0.9|2.1| 0.6|24.0|    N|
|421|    34227|     M| 48| 4.7| 47|  4.0| 2.9|0.8|0.9|1.6| 0.4|24.0|    N|
|670|    34229|     M| 43| 2.6| 67|  4.0| 3.8|0.9|2.4|3.7| 1.0|21.0|    N|
|759|    34230|     F| 32| 3.6| 28|  4.0| 3.8|2.0|2.4|3.8| 1.0|24.0|    N|
|636|    34231|     F| 31

### Schema of the Dataframe :

In [4]:
df1.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- No_Pation: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- Urea: double (nullable = true)
 |-- Cr: integer (nullable = true)
 |-- HbA1c: double (nullable = true)
 |-- Chol: double (nullable = true)
 |-- TG: double (nullable = true)
 |-- HDL: double (nullable = true)
 |-- LDL: double (nullable = true)
 |-- VLDL: double (nullable = true)
 |-- BMI: double (nullable = true)
 |-- CLASS: string (nullable = true)



### Second Dataset fetching

In [5]:
url= "https://raw.githubusercontent.com/padwalvinay291/Diabetes-Dataset/main/Diabetes2.csv"
spark.sparkContext.addFile(url)
SparkFiles.get("Diabetes2.csv")
df2=spark.read.csv("file:///"+SparkFiles.get("Diabetes2.csv"), header=True, inferSchema= True)
df2.show()

+--------------+-----------+-------+--------+--------------+---+------+------+------+----+-----------+------------+-----+---+---------------+-----------+----+----+
|Patient number|Cholesterol|Glucose|HDL Chol|Chol/HDL ratio|Age|Gender|Height|Weight| BMI|Systolic BP|Diastolic BP|waist|hip|Waist/hip ratio|   Diabetes|_c16|_c17|
+--------------+-----------+-------+--------+--------------+---+------+------+------+----+-----------+------------+-----+---+---------------+-----------+----+----+
|             1|        193|     77|      49|           3.9| 19|female|    61|   119|22.5|        118|          70|   32| 38|           0.84|No diabetes|   6|   6|
|             2|        146|     79|      41|           3.6| 19|female|    60|   135|26.4|        108|          58|   33| 40|           0.83|No diabetes|null|null|
|             3|        217|     75|      54|           4.0| 20|female|    67|   187|29.3|        110|          72|   40| 45|           0.89|No diabetes|null|null|
|             4|

### Schema of the Dataframe :

In [6]:
df2.printSchema()

root
 |-- Patient number: integer (nullable = true)
 |-- Cholesterol: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- HDL Chol: integer (nullable = true)
 |-- Chol/HDL ratio: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Height: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- Systolic BP: integer (nullable = true)
 |-- Diastolic BP: integer (nullable = true)
 |-- waist: integer (nullable = true)
 |-- hip: integer (nullable = true)
 |-- Waist/hip ratio: double (nullable = true)
 |-- Diabetes: string (nullable = true)
 |-- _c16: integer (nullable = true)
 |-- _c17: integer (nullable = true)



In [7]:
import pyspark.sql.functions as func

In [8]:
dnew=df1.withColumn("Cholesterol",  func.round(df1.Chol*18,0).cast('integer')).withColumnRenamed("AGE","Age").withColumnRenamed("CLASS","Diabetes").withColumn("HDLChol",  func.round(df1.HDL*18,0).cast('integer'))
dnew.show()

+---+---------+------+---+----+---+-----+----+---+---+---+----+----+--------+-----------+-------+
| ID|No_Pation|Gender|Age|Urea| Cr|HbA1c|Chol| TG|HDL|LDL|VLDL| BMI|Diabetes|Cholesterol|HDLChol|
+---+---------+------+---+----+---+-----+----+---+---+---+----+----+--------+-----------+-------+
|502|    17975|     F| 50| 4.7| 46|  4.9| 4.2|0.9|2.4|1.4| 0.5|24.0|       N|         76|     43|
|735|    34221|     M| 26| 4.5| 62|  4.9| 3.7|1.4|1.1|2.1| 0.6|23.0|       N|         67|     20|
|420|    47975|     F| 50| 4.7| 46|  4.9| 4.2|0.9|2.4|1.4| 0.5|24.0|       N|         76|     43|
|680|    87656|     F| 50| 4.7| 46|  4.9| 4.2|0.9|2.4|1.4| 0.5|24.0|       N|         76|     43|
|504|    34223|     M| 33| 7.1| 46|  4.9| 4.9|1.0|0.8|2.0| 0.4|21.0|       N|         88|     14|
|634|    34224|     F| 45| 2.3| 24|  4.0| 2.9|1.0|1.0|1.5| 0.4|21.0|       N|         52|     18|
|721|    34225|     F| 50| 2.0| 50|  4.0| 3.6|1.3|0.9|2.1| 0.6|24.0|       N|         65|     16|
|421|    34227|     

In [9]:
d1new=dnew.withColumnRenamed("DOB","DateOfBirth").withColumn("Chol/HDL ratio",func.round((dnew.Cholesterol/dnew.HDLChol),1)).withColumn("Glucose", func.round(((dnew.HDLChol*1.98)-4.29),0).cast('integer'))
d1new.show()

+---+---------+------+---+----+---+-----+----+---+---+---+----+----+--------+-----------+-------+--------------+-------+
| ID|No_Pation|Gender|Age|Urea| Cr|HbA1c|Chol| TG|HDL|LDL|VLDL| BMI|Diabetes|Cholesterol|HDLChol|Chol/HDL ratio|Glucose|
+---+---------+------+---+----+---+-----+----+---+---+---+----+----+--------+-----------+-------+--------------+-------+
|502|    17975|     F| 50| 4.7| 46|  4.9| 4.2|0.9|2.4|1.4| 0.5|24.0|       N|         76|     43|           1.8|     81|
|735|    34221|     M| 26| 4.5| 62|  4.9| 3.7|1.4|1.1|2.1| 0.6|23.0|       N|         67|     20|           3.4|     35|
|420|    47975|     F| 50| 4.7| 46|  4.9| 4.2|0.9|2.4|1.4| 0.5|24.0|       N|         76|     43|           1.8|     81|
|680|    87656|     F| 50| 4.7| 46|  4.9| 4.2|0.9|2.4|1.4| 0.5|24.0|       N|         76|     43|           1.8|     81|
|504|    34223|     M| 33| 7.1| 46|  4.9| 4.9|1.0|0.8|2.0| 0.4|21.0|       N|         88|     14|           6.3|     23|
|634|    34224|     F| 45| 2.3| 

In [10]:
d1new=d1new.drop("ID","No_Pation","Urea","Cr","HbA1c","Chol","TG","HDL","LDL","VLDL")
d1new.show()

+------+---+----+--------+-----------+-------+--------------+-------+
|Gender|Age| BMI|Diabetes|Cholesterol|HDLChol|Chol/HDL ratio|Glucose|
+------+---+----+--------+-----------+-------+--------------+-------+
|     F| 50|24.0|       N|         76|     43|           1.8|     81|
|     M| 26|23.0|       N|         67|     20|           3.4|     35|
|     F| 50|24.0|       N|         76|     43|           1.8|     81|
|     F| 50|24.0|       N|         76|     43|           1.8|     81|
|     M| 33|21.0|       N|         88|     14|           6.3|     23|
|     F| 45|21.0|       N|         52|     18|           2.9|     31|
|     F| 50|24.0|       N|         65|     16|           4.1|     27|
|     M| 48|24.0|       N|         52|     16|           3.3|     27|
|     M| 43|21.0|       N|         68|     43|           1.6|     81|
|     F| 32|24.0|       N|         68|     43|           1.6|     81|
|     F| 31|23.0|       N|         65|     31|           2.1|     57|
|     F| 33|21.0|   

In [11]:
from pyspark.sql.functions import when

d2new=df2.withColumnRenamed("HDL Chol","HDLChol").withColumn("Gender", when(df2.Gender =="male","M") \
      .when(df2.Gender == "female","F") \
      .otherwise(df2.Gender)) \
.withColumn("Diabetes", when(df2.Diabetes =="No diabetes","N") \
      .when(df2.Diabetes == "Diabetes","Y") \
      .otherwise(df2.Diabetes))

In [12]:
d2new=d2new.drop("Patient number","Height","Weight","Systolic BP","Diastolic BP","waist","hip","Waist/hip ratio","_c16","_c17")
d2new.show()

+-----------+-------+-------+--------------+---+------+----+--------+
|Cholesterol|Glucose|HDLChol|Chol/HDL ratio|Age|Gender| BMI|Diabetes|
+-----------+-------+-------+--------------+---+------+----+--------+
|        193|     77|     49|           3.9| 19|     F|22.5|       N|
|        146|     79|     41|           3.6| 19|     F|26.4|       N|
|        217|     75|     54|           4.0| 20|     F|29.3|       N|
|        226|     97|     70|           3.2| 20|     F|19.6|       N|
|        164|     91|     67|           2.4| 20|     F|20.2|       N|
|        170|     69|     64|           2.7| 20|     F|27.6|       N|
|        149|     77|     49|           3.0| 20|     F|21.0|       N|
|        164|     71|     63|           2.6| 20|     M|19.7|       N|
|        230|    112|     64|           3.6| 20|     M|24.9|       N|
|        179|    105|     60|           3.0| 20|     F|35.5|       N|
|        174|    105|    117|           1.5| 20|     M|26.8|       N|
|        193|    106

In [13]:
d1new.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- Diabetes: string (nullable = true)
 |-- Cholesterol: integer (nullable = true)
 |-- HDLChol: integer (nullable = true)
 |-- Chol/HDL ratio: double (nullable = true)
 |-- Glucose: integer (nullable = true)



In [14]:
d2new.printSchema()

root
 |-- Cholesterol: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- HDLChol: integer (nullable = true)
 |-- Chol/HDL ratio: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- BMI: double (nullable = true)
 |-- Diabetes: string (nullable = true)



In [15]:
newdata=d1new.unionByName(d2new)

In [16]:
newdata.show()
newdata.count()

+------+---+----+--------+-----------+-------+--------------+-------+
|Gender|Age| BMI|Diabetes|Cholesterol|HDLChol|Chol/HDL ratio|Glucose|
+------+---+----+--------+-----------+-------+--------------+-------+
|     F| 50|24.0|       N|         76|     43|           1.8|     81|
|     M| 26|23.0|       N|         67|     20|           3.4|     35|
|     F| 50|24.0|       N|         76|     43|           1.8|     81|
|     F| 50|24.0|       N|         76|     43|           1.8|     81|
|     M| 33|21.0|       N|         88|     14|           6.3|     23|
|     F| 45|21.0|       N|         52|     18|           2.9|     31|
|     F| 50|24.0|       N|         65|     16|           4.1|     27|
|     M| 48|24.0|       N|         52|     16|           3.3|     27|
|     M| 43|21.0|       N|         68|     43|           1.6|     81|
|     F| 32|24.0|       N|         68|     43|           1.6|     81|
|     F| 31|23.0|       N|         65|     31|           2.1|     57|
|     F| 33|21.0|   

1390

In [17]:
newdata.select('Diabetes').distinct().collect()


[Row(Diabetes='Y'), Row(Diabetes='N'), Row(Diabetes='P')]

In [18]:
newdata.select('Gender').distinct().collect()

[Row(Gender='F'), Row(Gender='M')]

In [19]:
newdata=newdata.where(newdata.Diabetes != 'P')
newdata.show()

+------+---+----+--------+-----------+-------+--------------+-------+
|Gender|Age| BMI|Diabetes|Cholesterol|HDLChol|Chol/HDL ratio|Glucose|
+------+---+----+--------+-----------+-------+--------------+-------+
|     F| 50|24.0|       N|         76|     43|           1.8|     81|
|     M| 26|23.0|       N|         67|     20|           3.4|     35|
|     F| 50|24.0|       N|         76|     43|           1.8|     81|
|     F| 50|24.0|       N|         76|     43|           1.8|     81|
|     M| 33|21.0|       N|         88|     14|           6.3|     23|
|     F| 45|21.0|       N|         52|     18|           2.9|     31|
|     F| 50|24.0|       N|         65|     16|           4.1|     27|
|     M| 48|24.0|       N|         52|     16|           3.3|     27|
|     M| 43|21.0|       N|         68|     43|           1.6|     81|
|     F| 32|24.0|       N|         68|     43|           1.6|     81|
|     F| 31|23.0|       N|         65|     31|           2.1|     57|
|     F| 33|21.0|   

In [20]:
newdata.select('Diabetes').distinct().collect()

[Row(Diabetes='Y'), Row(Diabetes='N')]

# Dumping the Data into MongoDB

In [21]:
newdata.write.format("mongo").mode("overwrite").option("database",
"project").option("collection", "diabetes").save()

# Reading the Data from MongoDB

In [22]:
import pymongo
client = pymongo.MongoClient("mongodb://localhost:27017/")
 
# Database Name
db = client["project"]
 
# Collection Name
diabetes=db['diabetes']

data = pd.DataFrame(list(diabetes.find({},{'_id': 0})))

In [23]:
data.head()

Unnamed: 0,Gender,Age,BMI,Diabetes,Cholesterol,HDLChol,Chol/HDL ratio,Glucose
0,F,19,22.5,N,193,49,3.9,77
1,F,19,26.4,N,146,41,3.6,79
2,F,20,29.3,N,217,54,4.0,75
3,F,20,19.6,N,226,70,3.2,97
4,F,20,20.2,N,164,67,2.4,91


# Checking the Null Values:

In [24]:
chcekNullValues = {col:df1.filter(df1[col].isNull()).count() for col in df1.columns}
print(chcekNullValues)

{'ID': 0, 'No_Pation': 0, 'Gender': 0, 'AGE': 0, 'Urea': 0, 'Cr': 0, 'HbA1c': 0, 'Chol': 0, 'TG': 0, 'HDL': 0, 'LDL': 0, 'VLDL': 0, 'BMI': 0, 'CLASS': 0}


In [25]:
chcekNullValues = {col:df2.filter(df2[col].isNull()).count() for col in df2.columns}
print(chcekNullValues)

{'Patient number': 0, 'Cholesterol': 0, 'Glucose': 0, 'HDL Chol': 0, 'Chol/HDL ratio': 0, 'Age': 0, 'Gender': 0, 'Height': 0, 'Weight': 0, 'BMI': 0, 'Systolic BP': 0, 'Diastolic BP': 0, 'waist': 0, 'hip': 0, 'Waist/hip ratio': 0, 'Diabetes': 0, '_c16': 389, '_c17': 389}


In [26]:
data['Diabetes'].unique()
dia = data.groupby('Diabetes').count()
print(dia)

          Gender  Age  BMI  Cholesterol  HDLChol  Chol/HDL ratio  Glucose
Diabetes                                                                 
N            433  433  433          433      433             433      433
Y            904  904  904          904      904             904      904


# Model Development and Prediction

In [27]:
#Importing dataset
import numpy as np
import matplotlib.pyplot as plt  
import pandas as pd
import seaborn as sns

from warnings import filterwarnings
filterwarnings(action='ignore')
sns.set()

from mlxtend.plotting import plot_decision_regions
import missingno as msno
from pandas.plotting import scatter_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn import svm
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## Label Encoding

In [28]:
label_encoder = preprocessing.LabelEncoder()
data['Gender']= label_encoder.fit_transform(data['Gender'])
data['Gender'].unique()

array([0, 1])

In [29]:
label_encoder = preprocessing.LabelEncoder()
data['Diabetes']= label_encoder.fit_transform(data['Diabetes'])
data['Diabetes'].unique()

dia_distinct = data.groupby('Diabetes').count()
print(dia_distinct)

          Gender  Age  BMI  Cholesterol  HDLChol  Chol/HDL ratio  Glucose
Diabetes                                                                 
0            433  433  433          433      433             433      433
1            904  904  904          904      904             904      904


In [30]:
data.head()

Unnamed: 0,Gender,Age,BMI,Diabetes,Cholesterol,HDLChol,Chol/HDL ratio,Glucose
0,0,19,22.5,0,193,49,3.9,77
1,0,19,26.4,0,146,41,3.6,79
2,0,20,29.3,0,217,54,4.0,75
3,0,20,19.6,0,226,70,3.2,97
4,0,20,20.2,0,164,67,2.4,91


## Selecting features

In [31]:
X = data.drop('Diabetes',axis=1)
y = data['Diabetes']

## Standardization

In [32]:
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X

array([[-1.03424342, -2.77799959, -1.29653982, ...,  1.00011767,
        -0.36306815,  0.39033963],
       [-1.03424342, -2.77799959, -0.58110162, ...,  0.57760173,
        -0.52260653,  0.43269348],
       [-1.03424342, -2.6937261 , -0.04910911, ...,  1.26419013,
        -0.30988869,  0.34798579],
       ...,
       [ 0.96689037, -1.85099122, -0.39765593, ..., -0.42587363,
        -0.62896545, -0.41438344],
       [ 0.96689037, -1.17680331,  2.00548265, ..., -0.05617218,
        -0.68214491, -0.11790652],
       [ 0.96689037,  0.1715725 ,  0.62963995, ..., -0.53150262,
        -0.62896545, -0.49909113]])

## Spliting Data

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=2022,stratify=y)

# Logistic Regression

In [34]:
model = LogisticRegression()
model.fit(X_train,y_train)
print("The Model Intercept : ",model.intercept_)
print("The Model Coefficient : ",model.coef_)
y_pred_prob = model.predict_proba(X_test)[:,1]
print("The ROC AUC Score : ",roc_auc_score(y_test, y_pred_prob))

The Model Intercept :  [-8.53391995]
The Model Coefficient :  [[ 0.01128451  0.08099286  0.18680962 -0.05285602  0.02889522  0.98594317
   0.0191352 ]]
The ROC AUC Score :  0.9309954751131222


In [35]:
model = LogisticRegression()
kfold = StratifiedKFold(n_splits=5,random_state=2022,shuffle=True)
results = cross_val_score(model, X,y,cv=kfold,scoring='roc_auc')
print("The Mean : ",results.mean())

The Mean :  0.9372656421657117


# K Nearest Neighbour 

In [36]:
params = np.arange(1,15,2)
score = []
for i in params:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    y_pred_prob = knn.predict_proba(X_test)[:,1]
    score.append(roc_auc_score(y_test,y_pred_prob))
    
i_max = np.argmax(score)    
print("Best n_neighbors = ", params[i_max])
print("Best Score = ",score[i_max])

Best n_neighbors =  11
Best Score =  0.9658795248868778


# DecisionTree Classifier

In [37]:
DecisionTredtree = eClassifier(random_state=2022,max_depth=3)
dtree.fit(X_train,y_train)

# Plotting a tree
plt.figure(figsize=(35,25))
plot_tree(dtree,feature_names=X_train.columns,
               class_names=['P','N'],
               filled=True,fontsize=10) 

y_pred = dtree.predict(X_test)
y_pred_prob = dtree.predict_proba(X_test)[:,1]
print("The ROC AUC Score : ",roc_auc_score(y_test,y_pred_prob))

NameError: name 'eClassifier' is not defined

### ROC Curve

In [None]:
y_pred_prob = dtree.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_prob)
auc = roc_auc_score(y_test, y_pred_prob)
plt.plot(fpr,tpr,label="Data, auc="+str(auc))
plt.legend(loc=4)
plt.show()

In [None]:
#with grid search Cv

from sklearn.model_selection import StratifiedKFold, GridSearchCV
params = {'max_depth':[None,7,3],
          'min_samples_split':[2,5,10],
          'min_samples_leaf':[1,5,10]}

dtree = DecisionTreeClassifier(random_state=2022)
kfold = StratifiedKFold(n_splits=5,random_state=2022,shuffle=True)
gcv = GridSearchCV(dtree,param_grid=params,scoring='roc_auc',cv=kfold)
gcv.fit(X,y)

pd_gcv = pd.DataFrame(gcv.cv_results_)
print("The Best parameters : ",gcv.best_params_)
print("The Best score : ",gcv.best_score_)

## Visualizing Correlation using Heatmap

In [None]:
plt.figure(figsize=(12,10))
# seaborn has an easy method to showcase heatmap
p = sns.heatmap(data.corr(), annot=True,cmap ='RdYlGn')

### Confusion Metrics

In [None]:
cnf_metric=confusion_matrix(y_test, y_pred)
print("The Confusion Metrix is as follows : ")
print(cnf_metric)

### Classification Report

In [None]:
print("The Classification Report is as follows : ")
print(metrics.classification_report(y_test,y_pred))

# SVC

In [None]:
svm = SVC(kernel="linear",probability=True)
svm.fit(X_train,y_train)
y_pred_prob = svm.predict_proba(X_test)[:,1]
print("The ROC AUC Score : ",roc_auc_score(y_test, y_pred_prob))

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state=2022)
params = {'learning_rate':[0.001, 0.01, 0.2, 0.3, 0.5, 0.6],
          'n_estimators':[50,100,150],
          'max_depth':[2,3,4,5]}
kfold = StratifiedKFold(n_splits=5,shuffle=True,random_state=2022)
gcv = GridSearchCV(clf,param_grid=params,cv=kfold, scoring='roc_auc')
gcv.fit(X,y)

pd_cv = pd.DataFrame(gcv.cv_results_)
print("The Best Parameters : ",gcv.best_params_)
print("The Best Score : ",gcv.best_score_)

best_model = gcv.best_estimator_
importances=best_model.feature_importances_
######### Variable Importance Plot ################
plt.bar(X.columns,importances)
plt.title("Feature Importance Plot")
plt.xticks(rotation=90)
plt.show()

# XGBClassifier Algorithm

In [None]:
xg_clf = XGBClassifier(random_state=2022)
params = {'learning_rate':[0.001, 0.01, 0.2, 0.3, 0.5, 0.6],
          'n_estimators':[50,100,150],
          'max_depth':[2,3,4,5]}
kfold = StratifiedKFold(n_splits=5,shuffle=True,random_state=2022)
gcv = GridSearchCV(xg_clf,param_grid=params,cv=kfold, scoring='roc_auc')
gcv.fit(X,y)

pd_cv = pd.DataFrame(gcv.cv_results_)
print("The Best Parameters are : ",gcv.best_params_)
print("The Best score is : ",gcv.best_score_)

best_model = gcv.best_estimator_
importances=best_model.feature_importances_
######### Variable Importance Plot ################
plt.bar(X.columns,importances)
plt.title("Feature Importance Plot")
plt.xticks(rotation=90)
plt.show()

In [None]:
xg = XGBClassifier(random_state=2022)
xg.fit(X,y)

newdata={0:[0,85,45.0,120,59,3.9,78]}
d=pd.DataFrame(newdata)
d=d.T


res=xg.predict(d)
if res==1:
    print("Diabetec")
else:
    print("Non Diabetec")

In [None]:
rf =  RandomForestClassifier(random_state=2022)
rf.fit(X,y)

newdata={0:[0,85,45.0,120,59,3.9,78]}
d=pd.DataFrame(newdata)
d=d.T


res=rf.predict(d)
if res==1:
    print("Diabetec")
else:
    print("No Diabetes")

# Random Forest

In [None]:
clf = RandomForestClassifier(random_state=2022)
clf.fit(X_train,y_train)
y_pred_prob = clf.predict_proba(X_test)[:,1]
print("The ROC AUC Score : ",roc_auc_score(y_test,y_pred_prob))

In [None]:
#################### Grid Search CV ##########################
params = {'max_features':[2,3,4,5,6,7,8]}
clf = RandomForestClassifier(random_state=2022)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
gcv = GridSearchCV(clf,param_grid=params,cv=kfold,scoring='roc_auc')
gcv.fit(X,y)
pd_cv = pd.DataFrame(gcv.cv_results_)
print("The Best Parameters : ",gcv.best_params_)
print("The Best Score : ",gcv.best_score_)

best_model = gcv.best_estimator_
importances=best_model.feature_importances_
######### Variable Importance Plot ################
plt.bar(X.columns,importances)
plt.title("Feature Importance Plot")
plt.xticks(rotation=90)
plt.show()

# Testing the code

In [None]:
xg = XGBClassifier(random_state=2022)
xg.fit(X,y)

newdata={0:[0,85,45.0,120,59,3.9,78]}
d=pd.DataFrame(newdata)
d=d.T


res=xg.predict(d)
if res==1:
    print("Diabetec")
else:
    print("Non Diabetec")

# Flask Code

In [None]:
import pickle

pickle.dump(xg,open("xgmodel.pkl","wb"))
loaded_model= pickle.load(open("xgmodel.pkl", "rb"))
loaded_model.predict(X_test)
loaded_model.score(X_test,y_test)