In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the dataset from 'diabetes.csv' (path is relative to the working directory)
# and preview the first rows.
data = pd.read_csv('diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Count missing entries per column (isna is the pandas alias of isnull).
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score



# Feature matrix and target shared by the experiments below.
X = data.drop(['Outcome'], axis=1)
y = data.Outcome

# Hold-out split; seeded so that re-running the cell yields the same partition.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)

# Cross-validation schemes (both seeded).
cv = KFold(n_splits=10, random_state=1, shuffle=True)
cv1 = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)


from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

SVM = SVC()
# Random forests are stochastic; fix the seed so the baseline is reproducible.
RFC = RandomForestClassifier(random_state=1)
XGB = XGBClassifier()
KNN = KNeighborsClassifier()

# Baseline: 10-fold accuracy of each classifier on the full feature set.
# The printed mean is scaled to percent; the value in parentheses is the
# unscaled standard deviation of the per-fold accuracies.
for label, model in (('SVM', SVM), ('RFC', RFC), ('XGB', XGB), ('KNN', KNN)):
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('%s Accuracy 10 fold: %.4f (%.3f)' % (label, mean(scores)*100, std(scores)))





SVM Accuracy 10 fold: 65.0991 (0.033)
RFC Accuracy 10 fold: 74.3387 (0.064)
XGB Accuracy 10 fold: 73.5629 (0.051)
KNN Accuracy 10 fold: 71.9959 (0.043)


In [None]:
#Chi-Square Test

import scipy.stats as stats
from scipy.stats import chi2

def chi_sqr(test_column,output_column):
    """Return the Pearson chi-square statistic between two columns.

    A contingency table of ``test_column`` versus ``output_column`` is built
    with ``pd.crosstab``; the statistic is the sum over *every* cell of
    ``(observed - expected)**2 / expected``.

    Parameters
    ----------
    test_column : array-like accepted by ``pd.crosstab``
        Values of the candidate feature (table rows).
    output_column : array-like accepted by ``pd.crosstab``
        Values of the target (table columns).

    Returns
    -------
    numpy floating scalar
        The uncorrected chi-square statistic.
    """
    observed = pd.crosstab(test_column, output_column)
    # Element 3 of chi2_contingency's result holds the expected frequencies
    # under independence; only that table is used here.
    expected = stats.chi2_contingency(observed)[3]
    # Accumulate over the whole table. The previous row-by-row loop rebound
    # its accumulator on each pass, so only the last row was counted.
    return np.sum((observed.values - expected) ** 2. / expected)
    
# Score every predictor column against the target with chi_sqr.
df1 = data.drop(['Outcome'], axis=1)
top_n1 = [chi_sqr(df1[col], data['Outcome']) for col in df1.columns]

# Pair each score with the column it was computed from (taken from df1 itself
# rather than from a frame defined in another cell).
info1 = pd.DataFrame({'Score': top_n1, 'Features': np.array(df1.columns)})

# A larger chi-square statistic indicates a stronger departure from
# independence, so the best features are the ones with the LARGEST scores.
# NOTE(review): the tables behind these scores can have different numbers of
# rows, so raw statistics may not be directly comparable - consider ranking
# by p-value instead; confirm with the analysis owner.
top1 = info1.nlargest(5,'Score')
ft1 = np.array(top1['Features'])
ft1

array(['BloodPressure', 'Age', 'Pregnancies', 'Glucose', 'SkinThickness'],
      dtype=object)

In [None]:
#ReliefF

from sklearn.pipeline import make_pipeline
from skrebate import ReliefF

features, classes = data.drop('Outcome', axis=1).values, data['Outcome'].values
# Seeded split so the ReliefF ranking is the same on every run.
# NOTE: this rebinds y_train / y_test created by the earlier hold-out split.
X_train, X_test, y_train, y_test = train_test_split(features, classes, random_state=1)

# ReliefF is fitted on the training partition only, cast to float64.
arr = X_train.astype('float64')
fs = ReliefF()
fs.fit(arr, y_train)

# One importance score per predictor column, in column order.
names = list(data.drop('Outcome', axis=1).columns)
top_n2 = list(fs.feature_importances_)

info3 = pd.DataFrame({'Score': top_n2, 'Features': names})

# Keep the five highest-scoring features.
top3 = info3.nlargest(5,'Score')
ft3 = np.array(top3['Features'])
ft3

array(['Glucose', 'Age', 'Pregnancies', 'BMI', 'SkinThickness'],
      dtype=object)

In [None]:
#Information Gain/Mutual Information

from sklearn.feature_selection import mutual_info_classif

# Several predictors are real-valued (e.g. BMI, DiabetesPedigreeFunction).
# Declaring every feature discrete would treat each distinct value as its own
# category and inflate the scores of such columns, so the features are passed
# as continuous. That estimator adds small random noise, hence the fixed seed.
res = mutual_info_classif(X, y, discrete_features=False, random_state=1)

# Pair each score with its column name.
info2 = pd.DataFrame({'Info_Gain': res, 'Features': np.array(X.columns)})

# Keep the five features with the highest mutual information.
top2 = info2.nlargest(5,'Info_Gain')
ft2 = np.array(top2['Features'])
ft2

array(['DiabetesPedigreeFunction', 'BMI', 'Glucose', 'Insulin', 'Age'],
      dtype=object)

In [None]:
from functools import reduce
def top_fs_union(n):
    """Return the sorted union of the first ``n`` names of each ranking.

    The rankings are the module-level arrays ``ft1``, ``ft2`` and ``ft3``.

    Parameters
    ----------
    n : int
        Number of leading entries to take from each ranking.

    Returns
    -------
    numpy.ndarray
        Sorted, de-duplicated feature names (as produced by ``np.union1d``).

    Notes
    -----
    Slicing cannot yield more entries than a ranking holds, so asking for
    more than the shortest ranking contains gives the same result as asking
    for exactly that many. A warning is issued in that case instead of
    silently returning an identical subset.
    """
    import warnings

    rankings = (ft1, ft2, ft3)
    available = min(len(r) for r in rankings)
    if n > available:
        warnings.warn(
            'top_fs_union(%d): rankings hold only %d entries; result equals '
            'top_fs_union(%d)' % (n, available, available),
            stacklevel=2,
        )
    return reduce(np.union1d, [r[:n] for r in rankings])

In [None]:
def assemble_run(n,classifier,selector=None):
    """Cross-validate ``classifier`` on a feature subset and return the scores.

    Parameters
    ----------
    n : int
        Rank depth forwarded to the feature selector.
    classifier : callable
        Zero-argument factory (e.g. an estimator class) returning a new model.
    selector : callable, optional
        Function mapping ``n`` to an array of column names. Defaults to
        ``top_fs_union``. Passing it explicitly keeps the call unambiguous
        even after a later cell rebinds the name ``assemble_run``.

    Returns
    -------
    numpy.ndarray
        Per-fold accuracies from ``cross_val_score`` (the mean in percent and
        the standard deviation are also printed).

    Raises
    ------
    ValueError
        If the selector yields no features.
    """
    if selector is None:
        selector = top_fs_union
    top_fs = selector(n)
    if len(top_fs) == 0:
        raise ValueError('feature selection for n=%r produced no features' % (n,))
    X = data.loc[:, top_fs]
    y = data['Outcome']
    model = classifier()
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('Accuracy 10 fold: %.4f (%.3f)' % (mean(scores)*100, std(scores)))
    # Callers bind the result to a name, so hand the scores back.
    return scores


In [None]:
#SUPPORT VECTOR MACHINE

# Union-based feature subsets, requested at rank depths 5, 8 and 10.
SVM1, SVM2, SVM3 = [assemble_run(depth, SVC) for depth in (5, 8, 10)]


Accuracy 10 fold: 65.0991 (0.033)
Accuracy 10 fold: 65.0991 (0.033)
Accuracy 10 fold: 65.0991 (0.033)


In [None]:
#RANDOM FOREST CLASSIFIER

# Union-based feature subsets, requested at rank depths 5, 8 and 10.
RFC1, RFC2, RFC3 = [assemble_run(depth, RandomForestClassifier) for depth in (5, 8, 10)]


Accuracy 10 fold: 74.4771 (0.044)
Accuracy 10 fold: 73.9593 (0.055)
Accuracy 10 fold: 74.3455 (0.061)


In [None]:
#XG BOOST CLASSIFIER

# Union-based feature subsets, requested at rank depths 5, 8 and 10.
XGB1, XGB2, XGB3 = [assemble_run(depth, XGBClassifier) for depth in (5, 8, 10)]


Accuracy 10 fold: 72.6504 (0.054)
Accuracy 10 fold: 72.6504 (0.054)
Accuracy 10 fold: 72.6504 (0.054)


In [None]:
#KNN CLASSIFIER

# Union-based feature subsets at rank depths 5, 8 and 10, matching the other
# classifier cells (previously the n=5 run was repeated three times and each
# result was bound to the same name).
KNN1 = assemble_run(5,KNeighborsClassifier)
KNN2 = assemble_run(8,KNeighborsClassifier)
KNN3 = assemble_run(10,KNeighborsClassifier)


Accuracy 10 fold: 71.9959 (0.043)
Accuracy 10 fold: 71.9959 (0.043)
Accuracy 10 fold: 71.9959 (0.043)


In [None]:
def top_fs_int(n):
    """Return the names present in the first ``n`` entries of every ranking.

    The rankings are the module-level arrays ``ft1``, ``ft2`` and ``ft3``.

    Parameters
    ----------
    n : int
        Number of leading entries to take from each ranking.

    Returns
    -------
    numpy.ndarray
        Sorted feature names common to all three slices (as produced by
        ``np.intersect1d``); may be empty.

    Notes
    -----
    Slicing cannot yield more entries than a ranking holds, so asking for
    more than the shortest ranking contains gives the same result as asking
    for exactly that many. A warning is issued in that case instead of
    silently returning an identical subset.
    """
    import warnings

    rankings = (ft1, ft2, ft3)
    available = min(len(r) for r in rankings)
    if n > available:
        warnings.warn(
            'top_fs_int(%d): rankings hold only %d entries; result equals '
            'top_fs_int(%d)' % (n, available, available),
            stacklevel=2,
        )
    return reduce(np.intersect1d, [r[:n] for r in rankings])

In [None]:
def assemble_run(n,classifier,selector=None):
    """Cross-validate ``classifier`` on a feature subset and return the scores.

    This definition rebinds the name ``assemble_run`` used by earlier cells;
    from here on the default feature subset is the intersection of the
    rankings rather than their union.

    Parameters
    ----------
    n : int
        Rank depth forwarded to the feature selector.
    classifier : callable
        Zero-argument factory (e.g. an estimator class) returning a new model.
    selector : callable, optional
        Function mapping ``n`` to an array of column names. Defaults to
        ``top_fs_int``.

    Returns
    -------
    numpy.ndarray
        Per-fold accuracies from ``cross_val_score`` (the mean in percent and
        the standard deviation are also printed).

    Raises
    ------
    ValueError
        If the selector yields no features (the rankings share none).
    """
    if selector is None:
        selector = top_fs_int
    top_fs = selector(n)
    if len(top_fs) == 0:
        raise ValueError('feature selection for n=%r produced no features' % (n,))
    X = data.loc[:, top_fs]
    y = data['Outcome']
    model = classifier()
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('Accuracy 10 fold: %.4f (%.3f)' % (mean(scores)*100, std(scores)))
    # Callers bind the result to a name, so hand the scores back.
    return scores


In [None]:
#SUPPORT VECTOR MACHINE

# Intersection-based feature subsets, requested at rank depths 5, 8 and 10.
SVM1, SVM2, SVM3 = [assemble_run(depth, SVC) for depth in (5, 8, 10)]


Accuracy 10 fold: 63.2673 (0.047)
Accuracy 10 fold: 63.2673 (0.047)
Accuracy 10 fold: 63.2673 (0.047)


In [None]:
#RANDOM FOREST CLASSIFIER

# Intersection-based feature subsets, requested at rank depths 5, 8 and 10.
RFC1, RFC2, RFC3 = [assemble_run(depth, RandomForestClassifier) for depth in (5, 8, 10)]


Accuracy 10 fold: 68.8807 (0.044)
Accuracy 10 fold: 68.3612 (0.038)
Accuracy 10 fold: 68.7423 (0.046)


In [None]:
#XG BOOST CLASSIFIER

# Intersection-based feature subsets, requested at rank depths 5, 8 and 10.
XGB1, XGB2, XGB3 = [assemble_run(depth, XGBClassifier) for depth in (5, 8, 10)]


Accuracy 10 fold: 69.7915 (0.037)
Accuracy 10 fold: 69.7915 (0.037)
Accuracy 10 fold: 69.7915 (0.037)


In [None]:
#KNN CLASSIFIER

# Intersection-based feature subsets at rank depths 5, 8 and 10, matching the
# other classifier cells (previously the n=5 run was repeated three times and
# each result was bound to the same name).
KNN1 = assemble_run(5,KNeighborsClassifier)
KNN2 = assemble_run(8,KNeighborsClassifier)
KNN3 = assemble_run(10,KNeighborsClassifier)


Accuracy 10 fold: 70.9484 (0.058)
Accuracy 10 fold: 70.9484 (0.058)
Accuracy 10 fold: 70.9484 (0.058)
