#Data import and Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [15]:
data = pd.read_csv('diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [16]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

*Replacing 0 values*

In [18]:
df1 = data.loc[data['Outcome'] == 1]
df2 = data.loc[data['Outcome'] == 0]
df1 = df1.replace({'BloodPressure':0}, np.median(df1['BloodPressure']))
df1 = df1.replace({'BMI':0}, np.median(df1['BMI']))
df1 = df1.replace({'Glucose':0}, np.median(df1['Glucose']))
df1 = df1.replace({'Insulin':0}, np.median(df1['Insulin']))
df1 = df1.replace({'SkinThickness':0}, np.median(df1['SkinThickness']))
df2 = df2.replace({'BloodPressure':0}, np.median(df2['BloodPressure']))
df2 = df2.replace({'BMI':0}, np.median(df2['BMI']))
df2 = df2.replace({'Glucose':0}, np.median(df2['Glucose']))
df2 = df2.replace({'Insulin':0}, np.median(df2['Insulin']))
df2 = df2.replace({'SkinThickness':0}, np.median(df2['SkinThickness']))


dataframe = [df1, df2]
data = pd.concat(dataframe)

In [19]:
data.head()
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.677083,72.378906,27.43099,91.783854,32.433919,0.471876,33.240885,0.348958
std,3.369578,30.464161,12.104431,9.32146,108.121136,6.880664,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,0.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,21.0,39.0,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,27.0,39.0,32.05,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


*Balancing Minority Class Data*

In [None]:
 data.Outcome.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [23]:
pip install imbalanced-learn



In [22]:
from collections import Counter
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X = data.drop(['Outcome'], axis=1)
y = data['Outcome']
counter = Counter(y)
print(counter)
X, y = oversample.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y)
print(counter)


Counter({0: 500, 1: 268})
Counter({1: 500, 0: 500})




#Running 10 fold CV without any filter

In [29]:
from sklearn.model_selection import train_test_split
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score



X = data.drop(['Outcome'], axis=1)
y = data.Outcome
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.5)
cv = KFold(n_splits=10, random_state=1, shuffle=True)
cv1 = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)


from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


SVM = SVC()
RFC = RandomForestClassifier()
XGB = XGBClassifier()
KNN=KNeighborsClassifier()
NB=GaussianNB()

scores = cross_val_score(SVM, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
#scores1 = cross_val_score(SVM, X, y, scoring='accuracy', cv=cv1, n_jobs=-1)
print('SVM Accuracy 10 fold: %.4f (%.3f)' % (mean(scores)*100, std(scores)))
#print('SVM Accuracy Repeated 10 fold: %.3f (%.3f)' % (mean(scores1), std(scores1)))
scores = cross_val_score(RFC, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
#scores1 = cross_val_score(RFC, X, y, scoring='accuracy', cv=cv1, n_jobs=-1)
print('RFC Accuracy 10 fold: %.4f (%.3f)' % (mean(scores)*100, std(scores)))
#print('RFC Accuracy Repeated 10 fold: %.3f (%.3f)' % (mean(scores1), std(scores1)))
scores = cross_val_score(XGB, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
#scores1 = cross_val_score(XGB, X, y, scoring='accuracy', cv=cv1, n_jobs=-1)
print('XGB Accuracy 10 fold: %.4f (%.3f)' % (mean(scores)*100, std(scores)))
#print('XGB Accuracy Repeated 10 fold: %.3f (%.3f)' % (mean(scores1), std(scores1)))
scores = cross_val_score(KNN, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
#scores1 = cross_val_score(KNN, X, y, scoring='accuracy', cv=cv1, n_jobs=-1)
print('KNN Accuracy 10 fold: %.4f (%.3f)' % (mean(scores)*100, std(scores)))
#print('KNN Accuracy Repeated 10 fold: %.3f (%.3f)' % (mean(scores1), std(scores1)))
scores = cross_val_score(NB, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('Naive Bayes Accuracy 10 fold: %.4f (%.3f)' % (mean(scores)*100, std(scores)))



SVM Accuracy 10 fold: 86.8524 (0.034)
RFC Accuracy 10 fold: 89.0721 (0.028)
XGB Accuracy 10 fold: 89.2037 (0.026)
KNN Accuracy 10 fold: 85.6835 (0.029)
Naive Bayes Accuracy 10 fold: 75.5212 (0.025)


#Filter Assembling 

*CHI-SQUARE*

In [30]:
#Chi-Square Test

import scipy.stats as stats
from scipy.stats import chi2

def chi_sqr(test_column,output_column):
    observed_value = pd.crosstab(test_column,output_column)
    val = stats.chi2_contingency(observed_value)
    expected_value=val[3]
    no_of_rows=len(observed_value.iloc[0:2,0])
    no_of_columns=len(observed_value.iloc[0,0:2])
    dgof=(no_of_rows-1)*(no_of_columns-1)
    alpha=0.05
    for o,e in zip(observed_value.values,expected_value):
        chi_2 = sum([(o-e)**2./e])       
    chi2_stat = np.sum(chi_2)
    return chi2_stat
    
df1 = data.drop(['Outcome'], axis=1)
top_n1 =[]
for i in df1.columns.values:
    top_n1.append(chi_sqr(df1[i],data['Outcome']))
    
col_name1 = np.array(X.columns)
a1 = pd.DataFrame(top_n1)
b1 = pd.DataFrame(col_name1)
info1 = pd.concat([a1,b1], axis=1)
info1.columns = ['Score','Features']

top1 = info1.nsmallest(5,'Score')
ft1 = np.array(top1['Features'])
top1
ft1

array(['BloodPressure', 'Age', 'Pregnancies', 'Glucose', 'SkinThickness'],
      dtype=object)

*ReliefF*

In [32]:
pip install skrebate

Collecting skrebate
  Downloading https://files.pythonhosted.org/packages/d3/8a/969e619753c299b4d3943808ef5f7eb6587d3cb78c93dcbcc3e4ce269f89/skrebate-0.61.tar.gz
Building wheels for collected packages: skrebate
  Building wheel for skrebate (setup.py) ... [?25l[?25hdone
  Created wheel for skrebate: filename=skrebate-0.61-cp36-none-any.whl size=29259 sha256=8dcf288fb6b901314b5e8510548a264ddbcc280c6aaad5eb5ef94421bd947528
  Stored in directory: /root/.cache/pip/wheels/ae/d8/ae/9b51d487e9d02219996d6c260255a216ef07d905b0a0b00ce3
Successfully built skrebate
Installing collected packages: skrebate
Successfully installed skrebate-0.61


In [33]:
from sklearn.pipeline import make_pipeline
from skrebate import ReliefF

features, classes = data.drop('Outcome', axis=1).values, data['Outcome'].values
X_train, X_test, y_train, y_test = train_test_split(features, classes)

arr = X_train.astype('float64')
fs = ReliefF()
fs.fit(arr, y_train)

top_n2=[]
names=[]
for feature_name, feature_score in zip(data.drop('Outcome', axis=1).columns, fs.feature_importances_):
    top_n2.append(feature_score)
    names.append(feature_name)
    
col_name3 = np.array(data.columns)
a3 = pd.DataFrame(top_n2)
b3 = pd.DataFrame(names)

info3 = pd.concat([a3,b3], axis=1)
info3.columns = ['Score','Features'] 

top3 = info3.nlargest(5,'Score')
ft3 = np.array(top3['Features'])
ft3

array(['Glucose', 'Age', 'Insulin', 'SkinThickness', 'Pregnancies'],
      dtype=object)

*Information Gain*

In [38]:
from sklearn.feature_selection import mutual_info_classif

res = mutual_info_classif(X,y,discrete_features=True)

col_name2 = np.array(X.columns)

a2 = pd.DataFrame(res)
b2 = pd.DataFrame(col_name2)
info2 = pd.concat([a2,b2], axis=1)
info2.columns = ['Info_Gain','Features']

top2 = info2.nlargest(5,'Info_Gain')
ft2 = np.array(top2['Features'])
ft2
#array(['Insulin', 'DiabetesPedigreeFunction', 'BMI', 'Glucose','SkinThickness'], dtype=object)


array(['Insulin', 'DiabetesPedigreeFunction', 'BMI', 'Glucose',
       'SkinThickness'], dtype=object)

*Union Function*

In [39]:
from functools import reduce
def top_fs_union(n):
    f1 = ft1[:n]
    f2 = ft2[:n]
    f3 = ft3[:n]
    print (reduce(np.union1d, (f1,f2,f3)))
    return(reduce(np.union1d, (f1,f2,f3)))

*Classifier function call with selected top n*

In [40]:
def assemble_run(n,classifier):
  top_fs=top_fs_union(n)
  top_fs=np.append(top_fs,['Outcome'], axis=0)
  dfs = data.loc[:,top_fs]
  X = dfs.drop(['Outcome'], axis=1)
  y = dfs['Outcome']
  model = classifier()
  scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
  print('Accuracy 10 fold: %.4f (%.3f)' % (mean(scores)*100, std(scores)))


In [45]:
#RANDOM FOREST CLASSIFIER

#Feature Subset by Union of Best Features from each
RFC1 = assemble_run(2,RandomForestClassifier)
RFC2 = assemble_run(4,RandomForestClassifier)
RFC3 = assemble_run(6,RandomForestClassifier)


['Age' 'BloodPressure' 'DiabetesPedigreeFunction' 'Glucose' 'Insulin']
Accuracy 10 fold: 88.4193 (0.036)
['Age' 'BMI' 'BloodPressure' 'DiabetesPedigreeFunction' 'Glucose'
 'Insulin' 'Pregnancies' 'SkinThickness']
Accuracy 10 fold: 89.8530 (0.025)
['Age' 'BMI' 'BloodPressure' 'DiabetesPedigreeFunction' 'Glucose'
 'Insulin' 'Pregnancies' 'SkinThickness']
Accuracy 10 fold: 89.0721 (0.028)


In [46]:
#XG BOOST CLASSIFIER

#Feature Subset by Union of Best Features from each
XGB1 = assemble_run(2,XGBClassifier)
XGB2 = assemble_run(4,XGBClassifier)
XGB3 = assemble_run(6,XGBClassifier)


['Age' 'BloodPressure' 'DiabetesPedigreeFunction' 'Glucose' 'Insulin']
Accuracy 10 fold: 89.5916 (0.030)
['Age' 'BMI' 'BloodPressure' 'DiabetesPedigreeFunction' 'Glucose'
 'Insulin' 'Pregnancies' 'SkinThickness']
Accuracy 10 fold: 89.2037 (0.026)
['Age' 'BMI' 'BloodPressure' 'DiabetesPedigreeFunction' 'Glucose'
 'Insulin' 'Pregnancies' 'SkinThickness']
Accuracy 10 fold: 89.2037 (0.026)


In [50]:
#KNN CLASSIFIER

#Feature Subset by Union of Best Features from each
KNN1 = assemble_run(2,KNeighborsClassifier)
KNN1 = assemble_run(4,KNeighborsClassifier)
KNN1 = assemble_run(6,KNeighborsClassifier)


['Age' 'BloodPressure' 'DiabetesPedigreeFunction' 'Glucose' 'Insulin']
Accuracy 10 fold: 86.5977 (0.027)
['Age' 'BMI' 'BloodPressure' 'DiabetesPedigreeFunction' 'Glucose'
 'Insulin' 'Pregnancies' 'SkinThickness']
Accuracy 10 fold: 85.6835 (0.029)
['Age' 'BMI' 'BloodPressure' 'DiabetesPedigreeFunction' 'Glucose'
 'Insulin' 'Pregnancies' 'SkinThickness']
Accuracy 10 fold: 85.6835 (0.029)


In [53]:
#Naive Bayes CLASSIFIER
#Feature Subset by Union of 5 Best Features from each
NB1 = assemble_run(2,GaussianNB)
#Feature Subset by Union of 8 Best Features from each
NB2 = assemble_run(4,GaussianNB)
#Feature Subset by Union of 10 Best Features from each
NB3 = assemble_run(6,GaussianNB)

['Age' 'BloodPressure' 'DiabetesPedigreeFunction' 'Glucose' 'Insulin']
Accuracy 10 fold: 74.7403 (0.027)
['Age' 'BMI' 'BloodPressure' 'DiabetesPedigreeFunction' 'Glucose'
 'Insulin' 'Pregnancies' 'SkinThickness']
Accuracy 10 fold: 75.5212 (0.025)
['Age' 'BMI' 'BloodPressure' 'DiabetesPedigreeFunction' 'Glucose'
 'Insulin' 'Pregnancies' 'SkinThickness']
Accuracy 10 fold: 75.5212 (0.025)
