In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, classification_report, roc_curve, auc
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, plot_roc_curve, plot_precision_recall_curve, plot_confusion_matrix


%matplotlib inline

In [2]:
# Load the diabetes dataset from a CSV file in the working directory.
df = pd.read_csv('diabetes.csv')

In [3]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
# Per-column summary statistics, rounded to two decimals for readability.
# NOTE(review): the minimum of 0 reported for Glucose, BloodPressure,
# SkinThickness, Insulin and BMI presumably encodes missing measurements
# rather than real readings -- confirm before modelling.
df.describe().round(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.85,120.89,69.11,20.54,79.8,31.99,0.47,33.24,0.35
std,3.37,31.97,19.36,15.95,115.24,7.88,0.33,11.76,0.48
min,0.0,0.0,0.0,0.0,0.0,0.0,0.08,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.37,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.63,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
# Class balance of the target column.
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [6]:
# Feature matrix: every column except the target. The target is dropped by
# name (not by position) so a change in column order cannot leak 'Outcome'
# into the features.
x = df.drop(columns='Outcome')

In [7]:
# Target vector: the 'Outcome' column.
y = df['Outcome']

# Logistic Regression on all features

In [8]:
# Hold out 20% of the rows for testing; the seed is fixed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=123,
)

In [9]:
# The liblinear solver is chosen because the dataset is small.
model = LogisticRegression(solver='liblinear')

In [10]:
# Fit the logistic regression on the training split only.
model.fit(x_train,y_train)

LogisticRegression(solver='liblinear')

In [11]:
# Precision / recall / F1 on the training split.
train_pred = model.predict(x_train)
print(classification_report(y_train, train_pred, target_names=['negative', 'positive']))

              precision    recall  f1-score   support

    negative       0.78      0.90      0.84       404
    positive       0.73      0.51      0.60       210

    accuracy                           0.77       614
   macro avg       0.76      0.71      0.72       614
weighted avg       0.76      0.77      0.76       614



In [12]:
# Precision / recall / F1 on the held-out test split.
test_pred = model.predict(x_test)
print(classification_report(y_test, test_pred, target_names=['negative', 'positive']))

              precision    recall  f1-score   support

    negative       0.81      0.94      0.87        96
    positive       0.86      0.64      0.73        58

    accuracy                           0.82       154
   macro avg       0.84      0.79      0.80       154
weighted avg       0.83      0.82      0.82       154



# Select important features

**Univariate Selection**

In [13]:
# Score every feature against the target with the chi-squared statistic.
# k='all' keeps all features, so the selector is used only for its scores.
# NOTE(review): chi2 scores on raw continuous columns appear to scale with
# the column's magnitude -- interpret the resulting ranking with care.
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(x, y)

In [14]:
# chi2 score per feature, in the same order as x.columns.
fit.scores_

array([ 111.51969064, 1411.88704064,   17.60537322,   53.10803984,
       2175.56527292,  127.66934333,    5.39268155,  181.30368904])

In [15]:
# Feature names, to match against the positional scores.
x.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [16]:
# Pair each feature name with its chi2 score.
# The frame is built from a dict of columns instead of transposing a list of
# rows: the transpose mixes strings and floats in every column and leaves
# 'Score' as object dtype, which breaks numeric operations on it.
result = pd.DataFrame({'specs': x.columns, 'Score': fit.scores_})

In [17]:
# Rank features from highest to lowest chi2 score.
result.sort_values(by='Score', ascending=False)

Unnamed: 0,specs,Score
4,Insulin,2175.565273
1,Glucose,1411.887041
7,Age,181.303689
5,BMI,127.669343
0,Pregnancies,111.519691
3,SkinThickness,53.10804
2,BloodPressure,17.605373
6,DiabetesPedigreeFunction,5.392682


**Feature importance (tree-based)**

In [18]:
from sklearn.ensemble import ExtraTreesClassifier

In [19]:
# Fit an extra-trees ensemble on all features to obtain feature importances.
# random_state is fixed (same seed as the train/test split) because the
# ensemble is randomised: without it the importances, and possibly the
# feature ranking, change on every run.
etc = ExtraTreesClassifier(random_state=123)
etc.fit(x, y)

ExtraTreesClassifier()

In [20]:
# Importance per feature from the fitted ensemble, in the same order as x.columns.
etc.feature_importances_

array([0.11157175, 0.23594933, 0.09969197, 0.08008522, 0.07254068,
       0.13697507, 0.11790014, 0.14528584])

In [21]:
# Feature names, to match against the positional importances.
x.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [22]:
# Pair each feature name with its extra-trees importance.
# The frame is built from a dict of columns instead of transposing a list of
# rows: the transpose mixes strings and floats in every column and leaves
# 'Score' as object dtype, which breaks numeric operations on it.
result2 = pd.DataFrame({'specs': x.columns, 'Score': etc.feature_importances_})

In [23]:
# Rank features from most to least important.
result2.sort_values(by='Score', ascending=False)

Unnamed: 0,specs,Score
1,Glucose,0.235949
7,Age,0.145286
5,BMI,0.136975
6,DiabetesPedigreeFunction,0.1179
0,Pregnancies,0.111572
2,BloodPressure,0.099692
3,SkinThickness,0.080085
4,Insulin,0.072541
