In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os

#Import models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [2]:
file_path =  'diabetes_data_upload.csv'
#voice = pd.read_csv(os.path.join('..', 'Resources', 'voice.csv'))
voice = pd.read_csv(file_path)
voice.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


In [3]:
# Assign X (data) and y (target)
X = voice.drop("class", axis=1)
y = voice["class"]
print(X.shape, y.shape)



(520, 16) (520,)


Split our data into training and testing

In [4]:
## Create dataframe 
data = {
    'lr_precision': [],
    'gnb_precision': [],
    'svc_precision': [],
    'rfc_precision': [],
    'lr_f1_score': [],
    'gnb_f1_score': [],
    'svc_f1_score': [],
    'rfc_f1_score': [],
    'lr_recall_score': [],
    'gnb_recall_score': [],
    'svc_recall_score': [],
    'rfc_recall_score': []
}

#[lr_precision, gnb_precision, svc_precision, rfc_precision, lr_f1_score, gnb_f1_score, svc_f1_score, rfc_f1_score, lr_recall_score, gnb_recall_score, svc_recall_score, rfc_recall_score]
outcomes = pd.DataFrame(data)
outcomes


Unnamed: 0,lr_precision,gnb_precision,svc_precision,rfc_precision,lr_f1_score,gnb_f1_score,svc_f1_score,rfc_f1_score,lr_recall_score,gnb_recall_score,svc_recall_score,rfc_recall_score


In [5]:
states = [0,1,2,3,4,5,6,7,8,9,11]

In [6]:
for x in states:
    #training datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=x)

    #create classifiers
    lr = LogisticRegression(max_iter=600)
    gnb = GaussianNB()
    svc = LinearSVC(C=1.0, max_iter=10000, dual = True)
    rfc = RandomForestClassifier()

    #train
    lr.fit(X_train, y_train)
    gnb.fit(X_train, y_train)
    svc.fit(X_train, y_train)
    rfc.fit(X_train, y_train)
    
    #defining predictions
    lr_y_pred= lr.predict(X_test)
    gnb_y_pred= gnb.predict(X_test)
    svc_y_pred= svc.predict(X_test)
    rfc_y_pred= rfc.predict(X_test)

    #getting metrics
    lr_precision = metrics.precision_score(y_test, lr_y_pred)
    gnb_precision = metrics.precision_score(y_test, gnb_y_pred)
    svc_precision = metrics.precision_score(y_test, svc_y_pred)
    rfc_precision = metrics.precision_score(y_test, rfc_y_pred)

    lr_f1_score = metrics.f1_score(y_test, lr_y_pred)
    gnb_f1_score = metrics.f1_score(y_test, gnb_y_pred)
    svc_f1_score = metrics.f1_score(y_test, svc_y_pred)
    rfc_f1_score = metrics.f1_score(y_test, rfc_y_pred)

    lr_recall_score = metrics.recall_score(y_test, lr_y_pred)
    gnb_recall_score = metrics.recall_score(y_test, gnb_y_pred)
    svc_recall_score = metrics.recall_score(y_test, svc_y_pred)
    rfc_recall_score = metrics.recall_score(y_test, rfc_y_pred)

    #create array
    append_data = {
        'lr_precision': lr_precision,
        'gnb_precision': gnb_precision,
        'svc_precision': svc_precision,
        'rfc_precision': rfc_precision,
        'lr_f1_score': lr_f1_score,
        'gnb_f1_score': gnb_f1_score,
        'svc_f1_score': svc_f1_score,
        'rfc_f1_score': rfc_f1_score,
        'lr_recall_score': lr_recall_score,
        'gnb_recall_score': gnb_recall_score,
        'svc_recall_score': svc_recall_score,
        'rfc_recall_score': rfc_recall_score
    }

    #APPEND DATA
    outcomes = outcomes.append(append_data, ignore_index = True)




In [7]:
outcomes

Unnamed: 0,lr_precision,gnb_precision,svc_precision,rfc_precision,lr_f1_score,gnb_f1_score,svc_f1_score,rfc_f1_score,lr_recall_score,gnb_recall_score,svc_recall_score,rfc_recall_score
0,0.922078,0.910256,0.934211,1.0,0.940397,0.934211,0.946667,0.993197,0.959459,0.959459,0.959459,0.986486
1,0.962963,0.929412,0.962963,0.977011,0.939759,0.929412,0.939759,0.988372,0.917647,0.929412,0.917647,1.0
2,0.946667,0.865854,0.948052,1.0,0.922078,0.881988,0.935897,0.993631,0.898734,0.898734,0.924051,0.987342
3,0.936709,0.935897,0.960526,0.987013,0.948718,0.941935,0.954248,0.987013,0.961039,0.948052,0.948052,0.987013
4,0.970588,0.842857,0.890244,0.986301,0.916667,0.808219,0.924051,0.966443,0.868421,0.776316,0.960526,0.947368
5,0.914634,0.935897,0.903614,0.974684,0.943396,0.941935,0.9375,0.987179,0.974026,0.948052,0.974026,1.0
6,0.959459,0.945205,0.959459,0.986667,0.959459,0.938776,0.959459,0.993289,0.959459,0.932432,0.959459,1.0
7,0.930556,0.857143,0.930556,0.973333,0.905405,0.862745,0.905405,0.966887,0.881579,0.868421,0.881579,0.960526
8,0.973684,0.935065,0.903614,1.0,0.954839,0.923077,0.925926,0.980645,0.936709,0.911392,0.949367,0.962025
9,0.975309,0.939759,0.97619,1.0,0.929412,0.906977,0.947977,0.99435,0.88764,0.876404,0.921348,0.988764


In [8]:
data = {
    'Precision': [outcomes.lr_precision.mean(),outcomes.gnb_precision.mean(),outcomes.svc_precision.mean(),outcomes.rfc_precision.mean()],
    'F1 Score': [outcomes.lr_f1_score.mean(),outcomes.gnb_f1_score.mean(),outcomes.svc_f1_score.mean(),outcomes.rfc_f1_score.mean()],
    'Recall Score': [outcomes.lr_recall_score.mean(),outcomes.gnb_recall_score.mean(),outcomes.svc_recall_score.mean(),outcomes.rfc_recall_score.mean()], 
}

#[lr_precision, gnb_precision, svc_precision, rfc_precision, lr_f1_score, gnb_f1_score, svc_f1_score, rfc_f1_score, lr_recall_score, gnb_recall_score, svc_recall_score, rfc_recall_score]
results = pd.DataFrame(data, index = ['Linear Regression', 'Gaussian NB', 'Linear SVC', 'Random Forest Classifier'])
results


Unnamed: 0,Precision,F1 Score,Recall Score
Linear Regression,0.947225,0.937285,0.929007
Gaussian NB,0.908083,0.908048,0.908854
Linear SVC,0.935941,0.938208,0.94155
Random Forest Classifier,0.989546,0.985869,0.982428


In [15]:
input_data = [[40,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]

print(lr.predict(input_data))
print(gnb.predict(input_data))
print(svc.predict(input_data))
print(rfc.predict(input_data))

[1]
[0]
[1]
[1]


In [10]:
# print(f"Training Data Score: {classifier.score(X_train, y_train)}")
# print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

print(f'lr: {lr.score(X_test, y_test)}')
print(f'gnb: {gnb.score(X_test, y_test)}')
print(f'svc: {svc.score(X_test, y_test)}')
print(f'rfc: {rfc.score(X_test, y_test)}')

lr: 0.9384615384615385
gnb: 0.9
svc: 0.9307692307692308
rfc: 0.9923076923076923


In [11]:
print('Linear Regression')
print(classification_report(y_test, lr_y_pred))
print('Gaussian NB')
print(classification_report(y_test, gnb_y_pred))
print('Linear SVC')
print(classification_report(y_test, svc_y_pred))
print('Random Forrest Classifier')
print(classification_report(y_test, rfc_y_pred))


Linear Regression
              precision    recall  f1-score   support

           0       0.96      0.88      0.92        52
           1       0.93      0.97      0.95        78

    accuracy                           0.94       130
   macro avg       0.94      0.93      0.94       130
weighted avg       0.94      0.94      0.94       130

Gaussian NB
              precision    recall  f1-score   support

           0       0.91      0.83      0.87        52
           1       0.89      0.95      0.92        78

    accuracy                           0.90       130
   macro avg       0.90      0.89      0.89       130
weighted avg       0.90      0.90      0.90       130

Linear SVC
              precision    recall  f1-score   support

           0       0.94      0.88      0.91        52
           1       0.93      0.96      0.94        78

    accuracy                           0.93       130
   macro avg       0.93      0.92      0.93       130
weighted avg       0.93      0.93

In [12]:
# data2 = [lr_precision, gnb_precision, svc_precision, rfc_precision, lr_f1_score, gnb_f1_score, svc_f1_score, rfc_f1_score, lr_recall_score, gnb_recall_score, svc_recall_score, rfc_recall_score]

In [None]:
result = rfc.predict(input_data)

In [12]:
rfc.feature_importances_

array([0.10780428, 0.10258479, 0.22881684, 0.17216671, 0.05183445,
       0.02054851, 0.02887071, 0.01854381, 0.02585029, 0.03122613,
       0.04576038, 0.03248872, 0.04774957, 0.02526447, 0.04395245,
       0.0165379 ])

In [13]:
voice.columns

Index(['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss',
       'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring',
       'Itching', 'Irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'Alopecia', 'Obesity', 'class'],
      dtype='object')