In [48]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os

#Import models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [54]:
file_path =  'diabetes_data_upload.csv'
#voice = pd.read_csv(os.path.join('..', 'Resources', 'voice.csv'))
voice = pd.read_csv(file_path)
voice.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


In [55]:
# Assign X (data) and y (target)
X = voice.drop("class", axis=1)
y = voice["class"]
print(X.shape, y.shape)



(520, 16) (520,)


Split our data into training and testing

In [56]:
## Create dataframe 
data = {
    'lr_precision': [],
    'gnb_precision': [],
    'svc_precision': [],
    'rfc_precision': [],
    'lr_f1_score': [],
    'gnb_f1_score': [],
    'svc_f1_score': [],
    'rfc_f1_score': [],
    'lr_recall_score': [],
    'gnb_recall_score': [],
    'svc_recall_score': [],
    'rfc_recall_score': []
}

#[lr_precision, gnb_precision, svc_precision, rfc_precision, lr_f1_score, gnb_f1_score, svc_f1_score, rfc_f1_score, lr_recall_score, gnb_recall_score, svc_recall_score, rfc_recall_score]
outcomes = pd.DataFrame(data)
outcomes


Unnamed: 0,lr_precision,gnb_precision,svc_precision,rfc_precision,lr_f1_score,gnb_f1_score,svc_f1_score,rfc_f1_score,lr_recall_score,gnb_recall_score,svc_recall_score,rfc_recall_score


In [57]:
states = [0,1,2,3,4,5,6,7,8,9,10]

In [58]:
for x in states:
    #training datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=x)

    #create classifiers
    lr = LogisticRegression(max_iter=600)
    gnb = GaussianNB()
    svc = LinearSVC(C=1.0, max_iter=10000, dual = True)
    rfc = RandomForestClassifier()

    #train
    lr.fit(X_train, y_train)
    gnb.fit(X_train, y_train)
    svc.fit(X_train, y_train)
    rfc.fit(X_train, y_train)
    #defining predictions
    lr_y_pred= lr.predict(X_test)
    gnb_y_pred= gnb.predict(X_test)
    svc_y_pred= svc.predict(X_test)
    rfc_y_pred= rfc.predict(X_test)

    #getting metrics
    lr_precision = metrics.precision_score(y_test, lr_y_pred)
    gnb_precision = metrics.precision_score(y_test, gnb_y_pred)
    svc_precision = metrics.precision_score(y_test, svc_y_pred)
    rfc_precision = metrics.precision_score(y_test, rfc_y_pred)

    lr_f1_score = metrics.f1_score(y_test, lr_y_pred)
    gnb_f1_score = metrics.f1_score(y_test, gnb_y_pred)
    svc_f1_score = metrics.f1_score(y_test, svc_y_pred)
    rfc_f1_score = metrics.f1_score(y_test, rfc_y_pred)

    lr_recall_score = metrics.recall_score(y_test, lr_y_pred)
    gnb_recall_score = metrics.recall_score(y_test, gnb_y_pred)
    svc_recall_score = metrics.recall_score(y_test, svc_y_pred)
    rfc_recall_score = metrics.recall_score(y_test, rfc_y_pred)

    #create array
    append_data = {
        'lr_precision': lr_precision,
        'gnb_precision': gnb_precision,
        'svc_precision': svc_precision,
        'rfc_precision': rfc_precision,
        'lr_f1_score': lr_f1_score,
        'gnb_f1_score': gnb_f1_score,
        'svc_f1_score': svc_f1_score,
        'rfc_f1_score': rfc_f1_score,
        'lr_recall_score': lr_recall_score,
        'gnb_recall_score': gnb_recall_score,
        'svc_recall_score': svc_recall_score,
        'rfc_recall_score': rfc_recall_score
    }

    #APPEND DATA
    outcomes = outcomes.append(append_data, ignore_index = True)




In [59]:
outcomes

Unnamed: 0,lr_precision,gnb_precision,svc_precision,rfc_precision,lr_f1_score,gnb_f1_score,svc_f1_score,rfc_f1_score,lr_recall_score,gnb_recall_score,svc_recall_score,rfc_recall_score
0,0.922078,0.910256,0.898734,1.0,0.940397,0.934211,0.928105,0.993197,0.959459,0.959459,0.959459,0.986486
1,0.962963,0.929412,0.974359,0.977011,0.939759,0.929412,0.932515,0.988372,0.917647,0.929412,0.894118,1.0
2,0.946667,0.865854,0.946667,1.0,0.922078,0.881988,0.922078,0.993631,0.898734,0.898734,0.898734,0.987342
3,0.936709,0.935897,0.960526,1.0,0.948718,0.941935,0.954248,1.0,0.961039,0.948052,0.948052,1.0
4,0.970588,0.842857,0.985075,0.986842,0.916667,0.808219,0.923077,0.986842,0.868421,0.776316,0.868421,0.986842
5,0.914634,0.935897,0.903614,0.987179,0.943396,0.941935,0.9375,0.993548,0.974026,0.948052,0.974026,1.0
6,0.959459,0.945205,0.922078,0.973684,0.959459,0.938776,0.940397,0.986667,0.959459,0.932432,0.959459,1.0
7,0.930556,0.857143,0.918919,0.973333,0.905405,0.862745,0.906667,0.966887,0.881579,0.868421,0.894737,0.960526
8,0.973684,0.935065,0.973684,1.0,0.954839,0.923077,0.954839,0.980645,0.936709,0.911392,0.936709,0.962025
9,0.975309,0.939759,0.97619,1.0,0.929412,0.906977,0.947977,1.0,0.88764,0.876404,0.921348,1.0


In [60]:
data = {
    'Precision': [outcomes.lr_precision.mean(),outcomes.gnb_precision.mean(),outcomes.svc_precision.mean(),outcomes.rfc_precision.mean()],
    'F1 Score': [outcomes.lr_f1_score.mean(),outcomes.gnb_f1_score.mean(),outcomes.svc_f1_score.mean(),outcomes.rfc_f1_score.mean()],
    'Recall Score': [outcomes.lr_recall_score.mean(),outcomes.gnb_recall_score.mean(),outcomes.svc_recall_score.mean(),outcomes.rfc_recall_score.mean()], 
}

#[lr_precision, gnb_precision, svc_precision, rfc_precision, lr_f1_score, gnb_f1_score, svc_f1_score, rfc_f1_score, lr_recall_score, gnb_recall_score, svc_recall_score, rfc_recall_score]
results = pd.DataFrame(data, index = ['Linear Regression', 'Gaussian NB', 'Linear SVC', 'Random Forrest Classifier'])
results


Unnamed: 0,Precision,F1 Score,Recall Score
Linear Regression,0.947225,0.937285,0.929007
Gaussian NB,0.908083,0.908048,0.908854
Linear SVC,0.944161,0.935527,0.928782
Random Forrest Classifier,0.990732,0.9888,0.987053


In [47]:
input_data = [[40,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]



print(lr.predict(input_data))
print(gnb.predict(input_data))
print(svc.predict(input_data))
print(rfc.predict(input_data))

[1]
[0]
[1]
[1]


Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity
446,50,0,1,1,1,0,1,0,0,0,0,1,1,0,0,0
390,47,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1
296,48,0,1,1,1,1,1,0,0,0,0,0,1,0,0,0
456,45,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0
24,58,1,1,1,1,1,1,0,1,0,0,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50,36,0,1,1,1,0,1,0,1,0,1,1,1,1,0,0
280,40,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1
34,49,1,1,1,0,1,0,0,1,1,0,0,0,0,0,0
108,25,1,1,1,0,0,1,1,1,1,0,1,0,0,1,0


In [None]:
# print(f"Training Data Score: {classifier.score(X_train, y_train)}")
# print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

print(f'lr: {lr.score(X_test, y_test)}')
print(f'gnb: {gnb.score(X_test, y_test)}')
print(f'svc: {svc.score(X_test, y_test)}')
print(f'rfc: {rfc.score(X_test, y_test)}')

In [None]:
print('Linear Regression')
print(classification_report(y_test, lr_y_pred))
print('Gaussian NB')
print(classification_report(y_test, gnb_y_pred))
print('Linear SVC')
print(classification_report(y_test, svc_y_pred))
print('Random Forrest Classifier')
print(classification_report(y_test, rfc_y_pred))


In [None]:
# data2 = [lr_precision, gnb_precision, svc_precision, rfc_precision, lr_f1_score, gnb_f1_score, svc_f1_score, rfc_f1_score, lr_recall_score, gnb_recall_score, svc_recall_score, rfc_recall_score]