## Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

## Process Data

In [2]:
def process_data(path = 'data/diabetes_data.csv'):
    data = pd.read_csv(path)

    features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
    target   = 'Outcome'

    X = data[features]
    y = data[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    return X_train, X_test, y_train, y_test

## Read the Data

In [3]:
X_train, X_test, y_train, y_test = process_data(path = 'data/diabetes_data.csv')

## Import Models

In [4]:
from sklearn.linear_model   import LogisticRegression 
from sklearn.tree           import DecisionTreeClassifier
from sklearn.svm            import LinearSVC
from sklearn.svm            import SVC 
from sklearn.ensemble       import RandomForestClassifier 
from sklearn.ensemble       import GradientBoostingClassifier

## Instantiate the Models

In [5]:
model_lr   = LogisticRegression()
model_dtc  = DecisionTreeClassifier()
model_lsvc = LinearSVC()
model_svc  = SVC()
model_rfc  = RandomForestClassifier()
model_gbc  = GradientBoostingClassifier()

## Train the Model

In [None]:
model_lr.fit(X_train, y_train)
model_dtc.fit(X_train, y_train)
model_lsvc.fit(X_train, y_train)
model_svc.fit(X_train, y_train)
model_rfc.fit(X_train, y_train)
model_gbc.fit(X_train, y_train)

## Save the Models


In [7]:
import joblib 

filename_lr   = 'trained_models/trained_model_lr.joblib'
filename_dtc  = 'trained_models/trained_model_dtc.joblib'
filename_lsvc = 'trained_models/trained_model_lsvc.joblib'
filename_svc  = 'trained_models/trained_model_svc.joblib'
filename_rfc  = 'trained_models/trained_model_rfc.joblib'
filename_gbc  = 'trained_models/trained_model_gbc.joblib'

joblib.dump(model_lr,   filename = 'trained_models/trained_model_lr.joblib') 
joblib.dump(model_dtc,  filename = 'trained_models/trained_model_dtc.joblib') 
joblib.dump(model_lsvc, filename = 'trained_models/trained_model_lsvc.joblib') 
joblib.dump(model_svc,  filename = 'trained_models/trained_model_svc.joblib') 
joblib.dump(model_rfc,  filename = 'trained_models/trained_model_rfc.joblib') 
joblib.dump(model_gbc,  filename = 'trained_models/trained_model_gbc.joblib') 

['trained_models/trained_model_gbc.joblib']

## Check the Accuracy of the Models 

In [8]:
accuracy_lr   = model_lr.score(X_test, y_test)
accuracy_dtc  = model_dtc.score(X_test, y_test)
accuracy_lsvc = model_lsvc.score(X_test, y_test)
accuracy_svc  = model_svc.score(X_test, y_test)
accuracy_rfc  = model_rfc.score(X_test, y_test)
accuracy_gbc  = model_gbc.score(X_test, y_test)

print(f'LogisticRegression          :   {round(accuracy_lr   * 100, 2)} %')
print(f'DecisionTreeClassifier      :   {round(accuracy_dtc  * 100, 2)} %')
print(f'LinearSVC                   :   {round(accuracy_lsvc * 100, 2)} %')
print(f'SVC                         :   {round(accuracy_svc  * 100, 2)} %')
print(f'RandomForestClassifier      :   {round(accuracy_rfc  * 100, 2)} %')
print(f'GradientBoostingClassifier  :   {round(accuracy_gbc  * 100, 2)} %')

LogisticRegression          :   77.27 %
DecisionTreeClassifier      :   66.88 %
LinearSVC                   :   68.83 %
SVC                         :   77.27 %
RandomForestClassifier      :   72.73 %
GradientBoostingClassifier  :   74.68 %


## Load Trained Models

In [9]:
model_lr   = joblib.load('trained_models/trained_model_lr.joblib')
model_dtc  = joblib.load('trained_models/trained_model_dtc.joblib')
model_lsvc = joblib.load('trained_models/trained_model_lsvc.joblib')
model_svc  = joblib.load('trained_models/trained_model_svc.joblib')
model_rfc  = joblib.load('trained_models/trained_model_rfc.joblib')
model_gbc  = joblib.load('trained_models/trained_model_gbc.joblib')

## Create A Dummy Data

In [10]:
def new_data(x1, x2, x3, x4, x5, x6, x7, x8):
    return np.array([[x1, x2, x3, x4, x5, x6, x7, x8]])

data1 = new_data(1, 85, 66, 29, 0, 26.6, 0.351, 31)
data2 = new_data(6,148,72,35,0,33.6,0.627,50)

print(data1)
print(data2)

[[ 1.    85.    66.    29.     0.    26.6    0.351 31.   ]]
[[  6.    148.     72.     35.      0.     33.6     0.627  50.   ]]


## Make Prediction

In [None]:
predict_lr   = model_lr.predict(data1)
predict_dtc  = model_dtc.predict(data1)
predict_lsvc = model_lsvc.predict(data1)
predict_svc  = model_svc.predict(data1)
predict_rfc  = model_rfc.predict(data1)
predict_gbc  = model_gbc.predict(data1)

In [12]:
def result(pred):
    if pred == 0:
        return 'Negative'
    if pred == 1:
        return 'Positive'

## Print the Predicted Results

In [13]:
print(f'LogisticRegression          :   {predict_lr[0]} - {result(predict_lr[0])}')
print(f'DecisionTreeClassifier      :   {predict_dtc[0]} - {result(predict_dtc[0])}')
print(f'LinearSVC                   :   {predict_lsvc[0]} - {result(predict_lsvc[0])}')
print(f'SVC                         :   {predict_svc[0]} - {result(predict_svc[0])}')
print(f'RandomForestClassifier      :   {predict_rfc[0]} - {result(predict_rfc[0])}')
print(f'GradientBoostingClassifier  :   {predict_gbc[0]} - {result(predict_gbc[0])}')

LogisticRegression          :   0 - Negative
DecisionTreeClassifier      :   0 - Negative
LinearSVC                   :   0 - Negative
SVC                         :   0 - Negative
RandomForestClassifier      :   0 - Negative
GradientBoostingClassifier  :   0 - Negative


In [None]:
predict_lr   = model_lr.predict(data2)
predict_dtc  = model_dtc.predict(data2)
predict_lsvc = model_lsvc.predict(data2)
predict_svc  = model_svc.predict(data2)
predict_rfc  = model_rfc.predict(data2)
predict_gbc  = model_gbc.predict(data2)

In [15]:
print(f'LogisticRegression          :   {predict_lr[0]} - {result(predict_lr[0])}')
print(f'DecisionTreeClassifier      :   {predict_dtc[0]} - {result(predict_dtc[0])}')
print(f'LinearSVC                   :   {predict_lsvc[0]} - {result(predict_lsvc[0])}')
print(f'SVC                         :   {predict_svc[0]} - {result(predict_svc[0])}')
print(f'RandomForestClassifier      :   {predict_rfc[0]} - {result(predict_rfc[0])}')
print(f'GradientBoostingClassifier  :   {predict_gbc[0]} - {result(predict_gbc[0])}')

LogisticRegression          :   1 - Positive
DecisionTreeClassifier      :   1 - Positive
LinearSVC                   :   0 - Negative
SVC                         :   1 - Positive
RandomForestClassifier      :   1 - Positive
GradientBoostingClassifier  :   1 - Positive


In [16]:
def process_data(path = 'data/diabetes_data.csv', target='outcome'):

    data = pd.read_csv(path)


    features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
    target   = 'Outcome'

    X = data[features]
    y = data[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    return X_train, X_test, y_train, y_test

## Generalized Preprocessing

In [17]:

def general_preprocess(path, target):
    data = pd.read_csv(path)
    new_columns = []

    for i in data.columns:
        data[i.lower().strip()] = data[i]
        new_columns.append(i.lower().strip())

    new_data = data[new_columns]
    new_data = new_data.drop_duplicates()
    new_data = new_data.dropna()

    target = target.lower().strip()

    X = new_data.drop(target, axis=1)
    y = new_data[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = general_preprocess('data/diabetes_data.csv', "Outcome")



## Instantiate, Train, and Saved Model

In [None]:
import joblib 

def train_and_save_model(X_train, X_test, y_train, y_test):
    model_lr   = LogisticRegression()
    model_dtc  = DecisionTreeClassifier()
    model_lsvc = LinearSVC()
    model_svc  = SVC()
    model_rfc  = RandomForestClassifier()
    model_gbc  = GradientBoostingClassifier()

    model_lr.fit(X_train, y_train)
    model_dtc.fit(X_train, y_train)
    model_lsvc.fit(X_train, y_train)
    model_svc.fit(X_train, y_train)
    model_rfc.fit(X_train, y_train)
    model_gbc.fit(X_train, y_train)

    joblib.dump(model_lr,   filename = 'gen_trained_models/trained_lr.joblib') 
    joblib.dump(model_dtc,  filename = 'gen_trained_models/trained_dtc.joblib') 
    joblib.dump(model_lsvc, filename = 'gen_trained_models/trained_lsvc.joblib') 
    joblib.dump(model_svc,  filename = 'gen_trained_models/trained_svc.joblib') 
    joblib.dump(model_rfc,  filename = 'gen_trained_models/trained_rfc.joblib') 
    joblib.dump(model_gbc,  filename = 'gen_trained_models/trained_gbc.joblib') 

    return 

train_and_save_model(X_train, X_test, y_train, y_test)

## Check the Accuraces

In [19]:
def check_accuracies(X_train, X_test, y_train, y_test):
    accuracy_lr   = model_lr.score(X_test, y_test)
    accuracy_dtc  = model_dtc.score(X_test, y_test)
    accuracy_lsvc = model_lsvc.score(X_test, y_test)
    accuracy_svc  = model_svc.score(X_test, y_test)
    accuracy_rfc  = model_rfc.score(X_test, y_test)
    accuracy_gbc  = model_gbc.score(X_test, y_test)

    print(f'LogisticRegression          :   {round(accuracy_lr   * 100, 2)} %')
    print(f'DecisionTreeClassifier      :   {round(accuracy_dtc  * 100, 2)} %')
    print(f'LinearSVC                   :   {round(accuracy_lsvc * 100, 2)} %')
    print(f'SVC                         :   {round(accuracy_svc  * 100, 2)} %')
    print(f'RandomForestClassifier      :   {round(accuracy_rfc  * 100, 2)} %')
    print(f'GradientBoostingClassifier  :   {round(accuracy_gbc  * 100, 2)} %')

check_accuracies(X_train, X_test, y_train, y_test)

LogisticRegression          :   83.12 %
DecisionTreeClassifier      :   93.51 %
LinearSVC                   :   74.03 %
SVC                         :   80.52 %
RandomForestClassifier      :   93.51 %
GradientBoostingClassifier  :   90.91 %


Feature names unseen at fit time:
- age
- bloodpressure
- bmi
- diabetespedigreefunction
- glucose
- ...
Feature names seen at fit time, yet now missing:
- Age
- BMI
- BloodPressure
- DiabetesPedigreeFunction
- Glucose
- ...

Feature names unseen at fit time:
- age
- bloodpressure
- bmi
- diabetespedigreefunction
- glucose
- ...
Feature names seen at fit time, yet now missing:
- Age
- BMI
- BloodPressure
- DiabetesPedigreeFunction
- Glucose
- ...

Feature names unseen at fit time:
- age
- bloodpressure
- bmi
- diabetespedigreefunction
- glucose
- ...
Feature names seen at fit time, yet now missing:
- Age
- BMI
- BloodPressure
- DiabetesPedigreeFunction
- Glucose
- ...

Feature names unseen at fit time:
- age
- bloodpressure
- bmi
- diabetespedigreefunction
- glucose
- ...
Feature names seen at fit time, yet now missing:
- Age
- BMI
- BloodPressure
- DiabetesPedigreeFunction
- Glucose
- ...

Feature names unseen at fit time:
- age
- bloodpressure
- bmi
- diabetespedigreefunction
- gluco

## Load Train Model

In [20]:
def load_models():
    model_lr   = joblib.load('trained_models/trained_model_lr.joblib')
    model_dtc  = joblib.load('trained_models/trained_model_dtc.joblib')
    model_lsvc = joblib.load('trained_models/trained_model_lsvc.joblib')
    model_svc  = joblib.load('trained_models/trained_model_svc.joblib')
    model_rfc  = joblib.load('trained_models/trained_model_rfc.joblib')
    model_gbc  = joblib.load('trained_models/trained_model_gbc.joblib')
    return model_lr, model_dtc, model_lsvc, model_svc, model_rfc, model_gbc

load_models()

(LogisticRegression(),
 DecisionTreeClassifier(),
 LinearSVC(),
 SVC(),
 RandomForestClassifier(),
 GradientBoostingClassifier())

In [21]:
model_lr, model_dtc, model_lsvc, model_svc, model_rfc, model_gbc = load_models()

In [22]:
loaded_models = [model_lr, model_dtc, model_lsvc, model_svc, model_rfc, model_gbc]
print(loaded_models)


[LogisticRegression(), DecisionTreeClassifier(), LinearSVC(), SVC(), RandomForestClassifier(), GradientBoostingClassifier()]


## Make Predictions

In [23]:
def new_data(x1, x2, x3, x4, x5, x6, x7, x8):
    return np.array([[x1, x2, x3, x4, x5, x6, x7, x8]])

def result(pred):
    if pred == 0:
        return 'Negative'
    if pred == 1:
        return 'Positive'

data1 = new_data(1, 85, 66, 29, 0, 26.6, 0.351, 31)
data2 = new_data(6,148,72,35,0,33.6,0.627,50)



In [24]:

for i in loaded_models:
    print(f" {str(i).replace('()', '') :30} : {i.predict(data2)[0]}   ==>   {result(i.predict(data2)[0])}")

 LogisticRegression             : 1   ==>   Positive
 DecisionTreeClassifier         : 1   ==>   Positive
 LinearSVC                      : 0   ==>   Negative
 SVC                            : 1   ==>   Positive
 RandomForestClassifier         : 1   ==>   Positive
 GradientBoostingClassifier     : 1   ==>   Positive


