In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming each archive download.
CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'pimadiabetescsv:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F945344%2F1601678%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240911%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240911T051900Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Db3d57b5b4474ee71bd32264ed5eb0fcf4fb79df2c3079ae204588ca137c977e5d59680afa1b5be3faed348983e31a749534b7992402ca5bd062161065a85bc9faaf6dc05a5aa7a1177163eada512b746f3824263e36d8d45264d2debf02da2562b678c0a14c11ab42acced3ed55c8ea6e8ef8e6950af018c7656dace653c5b10a94976c46f65b21d549d9488a98300d6f0596b165132baadc317eae4ef0ad062cdbf3c4e5ecab17bc18f39edb6352395d5154a9b07c33405fc40bb5a5644952309cc5b521dad78dd650692a4358993b9da6072b91340a93df9412be5839ddf35900806a9874db55fece45de64ef05d29f3644c1c28a28653baf523119dd25c6b,multiples:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F993686%2F1678469%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240911%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240911T051900Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D634296d3f670d082bbafb68afb10cf6a19601e180168aa9f8aff2799276e72287074715a60364e01313da85ef25ae99a1db33aa2c7e05420d48e8873c54fd6cf9b8e9fb04f98708a2d5973dbd1986a71ff996f1257cfba4993f398b32eff419d7b4cd270036f931b4e7a079dcd3a98f7638e9cabb0b97a4fd7f4f4e73256b3e83ef172a8a0701bbe9c7eff14bad6e4a057a8267f5a4a9b9b0577caabc13d182ff6e0e00235bbd3bbe233da6f76e1dde621c826310e5ec38fd12cae9d52bad055afef05f3184597c94984709e66b2ff75c9d3a1e1c1dc29cf1123670f1e89bdd99a5f9f2624bdf02d8453547df6b1af40f79dba0e8ef49d96c7be5a51ec74fce2'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>. A failed download is
# reported and skipped so the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray colon in the URL part cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # Content-Length is optional; without it the progress bar simply
            # stays empty instead of raising on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the file by name, so buffered bytes must
            # reach the disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # replace the imported module for the rest of the notebook.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


# Pima Diabetes Dataset Analysis (98% Accuracy & 98% F1 Score)

### Using Keras(Dense Layer), Optuna(Hyperparameter optimization) & RFECV (Recursive Feature Elimination)


# Author
### Pradeep Gurav
##### https://www.linkedin.com/in/pradeepgurav/

#### November 26, 2020

##### How to get Pima Diabetes Analysis results > 98%
Most analysis/model results for this dataset on Kaggle are around 78% accuracy. The highest so far was Piotr's analysis notebook (see References), which achieved 84% accuracy.

There are many notebooks covering data exploration and understanding of the Pima Diabetes data, so this report does not cover that part. The data exploration showed that many of the features (such as Insulin, SkinThickness, etc.) have overlapping distributions across the outcomes (Diabetic/NonDiabetic), so it was going to be hard to rely on those features.

Tried many methods to improve the accuracy such as:

    1. Deleting records with zero values
    2. Imputing zero values using KNN, mean values
    3. Clipping extreme values or out of range values, bucketing certain features (Age, Pregnancies)
    4. Applying LogTransform so as to shift some of the feature's distribution towards Normal. No Good results.
    5. Many other imputing methods including my own method (Dense Layer network - regression values)
    6. Feature separation, using two separate models using diff features and concatenating the models
    7. And also using Optuna to try thousands of combinations of layers, Dense layer units, lr, batch sizes. Optuna is  a fantastic tool.
    8. Also used RFECV & PCA to identify the most correlated features
    9. What actually contributed the most to the performance is the 'Data Augmentation' method

The below code is self explanatory, here is the summary of steps:

    1. Split the original data 75:25 and saved both parts as CSVs again. The 25% dataset was not exposed to the model while training.
    2. Used the 75% dataset (576 records) and added 76992 new records to it.
    3. Used Optuna hyperoptimization to get optimum layers and units.
    4. Saved the best model and used the same model once Optuna optimization was completed
    5. Used the 25% data kept aside for testing, which gave 98% accuracy (the same 98% for most classification metrics)
    6. This model could lead to accuracy above 99% with a bit more effort
       
Data files used:

    1. Public known data      ../input/pimadiabetescsv/diabetes.csv
    2. file with 100 times    ../input/multiples/diabetes75pc_100_times.csv
    3. 25% test data unused in training    ../input/multiples/diabetes25pc.csv

In [None]:
#!pip install optuna
from __future__ import absolute_import, division, print_function
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau
from keras.backend import clear_session
from keras.optimizers import RMSprop, Adam
from keras import regularizers

from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import seaborn as sns
import optuna
from optuna.integration import TFKerasPruningCallback

from subprocess import check_output
import time
import warnings

import os
import sys

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Silence all Python warnings for the remainder of the notebook.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [None]:
# Time budget for the Optuna hyper-parameter search: the value is passed as
# `timeout=runtime` to `study.optimize`, which interprets it in seconds.
# The results in this report were achieved with a one-hour run (3600 s).
runtime = 3600

In [None]:
# Augmented training data (the "100 times" file listed in the introduction).
ori = pd.read_csv("../input/multiples/diabetes75pc_100_times.csv")

# Reference only -- NOT executed. This is how the original data was split 75:25.
# New records were fabricated with the author's `multiplydata_pcversion`
# function, which is not part of this notebook.
#
#   ori = pd.read_csv("../input/pimadiabetescsv/diabetes.csv")
#
#   Train_data = ori[:576]
#   Test_data = ori[576:]
#   Test_data.shape[0]
#
#   trainfile = "C://Users//pg//Documents//datascience//case studies//pima//data//diabetes75pc.csv"
#   testfile = "C://Users//pg//Documents//datascience//case studies//pima//data//diabetes25pc.csv"
#   # NOTE: pandas >= 1.5 spells this keyword `lineterminator`.
#   Train_data.to_csv(trainfile, index=False, line_terminator='\n')
#   #Output ->   ../input/multiples/diabetes75pc_100_times.csv
#   Test_data.to_csv(testfile, index=False, line_terminator='\n')
#   #Output ->   ../input/multiples/diabetes25pc.csv
#
#   multiple = 50
#   multiplydata_pcversion(multiple,trainfile)

print("Check the number of records:  ", ori.shape)
ori.head()

### Data Exploration

The below heatmap shows correlation among all the features and the Outcome.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of `ori`,
# enlarged to 10 x 8 inches before it is rendered.
sns.heatmap(ori.corr(), cmap='YlGnBu', annot=True)
plt.gcf().set_size_inches(10, 8)
plt.show()

In [None]:
# Violin plot of the BloodPressure distribution for each Outcome class.
# NOTE(review): `split=True` is normally paired with a `hue` variable and none
# is passed here -- confirm the intended effect on the installed seaborn version.
ax = sns.violinplot(x='Outcome', y='BloodPressure', data=ori, palette='muted', split=True)

From the above plot we can see that BloodPressure has hardly any effect on the Outcome (Diabetic/NonDiabetic). Features like this, whose distributions barely differ between the two outcomes, can therefore be dropped from the model.

In [None]:
# Work on a deep copy so `ori` itself is left untouched for other cells.
pdata = ori.copy(deep=True)
# First eight columns -- presumably every column except Outcome; verify
# against the CSV header.
feature_names = pdata.columns[:8]
y = pdata.Outcome

# Features chosen based on RFECV result
best_features = ['Pregnancies', 'Glucose', 'BMI', 'DiabetesPedigreeFunction']

# Keep the fitted scaler instead of discarding it: any data scored later must
# go through this same transformation, not one fitted on that data.
# NOTE(review): the scaler is fitted before the split, so validation rows
# contribute to the scaling statistics -- confirm this is acceptable.
scaler = StandardScaler().fit(pdata[best_features])
X = scaler.transform(pdata[best_features])

# Splitting data into training and testing (80% / 20%); the fixed seed makes
# the split, and therefore the reported scores, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Use a TPU when one is attached; otherwise fall back to TensorFlow's default
# strategy so the notebook also runs on CPU/GPU runtimes instead of stopping here.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    # Raised by the resolver when no TPU can be located.
    tpu = None
    tpu_strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # instantiate a distribution strategy
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

study_name = "PIMAFeatureEng"
checkpoint_path = './Pima26Nov.hdf5'
checkpoint_dir = os.path.dirname(checkpoint_path)

# create checkpoint callback: writes the full model (not only the weights) to
# checkpoint_path whenever the monitored validation accuracy improves.
cp_callback = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                              monitor='val_accuracy',
                                              save_weights_only=False,
                                              save_best_only=True,
                                              verbose=0
                                             )

def objective(trial):
    """Train one candidate network and return its validation accuracy.

    The trial samples the width of three hidden Dense layers (16 to 80 units
    in steps of 4), the Adam learning rate (1e-4 to 1e-1) and the batch size
    (32 to 128 in steps of 16). Training runs for at most 100 epochs with
    early stopping, Optuna pruning and the shared checkpoint callback.

    Args:
        trial: The Optuna trial used to sample hyper-parameters and to report
            intermediate ``val_accuracy`` values for pruning.

    Returns:
        The accuracy of the trained model on ``(X_test, y_test)``.
    """
    # Clear clutter from previous Keras session graphs.
    clear_session()

    num_epochs = 100
    # Create callbacks for early stopping and pruning.
    # NOTE(review): `cp_callback` is a single instance shared by all trials, so
    # it presumably keeps the best model across the whole study -- confirm.
    callbacks = [
        keras.callbacks.EarlyStopping(patience=3),
        TFKerasPruningCallback(trial, "val_accuracy"),
        cp_callback,
    ]

    # The strategy scope has to be active while the model is built and
    # compiled. Entering it around the `def` statement only covers the moment
    # the function is defined, not the later calls made by Optuna.
    with tpu_strategy.scope():
        model = Sequential()
        for i in range(3):
            units = trial.suggest_int(
                'FC_{}_num_hidden_units'.format(i), 16, 80, step=4)
            if i == 0:
                # Only the first layer declares the input width; it follows
                # the number of selected feature columns.
                model.add(Dense(units, activation="relu",
                                input_dim=X_train.shape[1]))
            else:
                model.add(Dense(units, activation="relu"))
        model.add(Dense(1, activation='sigmoid'))
        lr = trial.suggest_float("lr", 1e-4, 1e-1)
        # `learning_rate` is the supported keyword; the `lr` alias is
        # deprecated and rejected by newer Keras releases.
        model.compile(loss='binary_crossentropy',
                      optimizer=Adam(learning_rate=lr),
                      metrics=['accuracy'])

    batch_size = trial.suggest_int('Batch_size', 32, 128, step=16)
    model.fit(X_train,
              y_train,
              validation_data=(X_test, y_test),
              epochs=num_epochs,
              batch_size=batch_size,
              callbacks=callbacks,
              verbose=0
              )
    # evaluate() returns [loss, accuracy] because 'accuracy' is the only metric.
    score = model.evaluate(X_test, y_test, verbose=0)
    return score[1]

In [None]:
# perf_counter() measures elapsed time; process_time() would report CPU time,
# which is not what "time taken" suggests for a search bounded by a timeout.
tic = time.perf_counter()
# Turn off Optuna's default log handler so per-trial messages do not flood the
# cell output (remove this line to see the logs during optimization).
optuna.logging.disable_default_handler()
study = optuna.create_study(
        sampler=optuna.samplers.TPESampler(
            consider_prior=True, prior_weight=1.0,
            consider_magic_clip=True, consider_endpoints=False,
            n_startup_trials=10, n_ei_candidates=24,
            seed=None),
        pruner=optuna.pruners.SuccessiveHalvingPruner(
            min_resource=2, reduction_factor=4, min_early_stopping_rate=1),
        study_name = study_name,
        direction="maximize",
)

# Run trials until the time budget is used up; the number of trials is not capped.
study.optimize(objective, timeout=runtime)

toc = time.perf_counter()
print("time taken :  ")
print(toc-tic)
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial number:", study.best_trial.number)
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


In [None]:
# Print every file below the current directory, e.g. to check that the model
# checkpoint has been written.
for folder, _subdirs, names in os.walk('./'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Plot the intermediate values reported by the trials of the finished study.
optuna.visualization.plot_intermediate_values(study)

In [None]:
# Parallel-coordinate plot relating the sampled hyper-parameters to the objective value.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Load the optimized model which is saved at checkpoint_path.
# The file is written by the ModelCheckpoint callback used during the Optuna
# search, so this cell only works after that search has saved a model.
print(checkpoint_path)
new_model = keras.models.load_model(checkpoint_path)
new_model.summary()

In [None]:
# Evaluate the restored model on the 20% split held out by train_test_split.
scores = new_model.evaluate(X_test, y_test)
py_pred  = new_model.predict(X_test)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 class labels.
py_pred = np.where(py_pred > 0.5, 1, 0)
# These numbers come from X_test, so the message must not call them training accuracy.
print("Accuracy on the validation split", scores[1]*100)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Configure the seaborn theme in one call: colour codes on, fonts scaled by 1.5.
sns.set(color_codes=True, font_scale=1.5)

# Confusion matrix of the validation-split predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, py_pred)
sns.heatmap(cm, annot=True, fmt='g')
plt.show()

The confusion matrix shows the number of True Positives (TP), True Negatives (TN), False Positives (FP) and False Negatives (FN). Zero FPs and zero FNs is the ideal outcome.

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report

# Accuracy (as a percentage) and the per-class report for the validation split.
validation_accuracy = accuracy_score(y_test, py_pred) * 100
print(f'Accuracy: {validation_accuracy:.2f}%')
print('Classification report:\n\n', classification_report(y_test, py_pred))


In [None]:
# Load the test data which was kept aside in the beginning
test = pd.read_csv("../input/multiples/diabetes25pc.csv")
best_features = ['Pregnancies', 'Glucose', 'BMI', 'DiabetesPedigreeFunction']
# Standardise the held-out rows with the statistics of the training data
# (`ori`), i.e. the scaling the model was trained on. Fitting a new scaler on
# the test set would feed the model differently scaled inputs.
X1 = StandardScaler().fit(ori[best_features]).transform(test[best_features])
y1 = test.Outcome

In [None]:
# Predict the held-out rows and turn the sigmoid outputs into 0/1 labels
# using a 0.5 threshold.
py_pred1 = (new_model.predict(X1) > 0.5).astype(int)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Configure the seaborn theme in one call: colour codes on, fonts scaled by 1.5.
sns.set(color_codes=True, font_scale=1.5)

# Confusion matrix of the predictions for the held-out 25% data, as an annotated heatmap.
cm = confusion_matrix(y1, py_pred1)
sns.heatmap(cm, annot=True, fmt='g')
plt.show()

In [None]:
# Accuracy (as a percentage) and the per-class report for the held-out 25% data.
holdout_accuracy = accuracy_score(y1, py_pred1) * 100
print(f'Accuracy: {holdout_accuracy:.2f}%')
print('Classification report:\n\n', classification_report(y1, py_pred1))

### Discussion on Results achieved

1. The report describes various methods tried to optimize the results
2. It provides a framework which uses various tools and techniques to solve similar problems
3. There is a significant jump in accuracy, from 84% to 98%

    
### References /Acknowledgements

####  1. Notebook by Piotr Tynecki on Pima Dataset:->    https://www.kaggle.com/ptynecki/pima-indians-diabetes-prediction-with-lr-84



####  2. Optuna - used for hyperparameter optimization:->    https://optuna.org/



####  3. Kaggle - Thanks for providing this fantastic platform and allowing to use GPUs/TPUs