# Notebook for disease prediction with predefined symptoms, using one-hot encoding

## Import libraries

In [119]:
import pandas as pd
import numpy as np
import ast
import json
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import random
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

## Data Preprocessing

In [120]:
def get_pencernaan_data():
    """
    Load the Pencernaan table from 'list_penyakit.csv' in the working directory.

    Params: None

    Return: pandas.DataFrame with the CSV contents. If reading fails for any
    reason, the error is printed and an empty DataFrame is returned instead.
    """
    try:
        return pd.read_csv('list_penyakit.csv')
    except Exception as read_error:
        # Best-effort load: report the problem and hand back an empty frame.
        print(read_error)
        return pd.DataFrame()

In [121]:
def sample_data(dataframe, col_of_list, label_col, num_samples=5, n=5):
    """
    Sample data from a DataFrame where one column contains lists and another contains labels.

    For every record, `num_samples` random subsets (without replacement) are drawn
    from the record's list. Each subset has `n` elements, or the whole list in
    random order when the list is shorter than `n`.

    Params:
    - dataframe (pd.DataFrame): The input DataFrame.
    - col_of_list (str): The column name which contains lists. Cells may be real
      lists, strings holding a Python list literal, or ', '-separated strings.
    - label_col (str): The column name which contains labels.
    - num_samples (int): Number of samples to generate for each record.
    - n (int): Number of elements to sample from each list.

    Returns:
    - pd.DataFrame: A new DataFrame with columns [col_of_list, label_col], holding
      one sampled list and its label per row.
    """
    samples, labels = [], []
    column_names = dataframe.columns.to_list()
    col_of_list_index = column_names.index(col_of_list)
    label_col_index = column_names.index(label_col)

    for record_num in range(len(dataframe)):
        record_list = dataframe.iloc[record_num, col_of_list_index]
        record_label = dataframe.iloc[record_num, label_col_index]

        # Ensure record_list is a list
        if isinstance(record_list, str):
            raw_text = record_list
            try:
                parsed = ast.literal_eval(raw_text)
            except (ValueError, SyntaxError):
                parsed = None
            if isinstance(parsed, (list, tuple)):
                record_list = list(parsed)
            else:
                # Not a list literal (plain text, or a literal such as "123" that
                # parses to a scalar): treat the cell as ', '-separated items.
                record_list = raw_text.split(', ')
        elif not isinstance(record_list, list):
            record_list = list(record_list)

        # Never ask for more elements than the record holds.
        sample_size = min(n, len(record_list))
        for _ in range(num_samples):
            samples.append(np.random.choice(record_list, sample_size, replace=False).tolist())
            labels.append(record_label)

    new_df = pd.DataFrame(list(zip(samples, labels)), columns=[col_of_list, label_col])
    return new_df

In [122]:
def sample_multiple_n(dataframe, col_of_list, label_col, num_samples=20, n_values=[5, 4, 3, 2]):
    """
    Build one sampled DataFrame per value in `n_values` and stack them together
    with the original DataFrame.

    Params:
    - dataframe (pd.DataFrame): The input DataFrame.
    - col_of_list (str): The column name which contains lists.
    - label_col (str): The column name which contains labels.
    - num_samples (int): Number of samples to generate for each record.
    - n_values (list of int): Sample sizes; `sample_data` is called once per value.

    Returns:
    - pd.DataFrame: The original rows plus all sampled rows, sorted by `label_col`
      and re-indexed from zero.
    """
    # The untouched input frame goes first, followed by one sampled frame per size.
    frames = [dataframe]
    frames.extend(
        sample_data(dataframe, col_of_list, label_col, num_samples, size)
        for size in n_values
    )

    combined = pd.concat(frames)
    combined = combined.sort_values(by=[label_col])
    return combined.reset_index(drop=True)

In [123]:
def one_hot_encode_symptoms(dataframe, symptoms_col, label_col):
    """
    One-hot encode the symptoms in the specified column and concatenate the result with the label column.

    The input DataFrame is not modified.

    Params:
    - dataframe (pd.DataFrame): The input DataFrame.
    - symptoms_col (str): The column name containing the symptoms. Each cell may be
      a list, a numpy array, or a ', '-separated string.
    - label_col (str): The column name containing the labels.

    Returns:
    - pd.DataFrame: The label column followed by one 0/1 column per unique symptom,
      with symptom columns in sorted order.
    """
    def _as_list(cell):
        # Normalise a single cell to a plain list of symptoms.
        if isinstance(cell, list):
            return cell
        if isinstance(cell, np.ndarray):
            return cell.tolist()
        return cell.split(', ')

    # Work on local copies so the caller's column is left untouched.
    symptom_lists = [_as_list(cell) for cell in dataframe[symptoms_col]]

    # Sets give constant-time membership tests and a linear-time union.
    symptom_sets = [set(symptoms) for symptoms in symptom_lists]
    all_symptoms = sorted(set().union(*symptom_sets))

    # One 0/1 column per symptom, one entry per input row.
    binary_matrix = {
        symptom: [int(symptom in present) for present in symptom_sets]
        for symptom in all_symptoms
    }

    # Reuse the input index so the columns line up with the labels on concat.
    one_hot_encoded_df = pd.DataFrame(binary_matrix, index=dataframe.index)

    # Concatenate the label column with the one-hot encoded DataFrame
    df_final = pd.concat([dataframe[label_col], one_hot_encoded_df], axis=1)

    return df_final

In [124]:

def shuffle_and_split(dataframe, label_col, test_size=0.2, random_state=42):
    """
    Shuffle the DataFrame, then split it into train and test sets, stratifying
    on the label column.

    Params:
    - dataframe (pd.DataFrame): The input DataFrame.
    - label_col (str): The column name containing the class labels.
    - test_size (float): The proportion of the dataset to include in the test split.
    - random_state (int): Seed used for both the shuffle and the split.

    Returns:
    - df_train (pd.DataFrame): The training set.
    - df_test (pd.DataFrame): The test set.
    """
    # Full-fraction sample == row shuffle; the index is rebuilt afterwards.
    shuffled = dataframe.sample(frac=1, random_state=random_state)
    shuffled = shuffled.reset_index(drop=True)

    # Stratified split on the label column.
    train_part, test_part = train_test_split(
        shuffled,
        test_size=test_size,
        stratify=shuffled[label_col],
        random_state=random_state,
    )
    return train_part, test_part

In [125]:
def extract_features_and_labels(df_train, df_test, label_col):
    """
    Separate the label column from the feature columns of the train and test DataFrames.

    Params:
    - df_train (pd.DataFrame): The training DataFrame.
    - df_test (pd.DataFrame): The testing DataFrame.
    - label_col (str): The column name containing the labels.

    Returns:
    - x_train (np.ndarray): Training features cast to int32.
    - y_train (pd.Series): Training labels.
    - x_test (np.ndarray): Testing features cast to int32.
    - y_test (pd.Series): Testing labels.
    """
    feature_frames, label_series = [], []
    for frame in (df_train, df_test):
        # Everything except the label column is a feature.
        feature_frames.append(frame.drop(label_col, axis=1))
        label_series.append(frame[label_col])

    # Cast to int32 and unwrap to plain numpy arrays.
    x_train, x_test = (features.astype('int32').values for features in feature_frames)
    y_train, y_test = label_series

    return x_train, y_train, x_test, y_test

In [126]:
def encode_labels(y_train, y_test):
    """
    Encode and one-hot encode labels for training and testing sets.

    The LabelEncoder is fitted on the training labels only, so a label that
    appears only in `y_test` makes `transform` raise. Both one-hot matrices are
    built with the same number of columns (one per class seen in training), even
    when the test set does not contain the highest-indexed class.

    Params:
    - y_train (pd.Series or np.ndarray): Training labels.
    - y_test (pd.Series or np.ndarray): Testing labels.

    Returns:
    - y_train_encoded (np.ndarray): Encoded and one-hot encoded training labels (int32).
    - y_test_encoded (np.ndarray): Encoded and one-hot encoded testing labels (int32).
    - label_encoder (LabelEncoder): Fitted LabelEncoder instance.
    """
    # Initialize LabelEncoder
    label_encoder = LabelEncoder()

    # Fit and transform the training labels
    y_train_indices = label_encoder.fit_transform(y_train)

    # Transform the testing labels
    y_test_indices = label_encoder.transform(y_test)

    # Pin the one-hot width to the number of known classes. Without this,
    # to_categorical infers the width from the largest index present, which can
    # make the test matrix narrower than the training matrix.
    num_classes = len(label_encoder.classes_)

    # One-hot encode the labels
    y_train_encoded = tf.cast(to_categorical(y_train_indices, num_classes=num_classes), tf.int32).numpy()
    y_test_encoded = tf.cast(to_categorical(y_test_indices, num_classes=num_classes), tf.int32).numpy()

    return y_train_encoded, y_test_encoded, label_encoder

## Model Building

In [127]:
import tensorflow as tf

def build_model(num_classes):
    """
    Build a Keras Sequential model.

    The Keras backend session is cleared before the model is created.

    Params:
    - num_classes (int): Number of output classes; sets the width of the softmax
      output layer.

    Returns:
    - model (tf.keras.Model): Uncompiled Keras model.
    """
    tf.keras.backend.clear_session()
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        # Output width follows the caller's class count instead of a fixed 14,
        # so it always matches the one-hot label width.
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

In [128]:
def compile_model(model, learning_rate=0.0001):
    """
    Configure a Keras model for training with Adam, categorical cross-entropy
    and an accuracy metric.

    Params:
    - model (tf.keras.Model): The Keras model to compile.
    - learning_rate (float): Learning rate for the optimizer.

    Returns:
    - model (tf.keras.Model): The same model instance, now compiled.
    """
    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    cross_entropy = tf.keras.losses.CategoricalCrossentropy()

    model.compile(optimizer=adam, loss=cross_entropy, metrics=['accuracy'])
    return model

In [129]:
def train_model(model, x_train, y_train, x_val, y_val, epochs=1000):
    """
    Fit a Keras model with early stopping on the validation loss.

    Training stops once 'val_loss' has not improved for 5 epochs, and the
    weights from the best epoch are restored.

    Params:
    - model (tf.keras.Model): The Keras model to train.
    - x_train (np.ndarray): Training features.
    - y_train (np.ndarray): Training labels.
    - x_val (np.ndarray): Validation features.
    - y_val (np.ndarray): Validation labels.
    - epochs (int): Maximum number of epochs to train.

    Returns:
    - history (tf.keras.callbacks.History): History object containing training history.
    """
    stop_on_plateau = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
    )
    return model.fit(
        x_train,
        y_train,
        epochs=epochs,
        validation_data=(x_val, y_val),
        callbacks=[stop_on_plateau],
    )

In [130]:
def save_model(model: tf.keras.models.Model) :
    """
    Save the given Keras model to 'predefined_model.h5' in the working directory.

    Params:
    - model (tf.keras.models.Model): The model to save.

    Returns: None
    """
    model.save('predefined_model.h5')

## Model Training

In [131]:
def run_training():
    """
    Run the full training pipeline and write the inference artifacts.

    Steps: load the CSV, augment it by sampling symptom subsets, one-hot encode
    the symptoms, split into train/test, encode the labels, build/compile/train
    the model, print test metrics, and save the model.

    Files written to the working directory:
    - 'predefined_model.h5' (via save_model)
    - 'all_symptoms.txt': ', '-joined symptom names, in the exact column order of
      the feature matrix the model was trained on.
    - 'class_dict.json': mapping from model output index to disease label, in the
      order of the fitted LabelEncoder's classes.

    Params: None

    Returns: None
    """
    # Get symptoms and diseases data
    df = get_pencernaan_data()

    # Get sampled data
    df_sampled = sample_multiple_n(df, "Gejala", "Penyakit", num_samples=20)

    # Get one hot encoded data
    df_final = one_hot_encode_symptoms(df_sampled, 'Gejala', 'Penyakit')

    # Get training and testing set
    df_train, df_test = shuffle_and_split(df_final, 'Penyakit', test_size=0.2)

    # Get extracted feature and label of training and testing data
    x_train, y_train, x_test, y_test = extract_features_and_labels(df_train, df_test, 'Penyakit')

    # Encode labels
    y_train_encoded, y_test_encoded, label_encoder = encode_labels(y_train, y_test)

    # Determine number of classes from the one-hot label width
    num_classes = y_train_encoded.shape[1]

    # Define the model
    model = build_model(num_classes)

    # Compile the model
    model = compile_model(model)

    # Train the model
    # NOTE(review): the test split is also used as the early-stopping validation
    # set, so the test metrics printed below are not from fully held-out data.
    _ = train_model(model, x_train, y_train_encoded, x_test, y_test_encoded)

    # Evaluate the model
    loss, accuracy = model.evaluate(x_test, y_test_encoded)
    print(f'Test accuracy: {accuracy:.2f}')
    print(f'Test loss: {loss:.2f}')

    save_model(model)

    # Persist the symptom vocabulary from the encoded frame's feature columns, so
    # position i in the file is feature i of the model input. Re-deriving it from
    # the raw CSV could drift from what the model was actually trained on.
    feature_symptoms = [column for column in df_final.columns if column != 'Penyakit']
    with open('all_symptoms.txt', 'w') as txt_file:
        txt_file.write(', '.join(feature_symptoms))

    # Output unit i of the model corresponds to label_encoder.classes_[i], so the
    # index -> label mapping must come from the encoder, not from the order in
    # which labels happen to appear in the CSV.
    class_dict = {index: label for index, label in enumerate(label_encoder.classes_.tolist())}
    with open('class_dict.json', 'w') as json_file:
        json.dump(class_dict, json_file)

if __name__ == "__main__" :
    run_training()

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

  saving_api.save_model(


## Inference

In [132]:
# Restore the network saved by the training step
model = tf.keras.models.load_model('predefined_model.h5')

# Read the symptom vocabulary: a single comma-separated line of names
with open('all_symptoms.txt', 'r') as symptoms_file:
    raw_symptoms = symptoms_file.read()
all_symptoms = list(map(str.strip, raw_symptoms.split(',')))

# Read the index -> label mapping written at training time
with open('class_dict.json', 'r') as class_json:
    class_dict = json.load(class_json)

# Fit a label encoder on the known class names so that predicted indices can be
# decoded back to labels
unique_classes = list(class_dict.values())
label_encoder = LabelEncoder().fit(unique_classes)

In [133]:
def preprocess_input(input_symptoms, all_symptoms, num_features=68):
    """
    Turn a list of symptom names into a single-row binary feature vector.

    Params:
    - input_symptoms (list of str): Symptoms reported by the user. Surrounding
      whitespace is stripped, and names not found in `all_symptoms` are ignored.
    - all_symptoms (list of str): The symptom vocabulary; position i of the
      output corresponds to all_symptoms[i].
    - num_features (int or None): Maximum length of the output vector; the vector
      is cut to its first `num_features` entries. Pass None to keep one entry per
      vocabulary symptom. Defaults to 68, the value previously hard-coded here
      (presumably the trained model's input width; confirm against the model).

    Returns:
    - np.ndarray: Array of shape (1, k) holding 0/1 flags.
    """
    # Sets make both membership checks constant-time.
    known_symptoms = set(all_symptoms)

    # Ensure only valid symptoms are considered
    selected = {symptom.strip() for symptom in input_symptoms if symptom.strip() in known_symptoms}

    # Create a binary representation of the symptoms
    input_data = [int(symptom in selected) for symptom in all_symptoms]
    if num_features is not None:
        input_data = input_data[:num_features]

    # Reshape to a single-row batch
    return np.array(input_data).reshape((1, -1))

In [134]:
def predict_and_display(input_data, model, label_encoder):
    """
    Run the model on `input_data` and print the most probable class with its probability.

    Params:
    - input_data: Feature batch passed straight to `model.predict`.
    - model: Object exposing `predict`, returning an array of class probabilities.
    - label_encoder: Object exposing `inverse_transform`, used to map the winning
      index back to a class label.

    Returns: None (the result is printed).
    """
    probabilities = model.predict(input_data)

    # Position of the largest probability in the prediction array
    top_index = np.argmax(probabilities)

    # Map the index back to its label; inverse_transform takes and returns a sequence
    decoded = label_encoder.inverse_transform([top_index])
    predicted_class = decoded[0]

    # Probability of the winning class, read from the first row of the batch
    max_prob = probabilities[0][top_index]

    print(f"Predicted Class: {predicted_class}, Probability: {max_prob:.2f}")

In [140]:
if __name__ == "__main__":
    # Ask the user for a comma-separated list of symptoms
    input_symptoms_str = input("Masukkan gejala yang dirasakan: ")
    input_symptoms = list(map(str.strip, input_symptoms_str.split(',')))

    # Encode the symptoms as a feature vector, then predict and print the result
    input_data = preprocess_input(input_symptoms, all_symptoms)
    predict_and_display(input_data, model, label_encoder)

Masukkan gejala yang dirasakan: mual muntah, perut kembung, pusing, dehidrasi
Predicted Class: Diare, Probability: 0.61


## Convert to tfjs

In [141]:
!pip install tensorflowjs

Collecting tensorflowjs
  Downloading tensorflowjs-4.20.0-py3-none-any.whl (89 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.1/89.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-decision-forests>=1.5.0 (from tensorflowjs)
  Downloading tensorflow_decision_forests-1.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
Collecting packaging~=23.1 (from tensorflowjs)
  Downloading packaging-23.2-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow<3,>=2.13.0 (from tensorflowjs)
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2

In [1]:
!tensorflowjs_converter \
--input_format=keras \
/content/predefined_model.h5 \
/content/model

2024-06-13 15:25:56.876081: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Recursively archive the model/ directory into a single zip file.
!zip -r predefined_model_disease_prediction_json.zip model/

  adding: model/ (stored 0%)
  adding: model/group1-shard1of1.bin (deflated 8%)
  adding: model/model.json (deflated 75%)
