In [11]:
import json
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import joblib


In [2]:
# Load the raw records and split them into model inputs (descriptions) and targets (class ids)
DATA_PATH = "idmanual (1).json"  # path to the JSON export; edit here to point at another file

# Decode explicitly as UTF-8: without `encoding`, open() uses the platform locale, which can
# fail or silently mis-decode non-ASCII text in the descriptions.
with open(DATA_PATH, encoding="utf-8") as f:
    data = json.load(f)

# Each record is expected to carry a 'description' and a 'class_id' key; a record missing
# either one raises KeyError here rather than producing misaligned inputs and targets.
texts = np.array([entry['description'] for entry in data])
labels = np.array([entry['class_id'] for entry in data])

In [3]:
# Build the word index from the corpus and encode each description as a list of word ids.
# The constructor defaults are spelled out on purpose: the vocabulary is not capped and no
# out-of-vocabulary token is reserved.
tokenizer = Tokenizer(num_words=None, oov_token=None)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [4]:
# Bring every encoded description to the same length so they stack into one 2-D array.
max_seq_length = 100  # number of token positions per example

# `padding` and `truncating` are written out at their default value ('pre'): shorter
# sequences are zero-filled at the front, longer ones keep only their last tokens.
sequences = pad_sequences(
    sequences,
    maxlen=max_seq_length,
    padding='pre',
    truncating='pre',
)

In [5]:
# Convert labels to integer format
# Read the raw class ids from `data` rather than from `labels`: this cell rebinds `labels`
# to the integer ids, so deriving the mapping from `labels` would, on a second run, map
# integers to integers and lose the original class ids held in `id_to_label`.
class_ids = np.array([entry['class_id'] for entry in data])

# np.unique returns the distinct class ids in sorted order; a class's position in that
# order is its integer id.
label_to_id = {label: idx for idx, label in enumerate(np.unique(class_ids))}
id_to_label = {idx: label for label, idx in label_to_id.items()}
labels = np.array([label_to_id[label] for label in class_ids])

In [None]:
# Early-stopping callback: tolerate 2 epochs without improvement, then stop and roll back
# to the best weights seen.
# NOTE(review): this cell has no execution count, and the training cell rebinds
# `early_stopping` with patience=3 before calling fit, so this definition is not the one
# that was used for the recorded run.
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=2,
)

In [6]:
# GRU text classifier: word ids -> embeddings -> single GRU layer -> softmax over classes
embedding_dim = 100  # size of each learned word vector
num_classes = np.unique(labels).size
vocab_size = len(tokenizer.word_index) + 1  # +1 so the highest word id is a valid embedding row

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_length),
    GRU(128),
    Dense(num_classes, activation='softmax'),
])

# Sparse categorical cross-entropy because `labels` holds integer class ids, not one-hot rows.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [7]:
# Define early stopping callback
# EarlyStopping watches `val_loss`, which only exists when fit() is given validation data.
# Without it the callback never triggers and training always runs the full epoch budget.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Keras takes the validation split from the tail of the arrays before any shuffling, so
# shuffle first (with a fixed seed for reproducibility) in case the file is ordered by class.
shuffle_rng = np.random.default_rng(42)
shuffled_idx = shuffle_rng.permutation(len(sequences))

# Train the model; 10% of the shuffled samples are held out to drive early stopping.
history = model.fit(
    sequences[shuffled_idx],
    labels[shuffled_idx],
    validation_split=0.1,
    epochs=10,
    batch_size=32,
    verbose=1,
    callbacks=[early_stopping],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1edd30d5520>

In [8]:
# Score the model on the full dataset. These are the same arrays that were passed to
# model.fit, so the figure is not a held-out estimate of generalisation.
evaluation = model.evaluate(sequences, labels)
loss, accuracy = evaluation
print('Accuracy: {}'.format(accuracy))

Accuracy: 0.9960648417472839


In [9]:
# Predict a class for every sample, then translate each integer id back to its class label.
class_probabilities = model.predict(sequences)
y_pred = class_probabilities.argmax(axis=-1)  # most probable class per row
predicted_labels = np.array([id_to_label[class_idx] for class_idx in y_pred])



In [12]:
# Save the preprocessing state alongside the model: in a fresh session the model cannot
# be used for inference without the fitted tokenizer, the padding length, and the
# integer-id -> class-label mapping.
joblib.dump(
    {
        'tokenizer': tokenizer,
        'max_seq_length': max_seq_length,
        'id_to_label': id_to_label,
    },
    'preprocessing.joblib',
)

# Save the model
# NOTE(review): joblib pickles the Keras model object, which ties the file to the library
# versions that wrote it. Keras's own model.save() is the documented format; the joblib
# artifact is kept here so existing consumers of 'trained_model.joblib' keep working.
joblib.dump(model, 'trained_model.joblib')

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\embedding
......vars
.........0
...layers\gru
......vars
...layers\gru\cell
......vars
.........0
.........1
.........2
...metrics\mean
......vars
.........0
.........1
...metrics\mean_metric_wrapper
......vars
.........0
.........1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2023-05-25 11:14:32         2211
metadata.json                                  2023-05-25 11:14:32           64
variables.h5                                   2023-05-25 11:14:32     22844592


['trained_model.joblib']

In [14]:
# Take user input and make predictions
user_input = input("Enter a description: ")
user_sequence = tokenizer.texts_to_sequences([user_input])

predicted_class_label = None  # stays None when the input cannot be classified

if not user_sequence[0]:
    # Blank input, or every word is unknown to the tokenizer: the encoded sequence is
    # empty, so padding would produce an all-zero row and the model would still return
    # some class. Report the problem instead of printing a meaningless prediction.
    print('No known words in the description; cannot predict a class.')
else:
    user_sequence = pad_sequences(user_sequence, maxlen=max_seq_length)
    prediction = model.predict(user_sequence)
    # `prediction` has one row per input; take the most probable class of the single row.
    predicted_class_id = int(np.argmax(prediction, axis=-1)[0])
    predicted_class_label = id_to_label[predicted_class_id]
    print(f'Predicted Class: {predicted_class_label}')

Enter a description:  i have new drug composition
Predicted Class: 042
