In [1]:
# Import Libraries
import pandas as pd
import numpy as np

Data Source: https://www.kaggle.com/datasets/falgunipatel19/biomedical-text-publication-classification

In [4]:
# Read datasets
data_df = pd.read_csv('data.csv', encoding="latin-1")

In [6]:
data_df.columns = ["Index", "Disease", "Text"]

In [7]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7570 entries, 0 to 7569
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Index    7570 non-null   int64 
 1   Disease  7570 non-null   object
 2   Text     7570 non-null   object
dtypes: int64(1), object(2)
memory usage: 177.5+ KB


In [8]:
# Filter the dataset based on specific columns
filtered_df = data_df[["Disease", "Text"]]

In [9]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7570 entries, 0 to 7569
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Disease  7570 non-null   object
 1   Text     7570 non-null   object
dtypes: object(2)
memory usage: 118.4+ KB


In [10]:
df = filtered_df.copy()

In [12]:
df

Unnamed: 0,Disease,Text
0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï¬b...
3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,Thyroid_Cancer,This study aimed to investigate serum matrix ...
...,...,...
7565,Colon_Cancer,we report the case of a 24yearold man who pres...
7566,Colon_Cancer,among synchronous colorectal cancers scrcs rep...
7567,Colon_Cancer,the heterogeneity of cancer cells is generally...
7568,Colon_Cancer,"""adipogenesis is the process through which mes..."


In [13]:
df["Disease"].value_counts()

Thyroid_Cancer    2810
Colon_Cancer      2580
Lung_Cancer       2180
Name: Disease, dtype: int64

In [18]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load your dataset into a pandas dataframe called df
df = df

# Shuffle the dataset
df = df.sample(frac=1).reset_index(drop=True)

# Split the dataset into train and test sets
modal_df, val_df = train_test_split(df, test_size=0.1)

In [20]:
# Importing the required libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Flatten


# Reading the data into a pandas dataframe
df = modal_df

df['Text'] = df['Text'].astype(str)
df['Disease'] = df['Disease'].astype(str)

# Splitting the data into train and test sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Tokenizing the text and converting it to sequences
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['Text'].values)
train_sequences = tokenizer.texts_to_sequences(train_data['Text'].values)
test_sequences = tokenizer.texts_to_sequences(test_data['Text'].values)

# Padding the sequences to ensure uniform length
train_padded = pad_sequences(train_sequences, maxlen=500, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=500, padding='post', truncating='post')

# Defining the vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 for the <PAD> token

# Defining the embedding dimension
embedding_dim = 100  # You can adjust this value based on your data and model performance

# Defining the maximum length of the input sequences
max_length = 500  # You can adjust this value based on your data and model performance

# Encoding the target variable
encoder = LabelEncoder()
encoder.fit(train_data['Disease'].values)
train_labels = encoder.transform(train_data['Disease'].values)
test_labels = encoder.transform(test_data['Disease'].values)

# Computing the class weights to handle imbalanced data
class_weights = compute_class_weight('balanced', np.unique(train_labels), train_labels)
class_weight_dict = dict(enumerate(class_weights))

num_classes = len(set(df['Disease']))

# Defining the model
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(num_classes, activation='softmax')) # Changed the number of units in the output layer to num_classes

# Early stopping callback to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto')

# Compiling the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training the model
history = model.fit(train_padded, train_labels, epochs=20, batch_size=64, validation_data=(test_padded, test_labels), class_weight=class_weight_dict, callbacks=[early_stopping])

# Evaluating the model on the test set
score, acc = model.evaluate(test_padded, test_labels, batch_size=64)
print('Test accuracy:', acc)

# Saving the model
model.save('Cancer_Text_Documents_Classification_ANN_your_model.h5')


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 13: early stopping
Test accuracy: 0.9934800267219543


In [19]:
val_df 

Unnamed: 0,Disease,Text
2698,Lung_Cancer,"""Discussion This study supports the feasibilit..."
4748,Thyroid_Cancer,"""The coronavirus disease COVID19 is now a wor..."
1271,Thyroid_Cancer,"""unrestricted use distribution and reproductio..."
4255,Colon_Cancer,neutrophils account for  of circulating leuko...
6452,Thyroid_Cancer,CD146 was originally identiï¬ed as a melanoma...
...,...,...
1876,Thyroid_Cancer,Pathwayspecific model estimation for improve...
1497,Thyroid_Cancer,properly citedIntroduction Endogenously produc...
4628,Thyroid_Cancer,"""Ovine pulmonary adenocarcinoma OPA is a neopl..."
681,Thyroid_Cancer,"""Optimizing Telemedicine Encounters for Oral a..."


In [21]:
from tensorflow.keras.models import load_model

# Loading the model
model = load_model('Cancer_Text_Documents_Classification_ANN_your_model.h5')

# Get the content column from the validation dataframe
val_input = val_df['Text'].tolist()

# Converting validation input to sequence
val_seq = tokenizer.texts_to_sequences(val_input)

# Padding the sequence
val_padded = pad_sequences(val_seq, maxlen=max_length, padding='post', truncating='post')

# Predicting the EMISSION_CLASS for validation input
pred = model.predict(val_padded)

# Decoding the predicted label
pred_label = encoder.inverse_transform(np.argmax(pred, axis=-1))

# Get the actual labels from the validation dataframe
actual_label = val_df['Disease'].tolist()

# Calculate accuracy
accuracy = sum([1 if pred_label[i] == actual_label[i] else 0 for i in range(len(pred_label))]) / len(pred_label)

# Print the predicted and actual labels for the first 10 samples
for i in range(10):
    print(f"Predicted: {pred_label[i]}, Actual: {actual_label[i]}")

# Print the accuracy
print(f"Accuracy: {accuracy}")

Predicted: Lung_Cancer, Actual: Lung_Cancer
Predicted: Thyroid_Cancer, Actual: Thyroid_Cancer
Predicted: Thyroid_Cancer, Actual: Thyroid_Cancer
Predicted: Colon_Cancer, Actual: Colon_Cancer
Predicted: Thyroid_Cancer, Actual: Thyroid_Cancer
Predicted: Thyroid_Cancer, Actual: Thyroid_Cancer
Predicted: Thyroid_Cancer, Actual: Thyroid_Cancer
Predicted: Thyroid_Cancer, Actual: Thyroid_Cancer
Predicted: Lung_Cancer, Actual: Lung_Cancer
Predicted: Lung_Cancer, Actual: Lung_Cancer
Accuracy: 0.9956011730205279
