In [1]:
# Import Libraries
import pandas as pd
import numpy as np

Data Source: https://www.kaggle.com/datasets/falgunipatel19/biomedical-text-publication-classification

In [2]:
# Load the raw Kaggle CSV into a DataFrame.
# NOTE(review): rows displayed further down contain sequences such as "ï¬",
# which look like UTF-8 bytes decoded as latin-1 -- confirm the file's real
# encoding before relying on the text content.
DATA_PATH = 'data.csv'
data_df = pd.read_csv(DATA_PATH, encoding="latin-1")

In [3]:
# Give the three CSV columns explicit names: row id, class label, document text.
COLUMN_NAMES = ["Index", "Disease", "Text"]
data_df.columns = COLUMN_NAMES

In [4]:
# Sanity check: row count, column dtypes and non-null counts of the raw frame.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7570 entries, 0 to 7569
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Index    7570 non-null   int64 
 1   Disease  7570 non-null   object
 2   Text     7570 non-null   object
dtypes: int64(1), object(2)
memory usage: 177.5+ KB


In [5]:
# Keep only the label and the document text; the "Index" column is dropped.
model_columns = ["Disease", "Text"]
filtered_df = data_df[model_columns]

In [6]:
# Confirm that only the two selected columns remain and that none has nulls.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7570 entries, 0 to 7569
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Disease  7570 non-null   object
 1   Text     7570 non-null   object
dtypes: object(2)
memory usage: 118.4+ KB


In [7]:
# Work on an independent (deep) copy so later steps never write into filtered_df.
df = filtered_df.copy(deep=True)

In [8]:
# Preview the working frame (Jupyter renders the first and last rows).
df

Unnamed: 0,Disease,Text
0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï¬b...
3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,Thyroid_Cancer,This study aimed to investigate serum matrix ...
...,...,...
7565,Colon_Cancer,we report the case of a 24yearold man who pres...
7566,Colon_Cancer,among synchronous colorectal cancers scrcs rep...
7567,Colon_Cancer,the heterogeneity of cancer cells is generally...
7568,Colon_Cancer,"""adipogenesis is the process through which mes..."


In [9]:
# Number of documents per class in the target column.
df["Disease"].value_counts()

Thyroid_Cancer    2810
Colon_Cancer      2580
Lung_Cancer       2180
Name: Disease, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Fixed seed so the shuffle and the hold-out split are identical on every
# Restart & Run All; without it the final accuracy is not reproducible.
SEED = 42

# Shuffle the dataset.
# The shuffle starts from `filtered_df` rather than from `df` itself so that
# re-running this cell always yields the same `df` (the cell is idempotent).
df = filtered_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Hold out 10% of the rows (`val_df`) for the final check; the remaining 90%
# (`modal_df`) is what the model is built from.
# `stratify` keeps the class proportions the same in both parts.
modal_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=SEED,
    stratify=df["Disease"],
)

In [None]:
# Importing the required libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.layers import Dense, Input, GlobalAveragePooling1D
from tensorflow.keras.models import Model

# Settings a reader may want to tune, gathered in one place.
PRETRAINED_NAME = 'bert-base-uncased'
MAX_SEQ_LEN = 512      # token ids per document after truncation / padding
BATCH_SIZE = 64
EPOCHS = 1
LEARNING_RATE = 2e-5
MODEL_PATH = 'Cancer_Text Documents_Classification_Transformer.h5'

# From here on `df` is the 90% modelling subset produced by the hold-out split
# (no file is read here).
df = modal_df

# Splitting the modelling data into train and test sets; stratified so every
# class seen at test time is also present in the training labels.
train_data, test_data = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['Disease']
)

# Initializing the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)

# Tokenizing the text into id sequences, truncated to MAX_SEQ_LEN tokens
train_sequences = [
    tokenizer.encode(text, add_special_tokens=True, max_length=MAX_SEQ_LEN, truncation=True)
    for text in train_data['Text'].values
]
test_sequences = [
    tokenizer.encode(text, add_special_tokens=True, max_length=MAX_SEQ_LEN, truncation=True)
    for text in test_data['Text'].values
]

# Padding (with 0, pad_sequences' default value) to a uniform length
train_padded = tf.keras.preprocessing.sequence.pad_sequences(
    train_sequences, maxlen=MAX_SEQ_LEN, padding='post', truncating='post'
)
test_padded = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequences, maxlen=MAX_SEQ_LEN, padding='post', truncating='post'
)

# Encoding the target variable as integer class ids
encoder = LabelEncoder()
encoder.fit(train_data['Disease'].values)
train_labels = encoder.transform(train_data['Disease'].values)
test_labels = encoder.transform(test_data['Disease'].values)

# Computing the class weights to handle imbalanced data.
# The arguments are passed by keyword: current scikit-learn rejects the
# positional form. The dict is keyed by the actual class ids instead of
# relying on enumerate() order.
class_ids = np.unique(train_labels)
class_weights = compute_class_weight(class_weight='balanced', classes=class_ids, y=train_labels)
class_weight_dict = dict(zip(class_ids.tolist(), class_weights.tolist()))

# Loading the pre-trained BERT model
bert_model = TFBertModel.from_pretrained(PRETRAINED_NAME)

# Adding a classification layer on top of BERT's pooled output (index 1).
# NOTE(review): only token ids are fed in, no attention mask, so the padding
# positions are presumably attended to like real tokens -- confirm whether a
# mask input should be added (it would change the saved model's inputs).
input_layer = Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32)
bert_output = bert_model(input_layer)[1]
# One output unit per class known to the encoder.
output_layer = Dense(len(encoder.classes_), activation='softmax')(bert_output)
model = Model(inputs=input_layer, outputs=output_layer)

# Freezing the BERT layers so that only the classification layer is trained
for layer in bert_model.layers:
    layer.trainable = False

# Early stopping callback to prevent overfitting.
# With EPOCHS = 1 it can never trigger (patience=3); it only matters once
# EPOCHS is raised.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto')

# Compiling the model (`learning_rate` replaces the deprecated `lr` argument)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    metrics=['accuracy'],
)

# Training the model.
# NOTE(review): the test split doubles as validation data here, so the test
# accuracy below is not independent of early stopping once EPOCHS > 1.
history = model.fit(
    train_padded,
    train_labels,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(test_padded, test_labels),
    class_weight=class_weight_dict,
    callbacks=[early_stopping],
)

# Evaluating the model on the test set (`score` is the loss value)
score, acc = model.evaluate(test_padded, test_labels, batch_size=BATCH_SIZE)
print('Test accuracy:', acc)

# Saving the model
model.save(MODEL_PATH)

In [19]:
# Preview the 10% hold-out rows produced by the earlier train_test_split.
val_df

Unnamed: 0,Disease,Text
2698,Lung_Cancer,"""Discussion This study supports the feasibilit..."
4748,Thyroid_Cancer,"""The coronavirus disease COVID19 is now a wor..."
1271,Thyroid_Cancer,"""unrestricted use distribution and reproductio..."
4255,Colon_Cancer,neutrophils account for  of circulating leuko...
6452,Thyroid_Cancer,CD146 was originally identiï¬ed as a melanoma...
...,...,...
1876,Thyroid_Cancer,Pathwayspecific model estimation for improve...
1497,Thyroid_Cancer,properly citedIntroduction Endogenously produc...
4628,Thyroid_Cancer,"""Ovine pulmonary adenocarcinoma OPA is a neopl..."
681,Thyroid_Cancer,"""Optimizing Telemedicine Encounters for Oral a..."


In [None]:
from tensorflow.keras.models import load_model
from transformers import TFBertModel

# Loading the model.
# The saved graph contains a TFBertModel layer, which Keras cannot rebuild
# from the .h5 file unless the class is supplied through `custom_objects`.
model = load_model(
    'Cancer_Text Documents_Classification_Transformer.h5',
    custom_objects={'TFBertModel': TFBertModel},
)

# Sequence length the model was built with, read from its input shape so the
# encoding below always matches the saved model.
max_length = model.input_shape[1]

# Get the text column from the hold-out dataframe
val_input = val_df['Text'].tolist()

# Converting the hold-out texts to id sequences with the same BERT tokenizer
# call that produced the training sequences (BertTokenizer has no
# `texts_to_sequences` method).
val_seq = [
    tokenizer.encode(text, add_special_tokens=True, max_length=max_length, truncation=True)
    for text in val_input
]

# Padding the sequences exactly as the training data was padded
val_padded = tf.keras.preprocessing.sequence.pad_sequences(
    val_seq, maxlen=max_length, padding='post', truncating='post'
)

# Predicting the class probabilities for the hold-out input
pred = model.predict(val_padded)

# Decoding the most probable class id back to its disease label
pred_label = encoder.inverse_transform(np.argmax(pred, axis=-1))

# Get the actual labels from the hold-out dataframe
actual_label = val_df['Disease'].tolist()

# Calculate accuracy: fraction of rows whose predicted label equals the actual one
accuracy = float(np.mean(pred_label == np.asarray(actual_label)))

# Print the predicted and actual labels for (at most) the first 10 samples
for predicted, actual in zip(pred_label[:10], actual_label[:10]):
    print(f"Predicted: {predicted}, Actual: {actual}")

# Print the accuracy
print(f"Accuracy: {accuracy}")