In [1]:
!pip install transformers



In [2]:
from torch.utils.data import Dataset

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('/kaggle/input/biomedical-text-publication-classification/alldata_1_for_kaggle.csv',encoding="latin1")

In [5]:
df = df.drop('Unnamed: 0',axis=1)
df = df.rename({'0':'Cancer','a':'Research paper text'},axis=1)
df.head()

Unnamed: 0,Cancer,Research paper text
0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï¬b...
3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,Thyroid_Cancer,This study aimed to investigate serum matrix ...


In [6]:
cancer_types = df['Cancer'].value_counts()

In [7]:
cancer_types

Cancer
Thyroid_Cancer    2810
Colon_Cancer      2580
Lung_Cancer       2180
Name: count, dtype: int64

In [8]:
import nltk
from nltk.corpus import stopwords         # to get a list of stopwords
from nltk.tokenize import word_tokenize   # for tokenization of words
from nltk.stem import WordNetLemmatizer   # to find root words
from string import punctuation            # to get a list of puncutation symbols



In [9]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

In [10]:
# Cleaning the data
def remove_spaces(data):
    clean_text = data.replace("\\n"," ").replace("\t"," ").replace("\\"," ")
    return clean_text

# defining the function for removing stopwords
stopword = stopwords.words("english") # gives a list of stopwords

def clean_text(data):
    token = word_tokenize(data)
    clean_text = [i.lower() for i in token if (i not in punctuation) 
                  and (i.lower() not in stopword) and (i.isalpha()) and (len(i) > 2)]
    return clean_text

# defining the function for getting root words
def lemmatization(data):
    lem = WordNetLemmatizer()
    lst1 = []
    for i in data:
        lem_words = lem.lemmatize(i)
        lst1.append(lem_words)
    return " ".join(lst1)

In [11]:
df['Research paper text'] = df['Research paper text'].apply(remove_spaces)
df['Research paper text'] = df['Research paper text'].apply(clean_text)
df['Research paper text'] = df['Research paper text'].apply(lemmatization)

In [12]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df['Cancer'] = label_encoder.fit_transform(df['Cancer'])

In [13]:
df.head()

Unnamed: 0,Cancer,Research paper text
0,2,thyroid surgery child single institution osama...
1,2,adopted strategy used prior year based four ex...
2,2,coronary arterybypass grafting thrombosis muta...
3,2,solitary plasmacytoma skull uncommon clinical ...
4,2,study aimed investigate serum matrix metallopr...


In [14]:
from sklearn.model_selection import train_test_split

# Assuming df is your dataframe
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [15]:
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder

class TextClassificationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=256):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_encoder = LabelEncoder()
        self.labels = self.label_encoder.fit_transform(dataframe['Cancer'])

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['Research paper text']
        inputs = self.tokenizer.encode_plus(
            text, 
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': label
        }

tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
tokenizer.pad_token = tokenizer.eos_token
MAX_LEN = 200
train_dataset = TextClassificationDataset(train_df, tokenizer, MAX_LEN)
test_dataset = TextClassificationDataset(test_df, tokenizer, MAX_LEN)

BATCH_SIZE = 8
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

In [17]:
import torch.nn as nn
from transformers import GPT2Model

class GPT2ForClassification(nn.Module):
    def __init__(self, num_labels):
        super(GPT2ForClassification, self).__init__()
        self.gpt2 = GPT2Model.from_pretrained('gpt2-medium')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.gpt2.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt2(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[0][:, -1]  
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [18]:
import torch
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

# Set up GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GPT2ForClassification(num_labels=3)
model.to(device)
WARMUP_STEPS = 500
MAX_SEQ_LEN = 40
# Hyperparameters
EPOCHS = 3
LEARNING_RATE = 2e-5
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
loss_function = CrossEntropyLoss()

# Training loop
model.train()
for epoch in range(EPOCHS):
    total_loss = 0
    for batch in train_dataloader:
        batch_ids = batch['input_ids'].to(device)
        batch_mask = batch['attention_mask'].to(device)
        batch_labels = batch['label'].to(device)

        optimizer.zero_grad()

        logits = model(batch_ids, attention_mask=batch_mask)
        loss = loss_function(logits, batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {avg_train_loss:.4f}")

Downloading model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Epoch 1/3, Train Loss: 0.4414
Epoch 2/3, Train Loss: 0.0878
Epoch 3/3, Train Loss: 0.0594


In [19]:
from sklearn.metrics import accuracy_score

def evaluate(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            batch_ids = batch['input_ids'].to(device)
            batch_mask = batch['attention_mask'].to(device)
            batch_labels = batch['label'].to(device)
            
            logits = model(batch_ids, attention_mask=batch_mask)
            preds = torch.argmax(logits, dim=1)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    return accuracy

test_accuracy = evaluate(model, test_dataloader)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.9769


In [20]:
def predict(text, model, tokenizer, device):
    # Set the model to evaluation mode
    model.eval()

    # Tokenize the input text
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=256,  # or any other max length you specified
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Get the model's predictions
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask)
    
    # Get the predicted class
    _, predicted_class = torch.max(logits, dim=1)
    
    # Convert the predicted class to its corresponding label
    label = label_encoder.inverse_transform(predicted_class.cpu().numpy())[0]  # Assuming you have the label encoder used during training
    
    return label

# Test the prediction function
text_query = "Thyroid surgery in children in a single institution from Osama Ibrahim Almosallama Ali Aseerib "
predicted_label = predict(text_query.lower(), model, tokenizer, device)
print(f"The predicted class for the text is: {predicted_label}")

The predicted class for the text is: Thyroid_Cancer
