# BERT

In [17]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [18]:
# Read the labelled training abstracts; the bare name below renders the frame
# as this cell's output.
data = pd.read_csv(filepath_or_buffer='train.csv')

data

Unnamed: 0,abstract,category
0,"In the last four years, daily deals have eme...",Applied
1,We propose a novel approach for density esti...,ML
2,"In this research, two-state Markov switching...",Applied
3,This article considers the estimation of the...,Applied
4,Markowitz's celebrated mean--variance portfo...,Applied
...,...,...
59,The use of Reinforcement Learning in real-wo...,ML
60,The paper introduces a penalized matrix esti...,ML
61,Cross-validation (CV) is widely used for tun...,Applied
62,We analyze the results of the German Team Ha...,Applied


In [19]:
# NLTK setup for the preprocessing step. Each import and download appears
# exactly once (the cell previously repeated the first half verbatim).
import string

import nltk

# Fetch the corpora/models used below: the stopword list, the punkt tokenizer
# models used by word_tokenize, and the WordNet data used by the lemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer

# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Holds the (stop_words, stemmer, lemmatizer) triple once it has been built.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared (stop_words, stemmer, lemmatizer) triple, building it on first use."""
    if 'resources' not in _PREPROCESS_CACHE:
        # Build everything first and store it in one step, so a failure while
        # loading NLTK data never leaves a half-filled cache behind.
        _PREPROCESS_CACHE['resources'] = (
            frozenset(stopwords.words('english')),
            SnowballStemmer('english'),
            WordNetLemmatizer(),
        )
    return _PREPROCESS_CACHE['resources']


def preprocess_text(text):
    """Normalise one abstract into a space-separated string of cleaned tokens.

    Steps, in order: strip punctuation, tokenize, drop English stopwords
    (case-insensitive), drop purely numeric tokens, Snowball-stem, WordNet-
    lemmatize the stem, and finally drop any token that is not alphanumeric.

    Args:
        text: The raw abstract. Anything that is not a ``str`` (for example a
            NaN from a missing CSV cell) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    stop_words, stemmer, lemmatizer = _preprocess_resources()

    kept_tokens = []
    for token in word_tokenize(text.translate(_PUNCT_TABLE)):
        # Stopword and digit filters look at the raw token, before stemming.
        if token.lower() in stop_words or token.isdigit():
            continue
        # Lemmatization runs on the already-stemmed token, as before.
        # NOTE(review): stemmed forms are later fed to a pretrained BERT
        # tokenizer; whether this much normalisation helps is worth evaluating.
        token = lemmatizer.lemmatize(stemmer.stem(token))
        if token.isalnum():
            kept_tokens.append(token)

    return ' '.join(kept_tokens)

# Overwrites the 'abstract' column in place with the preprocessed text, so
# re-running this cell would preprocess the already-processed strings again.
data['abstract'] = data['abstract'].apply(preprocess_text)
data

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,abstract,category
0,last four year daili deal emerg nowher becom m...,Applied
1,propos novel approach densiti estim exponenti ...,ML
2,research twostat markov switch model propos st...,Applied
3,articl consid estim number sever disabl peopl ...,Applied
4,markowitz celebr meanvari portfolio optim theo...,Applied
...,...,...
59,use reinforc learn realworld scenario strong l...,ML
60,paper introduc penal matrix estim procedur aim...,ML
61,crossvalid cv wide use tune model respect user...,Applied
62,analyz result german team handbal bundesliga t...,Applied


In [20]:
# Pull the model inputs (cleaned abstracts) and targets (category strings)
# out of the frame as plain Python lists.
abstracts, labels = data['abstract'].tolist(), data['category'].tolist()

In [21]:
# Hold out 10% of the examples for validation; random_state pins the shuffle.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    abstracts,
    labels,
    test_size=0.1,
    random_state=42,
)

In [22]:
# Tokenizer matching the 'bert-base-uncased' checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode both splits as padded, truncated PyTorch tensors. The two names are
# rebound from lists of strings to the tokenizer output, so re-running this
# cell on its own would feed the encoded output back into the tokenizer.
train_inputs = tokenizer(
    train_inputs,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
val_inputs = tokenizer(
    val_inputs,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [23]:
# Define Dataset class
class TextClassificationDataset(Dataset):
    """Pairs tokenized abstracts with their category labels for training.

    Args:
        encodings: Mapping from field name (e.g. 'input_ids') to an indexable
            collection with one row per example, either tensors or lists.
        labels: Sequence of category strings, one per example; every value
            must be a key of ``LABEL_TO_ID``.
    """

    # Category string -> integer class index handed to the model.
    LABEL_TO_ID = {'ML': 0, 'Applied': 1}

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors including a 'labels' entry.

        Raises:
            KeyError: If the example's label is not in ``LABEL_TO_ID``.
        """
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # Copy an existing tensor this way; torch.tensor(tensor)
                # emits a copy-construct UserWarning on every item.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        item['labels'] = torch.tensor(self.LABEL_TO_ID[self.labels[idx]])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)


In [24]:
# Wrap each tokenized split together with its category strings.
train_dataset = TextClassificationDataset(encodings=train_inputs, labels=train_labels)
val_dataset = TextClassificationDataset(encodings=val_inputs, labels=val_labels)

In [25]:
# Batches of 8; only the training loader shuffles between epochs.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=8,
    shuffle=False,
)

In [26]:
# Pretrained BERT with a fresh classification head. The class count is read
# straight from the training frame so it does not depend on the notebook-level
# `labels` name still holding the list of category strings.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(set(data['category'])),
)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set to the old implementation's defaults (1e-6 and 0.0)
# so the optimisation behaviour stays as close as possible to the original.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Fine-tune the model for a fixed number of epochs. After each epoch, print
# the training loss summed over batches and the accuracy on the validation set.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 20
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc="Epoch {}".format(epoch+1)):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels so the notebook-level `labels` list of category
        # strings is not overwritten by a tensor.
        batch_labels = batch['labels'].to(device)
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print("Training Loss:", total_loss)

    # ---- validation pass (no gradients, eval mode) ----
    model.eval()
    val_correct = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            val_correct += torch.sum(preds == batch_labels).item()
    # Correct predictions divided by the number of validation examples.
    val_accuracy = val_correct / len(val_dataset)
    print("Validation Accuracy:", val_accuracy)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 1: 100%|██████████| 8/8 [00:03<00:00,  2.49it/s]


Training Loss: 5.731084644794464
Validation Accuracy: 0.42857142857142855


Epoch 2: 100%|██████████| 8/8 [00:03<00:00,  2.47it/s]


Training Loss: 5.4786253571510315
Validation Accuracy: 0.8571428571428571


Epoch 3: 100%|██████████| 8/8 [00:03<00:00,  2.41it/s]


Training Loss: 4.634545475244522
Validation Accuracy: 0.42857142857142855


Epoch 4: 100%|██████████| 8/8 [00:03<00:00,  2.39it/s]


Training Loss: 4.850523114204407
Validation Accuracy: 0.7142857142857143


Epoch 5: 100%|██████████| 8/8 [00:03<00:00,  2.36it/s]


Training Loss: 3.4058632254600525
Validation Accuracy: 0.7142857142857143


Epoch 6: 100%|██████████| 8/8 [00:03<00:00,  2.35it/s]


Training Loss: 3.2143320590257645
Validation Accuracy: 0.7142857142857143


Epoch 7: 100%|██████████| 8/8 [00:03<00:00,  2.39it/s]


Training Loss: 1.6862281784415245
Validation Accuracy: 0.42857142857142855


Epoch 8: 100%|██████████| 8/8 [00:03<00:00,  2.44it/s]


Training Loss: 0.9242822378873825
Validation Accuracy: 0.7142857142857143


Epoch 9: 100%|██████████| 8/8 [00:03<00:00,  2.47it/s]


Training Loss: 0.7412024885416031
Validation Accuracy: 0.7142857142857143


Epoch 10: 100%|██████████| 8/8 [00:03<00:00,  2.51it/s]


Training Loss: 0.4533895906060934
Validation Accuracy: 0.5714285714285714


Epoch 11: 100%|██████████| 8/8 [00:03<00:00,  2.54it/s]


Training Loss: 0.6458893399685621
Validation Accuracy: 0.7142857142857143


Epoch 12: 100%|██████████| 8/8 [00:03<00:00,  2.55it/s]


Training Loss: 0.1746988669037819
Validation Accuracy: 0.7142857142857143


Epoch 13: 100%|██████████| 8/8 [00:03<00:00,  2.57it/s]


Training Loss: 0.12027358263731003
Validation Accuracy: 0.7142857142857143


Epoch 14: 100%|██████████| 8/8 [00:03<00:00,  2.58it/s]


Training Loss: 0.08355020638555288
Validation Accuracy: 0.7142857142857143


Epoch 15: 100%|██████████| 8/8 [00:03<00:00,  2.61it/s]


Training Loss: 0.07767543569207191
Validation Accuracy: 0.7142857142857143


Epoch 16: 100%|██████████| 8/8 [00:03<00:00,  2.61it/s]


Training Loss: 0.053648859495297074
Validation Accuracy: 0.7142857142857143


Epoch 17: 100%|██████████| 8/8 [00:03<00:00,  2.62it/s]


Training Loss: 0.05193233955651522
Validation Accuracy: 0.7142857142857143


Epoch 18: 100%|██████████| 8/8 [00:03<00:00,  2.62it/s]


Training Loss: 0.04113108338788152
Validation Accuracy: 0.7142857142857143


Epoch 19: 100%|██████████| 8/8 [00:03<00:00,  2.62it/s]


Training Loss: 0.04379835142754018
Validation Accuracy: 0.7142857142857143


Epoch 20: 100%|██████████| 8/8 [00:03<00:00,  2.62it/s]


Training Loss: 0.033456545788794756
Validation Accuracy: 0.7142857142857143


In [28]:
# Read the unlabelled test abstracts.
test_df = pd.read_csv(filepath_or_buffer='test.csv')

In [29]:
# Apply the same cleaning used for the training abstracts. The column is
# overwritten in place, so re-running this cell would preprocess the
# already-processed strings again.
test_df['abstract'] = test_df['abstract'].apply(preprocess_text)
test_df

Unnamed: 0,abstract
0,probabilist princip compon analysi ppca seek l...
1,sensorbas degrad signal measur accumul damag e...
2,generic identif problem decid whether stochast...
3,introduc new class lower bound log partit func...
4,regular power techniqu extract use inform nois...
...,...
3931,goal crossdomain object match cdom find corres...
3932,sequenti predict problem imit learn futur obse...
3933,minim relat inertia statist group respect iner...
3934,u presidenti elect cycl mark debut internetbas...


In [30]:
# Encode all cleaned test abstracts in one call, padded and truncated, as
# PyTorch tensors.
test_abstracts = test_df['abstract'].tolist()
encoded_inputs = tokenizer(
    test_abstracts,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [31]:
class TestTextClassificationDataset(Dataset):
    """Label-free dataset over tokenized inputs, used for inference.

    Args:
        encodings: Mapping from field name to an indexable collection with one
            row per example; must contain an 'input_ids' entry.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return row ``idx`` of every field in the encodings."""
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        """Number of examples, taken from the 'input_ids' field."""
        # Key lookup rather than attribute access, so a plain dict of tensors
        # works as well as the tokenizer's own output object.
        return len(self.encodings['input_ids'])


# Keep the original row order (no shuffling) so predictions line up with test_df.
test_dataset = TestTextClassificationDataset(encodings=encoded_inputs)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=8,
    shuffle=False,
)

In [32]:
# Predict a category for every test abstract and write the result to output.csv.
# The device is chosen the same way as for training, so this cell also runs on
# a machine without a GPU instead of failing on a hard-coded 'cuda'.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        # tolist() yields plain Python ints, which are used as dict keys below.
        predictions.extend(preds.tolist())

# Convert numerical predictions back to category labels. This must stay the
# inverse of the label-to-id mapping used by TextClassificationDataset.
label_mapping = {0: 'ML', 1: 'Applied'}
predicted_categories = [label_mapping[pred] for pred in predictions]

# Build a one-column frame of predictions, in test-set row order.
df = pd.DataFrame(predicted_categories, columns=['prediction'])

# Save the predictions without the index column.
df.to_csv("output.csv", index=False)

df

Unnamed: 0,prediction
0,ML
1,Applied
2,ML
3,ML
4,ML
...,...
3931,ML
3932,Applied
3933,Applied
3934,Applied


This model achieves 75% accuracy on the test data.