## Classification model training

3. Train a suitable NLP model for text classification (use Hugging Face)
4. Evaluate the model's performance on the validation set using appropriate metrics such as accuracy, precision, recall, and F1-score.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification, create_optimizer

In [4]:
# Load the dataset
data = pd.read_csv(
    '/Users/mariapazoliva/PycharmProjects/ArticlesClassifier/jupyter_notebooks/data_arxiv_articles/final_arxiv_articles.csv')

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(data, test_size=0.1, random_state=42)

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    return tokenizer(examples['clean_abstract'], truncation=True, padding=True, max_length=512)


# Tokenize the text
train_encodings = tokenize_function(train_df.to_dict(orient='list'))
val_encodings = tokenize_function(val_df.to_dict(orient='list'))

# Convert categories into numerical labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df['category'])
val_labels = label_encoder.transform(val_df['category'])

In [5]:
print("LabelEncoder classes:", label_encoder.classes_)

LabelEncoder classes: ['cs' 'econ' 'eess' 'math' 'phys' 'q-bio' 'q-fin' 'stat']


In [26]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

# Adjust batch size according to your system's capabilities
batch_size = 16
train_dataset = train_dataset.shuffle(len(train_df)).batch(batch_size)
val_dataset = val_dataset.batch(batch_size)

In [27]:
# Load BERT model pre-trained on uncased text
model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased',
                                                             num_labels=len(label_encoder.classes_))

# Setting up the optimizer and loss
num_train_steps = len(train_df) // batch_size * 3  # For 3 epochs
optimizer, _ = create_optimizer(init_lr=5e-5, num_train_steps=num_train_steps, num_warmup_steps=0)

# Compile the model explicitly specifying the loss function
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Train the model
history = model.fit(train_dataset, epochs=3, validation_data=val_dataset)
# Save the model to a directory
model.save_pretrained('base_model')

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [28]:
# Predict on validation dataset
val_predictions = model.predict(val_dataset)

# Get the class with the highest probability for each instance
val_preds = np.argmax(val_predictions.logits, axis=1)



In [29]:
# Calculate accuracy
accuracy = accuracy_score(val_labels, val_preds)
print(f"Accuracy: {accuracy:.4f}")

# Calculate precision, recall, and F1-score
precision, recall, f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='weighted')
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# For a more detailed report, use classification_report
report = classification_report(val_labels, val_preds, target_names=label_encoder.classes_)
print(report)

Accuracy: 0.6375
Precision: 0.6171
Recall: 0.6375
F1-Score: 0.6179
              precision    recall  f1-score   support

          cs       0.33      0.12      0.18         8
        econ       0.88      0.88      0.88         8
        eess       0.62      0.83      0.71        12
        math       0.50      0.71      0.59         7
        phys       0.64      0.58      0.61        12
       q-bio       0.73      0.79      0.76        14
       q-fin       0.86      0.86      0.86         7
        stat       0.40      0.33      0.36        12

    accuracy                           0.64        80
   macro avg       0.62      0.64      0.62        80
weighted avg       0.62      0.64      0.62        80

