In [4]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.6/37.6 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolc

In [2]:
!pip install 'transformers[torch]'


Collecting transformers[torch]
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting filelock (from transformers[torch])
  Downloading filelock-3.17.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers[torch])
  Downloading huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from transformers[torch])
  Downloading numpy-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pyyaml>=5.1 (from transformers[torch])
  Using cached PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers[torch])
  Using cached regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting requests (from transformers[torch])
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers[torch])
  Using

In [6]:
!pip install pandas

Collecting pandas
  Using cached pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
Downloading pytz-2025.1-py2.py3-none-any.whl (507 kB)
Downloading tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2025.1 tzdata-2025.1


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Summarizer class
class Summarizer:
    def __init__(self):
        self.tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
        self.model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)

    def summarize(self, text, max_length=512):
        inputs = self.tokenizer(text, return_tensors='pt', max_length=1024, truncation=True).to(device)
        summary_ids = self.model.generate(inputs['input_ids'], max_length=max_length, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, summarizer, max_length):
        # self.texts = [summarizer.summarize(text) for text in texts]
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load data
def load_data(file_path):
    df = pd.read_csv(file_path)
    texts = df['text'].tolist()
    labels = df['category'].tolist()

    # Encode categorical labels
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)

    return texts, labels, label_encoder

# Tokenizer and Summarizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# summarizer = Summarizer()

In [21]:
file_path = 'data/output_chunk_4.csv'  # Replace with your dataset file
texts, labels, label_encoder = load_data(file_path)

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

train_dataset = TextDataset(train_texts, train_labels, tokenizer, summarizer, max_length=512)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, summarizer, max_length=512)


In [22]:

# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(labels))).to(device)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Save at each epoch
    save_total_limit=1,  # Keep only the best model
    load_best_model_at_end=True,  # Load best model at the end
    metric_for_best_model='eval_loss',  # Use evaluation loss to determine best model
    greater_is_better=False,  # Lower eval loss is better
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,1.1335,1.264788
2,0.8659,0.786195
3,0.4625,0.729385
4,0.3952,0.832665
5,0.2533,0.848054
6,0.1031,0.950888
7,0.1253,1.005579
8,0.0111,1.186895
9,0.2097,1.162924
10,0.1742,1.255953


TrainOutput(global_step=18000, training_loss=0.10564262107704255, metrics={'train_runtime': 10240.3256, 'train_samples_per_second': 28.124, 'train_steps_per_second': 1.758, 'total_flos': 7.5786869735424e+16, 'train_loss': 0.10564262107704255, 'epoch': 50.0})

In [24]:
# Save the model and label encoder
model.save_pretrained("./bert_text_classification-50epoch-no-summary")
tokenizer.save_pretrained("./bert_text_classification-50epoch-no-summary")

# Save label encoder classes
pd.Series(label_encoder.classes_).to_csv("./bert_text_classification/label_classes.csv", index=False)

In [25]:
# Generate predictions and classification report
preds = trainer.predict(val_dataset)
pred_labels = preds.predictions.argmax(axis=1)
report = classification_report(val_labels, pred_labels, target_names=label_encoder.classes_)
print(report)

with open('./bert_text_classification/validation_classification_report.txt', 'w') as f:
    f.write(report)

print("Model training complete and saved to './bert_text_classification'")

               precision    recall  f1-score   support

         arts       0.87      0.81      0.84        85
        crime       0.85      0.88      0.87        86
     disaster       0.93      0.68      0.79        78
      economy       0.72      0.69      0.70        86
    education       0.90      0.91      0.91        79
environmental       0.86      0.92      0.89        73
       health       0.86      0.89      0.87        81
humanInterest       0.73      0.72      0.72        78
       labour       0.87      0.81      0.83        72
    lifestyle       0.76      0.89      0.82        90
        other       0.31      0.30      0.30        71
     politics       0.78      0.80      0.79        83
     religion       0.71      0.74      0.73        86
      science       0.80      0.89      0.84        74
       social       0.94      0.94      0.94        80
        sport       0.93      0.85      0.89        98
       unrest       0.72      0.78      0.75        76
      wea

## Bigger model 

In [4]:
# Tokenizer# Tokenizer
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

In [5]:
file_path = 'data/output_chunk_4.csv'  # Replace with your dataset file
texts, labels, label_encoder = load_data(file_path)
summarizer=None

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

train_dataset = TextDataset(train_texts, train_labels, tokenizer, summarizer, max_length=512)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, summarizer, max_length=512)


In [7]:
# Load model
model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=len(set(labels)))
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=50,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    fp16=True
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:

# Train
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.9807,2.93586
2,2.9857,2.942543
3,3.0949,2.903822
4,2.8285,2.903619
5,3.0432,2.925933
6,2.907,2.937465
7,2.8545,2.910893


TrainOutput(global_step=40320, training_loss=2.9469601040794737, metrics={'train_runtime': 4665.3213, 'train_samples_per_second': 61.732, 'train_steps_per_second': 61.732, 'total_flos': 3.757750392324096e+16, 'train_loss': 2.9469601040794737, 'epoch': 7.0})

In [10]:



# Save the model and tokenizer
model.save_pretrained("./roberta_text_classification")
tokenizer.save_pretrained("./roberta_text_classification")
pd.Series(label_encoder.classes_).to_csv("./roberta_text_classification/label_classes.csv", index=False)

# Generate predictions and classification report
preds = trainer.predict(val_dataset)
pred_labels = preds.predictions.argmax(axis=1)
report = classification_report(val_labels, pred_labels, target_names=label_encoder.classes_)
print(report)

with open('./roberta_text_classification/test_classification_report.txt', 'w') as f:
    f.write(report)

print("Model training complete and saved to './roberta_text_classification'")

               precision    recall  f1-score   support

         arts       0.00      0.00      0.00        69
        crime       0.00      0.00      0.00        79
     disaster       0.00      0.00      0.00        77
      economy       0.00      0.00      0.00        84
    education       0.00      0.00      0.00        96
environmental       0.00      0.00      0.00        88
       health       0.00      0.00      0.00        63
humanInterest       0.00      0.00      0.00        77
       labour       0.00      0.00      0.00        74
    lifestyle       0.00      0.00      0.00        87
        other       0.00      0.00      0.00        77
     politics       0.00      0.00      0.00        86
     religion       0.00      0.00      0.00        87
      science       0.00      0.00      0.00        82
       social       0.06      1.00      0.11        83
        sport       0.00      0.00      0.00        86
       unrest       0.00      0.00      0.00        70
      wea

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
b