In [14]:
!pip install evaluate




[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import torch

# torch.cuda.current_device() raises when no CUDA device can be initialised,
# so it is only queried after confirming that CUDA is usable.
cuda_available = torch.cuda.is_available()
print(cuda_available)   # True if CUDA is available
if cuda_available:
    print(torch.cuda.current_device())  # index of the currently selected GPU
else:
    print("No CUDA device available")
print(torch.cuda.device_count()) # Number of GPUs available

In [1]:
from datasets import load_dataset


# Seed handed to train_test_split so the train/test partition is identical on every run
split_seed = 42



  from .autonotebook import tqdm as notebook_tqdm


#### 1. Load your dataset

In [None]:

# NOTE(review): this cell has no execution count and looks like an earlier draft;
# both names it assigns are reassigned by the split cell that follows.
# Here the whole training CSV is loaded under the name `test_dataset`, and
# `train_dataset` is derived from it by renaming two columns.
train_path = "../data/train.csv"
test_dataset = load_dataset("csv", data_files=train_path)
train_dataset = (
    test_dataset
    .rename_column("rule_violation", "label")
    .rename_column("body", "text")
)

In [2]:

# Read the labelled CSV, then hold out 20% of its rows for evaluation.
# The split is seeded with split_seed so it is reproducible.
train_path = "../data/train.csv"
dataset = load_dataset("csv", data_files=train_path)["train"].train_test_split(
    test_size=0.2,
    seed=split_seed,
)
train_dataset, test_dataset = dataset["train"], dataset["test"]


In [3]:
# Old column name -> new column name. "text" is the field tokenize_function
# reads later; "label" is presumably the target column the Trainer picks up
# (verify against the Trainer documentation).
COLUMN_RENAMES = {"rule_violation": "label", "body": "text"}


def rename_model_columns(split):
    """Return `split` with the columns listed in COLUMN_RENAMES renamed.

    A column that already carries its new name is left alone, so re-running
    this cell after it has executed once no longer fails on a missing source
    column.

    Raises:
        ValueError: if neither the original nor the renamed column is present.
    """
    for old_name, new_name in COLUMN_RENAMES.items():
        if old_name in split.column_names:
            split = split.rename_column(old_name, new_name)
        elif new_name not in split.column_names:
            raise ValueError(
                f"expected column {old_name!r} or {new_name!r}, found {split.column_names}"
            )
    return split


train_dataset = rename_model_columns(train_dataset)
test_dataset = rename_model_columns(test_dataset)


In [4]:
# Quick look at the training split: summary, first row, then the column names,
# each on its own line.
print(train_dataset, train_dataset[0], train_dataset.column_names, sep="\n")



Dataset({
    features: ['row_id', 'text', 'rule', 'subreddit', 'positive_example_1', 'positive_example_2', 'negative_example_1', 'negative_example_2', 'label'],
    num_rows: 1623
})
{'row_id': 1925, 'text': "unethical but... make a SS# but state name and address and all perfectly.  If the IRS asks, just say that's the number you got, let the IRS sort if out for you.", 'rule': 'No legal advice: Do not offer or request legal advice.', 'subreddit': 'personalfinance', 'positive_example_1': 'Dear dumbass, they stole $1700 dollars from him. I would have that person arrested regardless of who they are and what the money was used for. They committed fraud and identify theft against their own child, the only way someone with this kind of mentality will learn is by getting charged with a crime. ', 'positive_example_2': "If she's been representing herself as a guest, and she wouldn't be permitted as your subtenant, then the way forward is clear, since she's not a tenant. You can evict her immed

#### 2. Load a Tokenizer


In [6]:
from transformers import BertTokenizer

# Tokenizer loaded from the same "bert-base-uncased" checkpoint name used for the model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(example):
    """Tokenize the "text" field of `example` with the module-level tokenizer.

    Sequences are truncated and padded to exactly 128 tokens
    (padding="max_length", max_length=128). Only "text" is read here; no
    other column is passed to the tokenizer.
    """
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(example["text"], **encoding_options)


# batched=True hands tokenize_function a batch of rows per call instead of one row.
train_dataset_tokenized = train_dataset.map(function=tokenize_function, batched=True)
test_dataset_tokenized = test_dataset.map(function=tokenize_function, batched=True)


Map: 100%|██████████| 1623/1623 [00:01<00:00, 1232.24 examples/s]
Map: 100%|██████████| 406/406 [00:00<00:00, 1323.08 examples/s]


In [7]:
# Inspect the first tokenized training row. The recorded output shows that the
# original CSV columns (row_id, text, rule, ...) are still present after map().
print(train_dataset_tokenized[0])

{'row_id': 1925, 'text': "unethical but... make a SS# but state name and address and all perfectly.  If the IRS asks, just say that's the number you got, let the IRS sort if out for you.", 'rule': 'No legal advice: Do not offer or request legal advice.', 'subreddit': 'personalfinance', 'positive_example_1': 'Dear dumbass, they stole $1700 dollars from him. I would have that person arrested regardless of who they are and what the money was used for. They committed fraud and identify theft against their own child, the only way someone with this kind of mentality will learn is by getting charged with a crime. ', 'positive_example_2': "If she's been representing herself as a guest, and she wouldn't be permitted as your subtenant, then the way forward is clear, since she's not a tenant. You can evict her immediately, or perhaps with a courtesy 3 day notice, and if she doesn't clear out you can go to court to have an eviction ordered.", 'negative_example_1': 'Why not just ask for the gun and

#### 3. Load a Pre-trained Model

In [8]:
from transformers import BertForSequenceClassification

# Pre-trained BERT with a sequence-classification head. num_labels=2 makes it a
# two-class classifier (adjust num_labels for other label sets). Per the
# recorded output, the classifier weights are newly initialised and must be trained.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
print(model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

#### 4. Training Preparation

In [12]:
from transformers import TrainingArguments, Trainer
import evaluate
# Accuracy metric loaded through the `evaluate` library; used by compute_metrics.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    index of the largest logit along the last axis. Returns whatever
    `accuracy.compute` returns for those predictions and labels.
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    return accuracy.compute(references=labels, predictions=predicted_classes)

# Fine-tuning configuration; any option not listed keeps its TrainingArguments default.
training_args = TrainingArguments(
    # output locations
    output_dir="./results",
    logging_dir="./logs",
    # save once per epoch, report the training loss every 50 steps
    save_strategy="epoch",
    logging_steps=50,
    # optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Bundle the model, training arguments, tokenized splits and metric function
# into a Trainer.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer.__init__ is deprecated in recent transformers releases and emits a
# FutureWarning (presumably the warning recorded under this cell).
# NOTE(review): `processing_class` needs transformers >= 4.46 — confirm the
# installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=test_dataset_tokenized,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)



  trainer = Trainer(


#### 5. Train the Model

In [13]:

# Fine-tune the model. As the last expression in the cell, the value returned
# by train() is displayed (a TrainOutput in the recorded run).
trainer.train()



Step,Training Loss
50,0.6715
100,0.5875
150,0.5526
200,0.513
250,0.3864
300,0.3978
350,0.3436
400,0.3629
450,0.2282
500,0.2418




TrainOutput(global_step=609, training_loss=0.40186620110948684, metrics={'train_runtime': 3456.1214, 'train_samples_per_second': 1.409, 'train_steps_per_second': 0.176, 'total_flos': 320271932136960.0, 'train_loss': 0.40186620110948684, 'epoch': 3.0})

#### 6. Evaluate the Model

In [None]:
# Evaluate on the eval_dataset given to the Trainer (the held-out split) and
# print the returned metrics.
print(trainer.evaluate())

#### 7. Make Predictions for the Test File