In [1]:
import pandas as pd
import transformers as hf
from sklearn import model_selection, metrics
import datasets
import torch as th
from torch import nn
from torch.utils import data
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pin the global RNG state of both libraries up front so a fresh-kernel rerun
# starts from the same state. torch is seeded with 42 and numpy with 28.
TORCH_SEED, NUMPY_SEED = 42, 28
th.manual_seed(TORCH_SEED)
np.random.seed(NUMPY_SEED)

In [3]:
# Checkpoint name used for the tokenizer here and for the model loaded in a later cell.
ckpt = "distilbert-base-uncased"
# Sentiment strings -> integer class ids used as training targets.
LABEL_MAP = {"Positive": 1, "Negative": 0}

df = pd.read_csv("/data/crypto_reddit_sentiment.csv")

# Series.map leaves NaN for every value that is not a key of LABEL_MAP (and for
# missing values). A single NaN would turn the whole label column into floats,
# so such rows are filtered out explicitly instead of leaking into training.
mapped_labels = df["Sentiment"].map(LABEL_MAP)
usable = df["Comment Text"].notna() & mapped_labels.notna()
if not usable.any():
    raise ValueError(
        "No usable rows: every row has missing text or a sentiment outside "
        f"{sorted(LABEL_MAP)}"
    )
n_dropped = int((~usable).sum())
if n_dropped:
    print(f"Dropping {n_dropped} row(s) with missing text or an unmapped sentiment label")

# On clean data these are the same values (and index) as the raw columns.
texts = df.loc[usable, "Comment Text"].astype(str)
labels = mapped_labels[usable].astype(int)

# No random_state is passed, so the shuffle draws from numpy's global RNG,
# which is seeded in the cell above.
x_train, x_test, y_train, y_test = model_selection.train_test_split(texts, labels, test_size=0.25)
tokenizer = hf.AutoTokenizer.from_pretrained(ckpt)

In [4]:
class Dataset(data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    All texts are tokenized once, up front, in ``__init__``; ``__getitem__``
    then only slices the stored encodings and wraps them in tensors.

    Args:
        texts: Sequence of strings to tokenize (a single string is treated as
            a one-element sequence).
        labels: Sequence of labels, one per text, in the same order.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding="max_length", truncation=True)`` whose
            result exposes ``.items()`` yielding ``(name, per-example values)``.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer):
        # Normalise to plain lists so that indexing in __getitem__ is always
        # positional (a pandas Series would otherwise be indexed by label).
        texts = [texts] if isinstance(texts, str) else list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.encodings = tokenizer(texts, padding="max_length", truncation=True)
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``: every encoding field plus ``'labels'``."""
        item = {key: th.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = th.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, i.e. the number of labels."""
        return len(self.labels)

# Build one dataset per split; each pandas Series is converted to a plain list first.
# No DataLoader is created here: both datasets are handed directly to hf.Trainer
# in a later cell, which receives the batch sizes through TrainingArguments.
train_ds, test_ds = (
    Dataset(split_texts.tolist(), split_labels.tolist(), tokenizer)
    for split_texts, split_labels in ((x_train, y_train), (x_test, y_test))
)

In [5]:
def compute_metrics(pred):
    """Turn a prediction bundle into accuracy / precision / recall.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores; the predicted class is the argmax over
            the last axis). Presumably the EvalPrediction that hf.Trainer
            passes to this callback; confirm against the installed version.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'`` and ``'recall'``.
        Precision and recall use ``average='binary'``; the F-score and support
        also returned by scikit-learn are discarded.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    prf = metrics.precision_recall_fscore_support(y_true, y_hat, average='binary')
    scores = {'accuracy': metrics.accuracy_score(y_true, y_hat)}
    scores['precision'] = prf[0]
    scores['recall'] = prf[1]
    return scores

# Two-class sequence-classification head on top of the pretrained checkpoint.
model = hf.AutoModelForSequenceClassification.from_pretrained(ckpt, num_labels=2)

trainer_args = hf.TrainingArguments(
    output_dir="./reddit_sentiment_model",
    learning_rate=5e-5,
    num_train_epochs=10,
    warmup_steps=20,
    overwrite_output_dir=True,
    # NOTE(review): 0.8 is far above commonly used weight-decay values
    # (e.g. 0.01) — confirm this is intentional.
    weight_decay=0.8,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    logging_dir="./coding_challenge",
    # Log, evaluate and checkpoint once per epoch. The evaluation and save
    # strategies must match for load_best_model_at_end to work.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Validation loss stops improving well before the last epoch while the
    # training loss keeps falling, so keep the checkpoint with the lowest
    # eval loss instead of whatever the final epoch produced.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Cap the number of checkpoints kept on disk (the best one is always retained).
    save_total_limit=2,
)
trainer = hf.Trainer(
    model=model,
    args=trainer_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    # NOTE(review): the held-out test split doubles as the evaluation set, so
    # best-checkpoint selection is made on test data; consider carving out a
    # separate validation split for an unbiased final score.
    eval_dataset=test_ds,
)
trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall
1,0.6968,0.68963,0.560284,0.570093,0.792208
2,0.6835,0.666589,0.553191,0.55,1.0
3,0.6239,0.545039,0.780142,0.712963,1.0
4,0.4343,0.333568,0.879433,0.857143,0.935065
5,0.2303,0.262522,0.886525,0.917808,0.87013
6,0.0983,0.270791,0.886525,0.917808,0.87013
7,0.0433,0.27878,0.900709,0.92,0.896104
8,0.0236,0.281397,0.914894,0.933333,0.909091
9,0.0167,0.316301,0.907801,0.9,0.935065
10,0.0112,0.318306,0.907801,0.9,0.935065


***** Running Evaluation *****
  Num examples = 141
  Batch size = 64
Saving model checkpoint to ./reddit_sentiment_model/checkpoint-7
Configuration saved in ./reddit_sentiment_model/checkpoint-7/config.json
Model weights saved in ./reddit_sentiment_model/checkpoint-7/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 141
  Batch size = 64
Saving model checkpoint to ./reddit_sentiment_model/checkpoint-14
Configuration saved in ./reddit_sentiment_model/checkpoint-14/config.json
Model weights saved in ./reddit_sentiment_model/checkpoint-14/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 141
  Batch size = 64
Saving model checkpoint to ./reddit_sentiment_model/checkpoint-21
Configuration saved in ./reddit_sentiment_model/checkpoint-21/config.json
Model weights saved in ./reddit_sentiment_model/checkpoint-21/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 141
  Batch size = 64
Saving model checkpoint to ./reddit_sentiment_model/checkpoint-

TrainOutput(global_step=70, training_loss=0.28619674859302385, metrics={'train_runtime': 37.5731, 'train_samples_per_second': 112.048, 'train_steps_per_second': 1.863, 'total_flos': 557687748341760.0, 'train_loss': 0.28619674859302385, 'epoch': 10.0})