In [1]:
%load_ext autoreload
%autoreload 2

In [80]:
from datasets import load_dataset
data_files = {"train": "data/training.csv", "test": "data/test.csv"}
dataset = load_dataset("csv", data_files=data_files)

In [81]:
dataset["train"] = dataset["train"].shuffle(seed=42).select([i for i in list(range(100))])

In [82]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [83]:
label_map = {"negative": 0, "neutral": 1, "positive": 2, None: None}

def preprocess_function(examples):
    # Tokenize
    tokenized = tokenizer(examples["sentence"], padding="max_length", truncation=True)
    # Map labels
    tokenized["label"] = [label_map[label] for label in examples["label"]]
    return tokenized

dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map: 100%|██████████| 100/100 [00:00<00:00, 1140.94 examples/s]


In [84]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [85]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [86]:
import evaluate
import numpy as np
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # convert the logits to their predicted class
    predictions = np.argmax(logits, axis=-1)
    print(predictions)
    return metric.compute(predictions=predictions, references=labels)

In [90]:
from transformers import TrainingArguments, Trainer

repo_name = "finetuning-sentiment-model-3000-samples"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=False,
   label_names=["label"]
)
 
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=dataset["train"],
   eval_dataset=dataset["train"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Step,Training Loss


TrainOutput(global_step=14, training_loss=1.0432630266462053, metrics={'train_runtime': 262.0548, 'train_samples_per_second': 0.763, 'train_steps_per_second': 0.053, 'total_flos': 26493952204800.0, 'train_loss': 1.0432630266462053, 'epoch': 2.0})

In [None]:
trainer.evaluate()

{'eval_runtime': 37.0887,
 'eval_samples_per_second': 2.696,
 'eval_steps_per_second': 0.189,
 'epoch': 2.0}

In [42]:
from data.SentimentAnalysisDataset import get_sentiment_dataset, create_collate_fn
train = get_sentiment_dataset("train", tokenizer)
test = get_sentiment_dataset("test", tokenizer)

collate_fn = create_collate_fn(tokenizer)

train_loader = DataLoader(train, batch_size=16, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test, batch_size=16, shuffle=False, collate_fn=collate_fn)

In [44]:
def predict_dataset(model, loader):
    predictions = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    with torch.no_grad():
        for batch in tqdm(loader):
            batch = {key: val.to(device) for key, val in batch.items()}
            outputs = model(**batch)
            probs = softmax(outputs.logits, dim=1)
            preds = torch.argmax(probs, dim=1).cpu().numpy()
            predictions.extend(preds)

    return predictions

# Get predictions
preds = predict_dataset(model, test_loader)

100%|██████████| 747/747 [07:52<00:00,  1.58it/s]


In [47]:
mapping={0:"negative", 1:"neutral", 2:"positive"}
preds_mapped = [mapping[p] for p in preds]

In [48]:
preds_mapped

['neutral',
 'neutral',
 'negative',
 'negative',
 'negative',
 'positive',
 'negative',
 'neutral',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'negative',
 'neutral',
 'negative',
 'positive',
 'neutral',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'positive',
 'positive',
 'negative',
 'negative',
 'neutral',
 'negative',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'negative',
 'neutral',
 'positive',
 'positive',
 'positive',
 'neutral',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'negative',
 'neutral',
 'negative',
 'neutral',
 'neutral',
 'negative',
 'positive',
 'neutral',
 'negative',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'positive',
 'negative',
 'negative',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'negative',
 'neu

In [55]:
from submissions.SubmissionCreation import create_submission
create_submission(preds_mapped)