# Requirements

In [None]:
!pip install --quiet datasets evaluate scikit-learn torch transformers pandas wandb

In [None]:
import torch
import wandb
import os

import pandas as pd

from pathlib import Path
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.preprocessing import LabelEncoder
from evaluate import load

In [None]:
# --- Experiment configuration ---
dataset_type = 'TASK'  # task folder name, e.g. text_classification
dataset_name = 'DATASET_NAME'  # dataset folder name and CSV file prefix, e.g. offenseval
augment = True  # when True, the `paraphrase` column is added to the training texts
sample_ratio = 1.0  # fraction of the training rows that is kept
# TODO: add an argument to support augmentation for some classes only

# Data is read from ../<dataset_type>/<dataset_name>/
datasets_path = Path('..')
dataset_path = datasets_path.joinpath(dataset_type, dataset_name)

# One W&B project per task/dataset pair.
os.environ['WANDB_PROJECT'] = f'paraphrase_augmentation_{dataset_type}_{dataset_name}'

# The timestamp keeps run names unique across repeated executions.
current_time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
run_name = (
    f'augmented_{sample_ratio}_{current_time}'
    if augment
    else f'original_{sample_ratio}_{current_time}'
)

In [None]:
# Keyword arguments that are later unpacked into `TrainingArguments`.
training_args_dict = dict(
    output_dir='output',
    # Evaluate and save a checkpoint at the end of every epoch.
    evaluation_strategy='epoch',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    save_strategy='epoch',
    learning_rate=1e-05,
    # Metrics are reported to Weights & Biases under `run_name`.
    report_to='wandb',
    run_name=run_name,
)

# WANDB

In [None]:
# Log in to Weights & Biases; the training arguments report to it (`'report_to': 'wandb'`).
wandb.login()

# Load dataset

In [None]:
# Both splits live next to each other as <dataset_name>_train.csv / <dataset_name>_test.csv.
train_csv = dataset_path / f'{dataset_name}_train.csv'
test_csv = dataset_path / f'{dataset_name}_test.csv'

raw_train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# Keep a reproducible random subset of the training rows (fixed seed).
# The row count is truncated towards zero by `int`.
n_train_rows = int(len(raw_train_df) * sample_ratio)
raw_train_df = raw_train_df.sample(n=n_train_rows, random_state=42)

In [None]:
le = LabelEncoder()

# Fit on the labels of BOTH splits so the mapping covers every class.
# Fitting only on the (possibly down-sampled) training labels makes
# `le.transform` raise a ValueError as soon as the test set contains a class
# that is missing from the training sample. LabelEncoder orders its classes by
# sorting, so the resulting integer mapping is identical to the train-only fit
# whenever both splits already share the same label set.
le.fit(pd.concat([raw_train_df['label'], test_df['label']], ignore_index=True))

# Replace the raw labels with their integer ids in both frames.
raw_train_df['label'] = le.transform(raw_train_df['label'])
test_df['label'] = le.transform(test_df['label'])

In [None]:
# The original sentences and their labels are always part of the training set.
train_texts = raw_train_df['text'].to_list()
y_train = raw_train_df['label'].to_list()

if augment:
    # Append the paraphrases after the originals. Each paraphrase keeps the
    # label of its source row, so the label list is repeated in the same order.
    train_texts = train_texts + raw_train_df['paraphrase'].to_list()
    y_train = y_train * 2

# Fresh frame with a clean 0..n-1 index and the column names used for training.
train_df = pd.DataFrame({'text': train_texts, 'labels': y_train})

In [None]:
# Wrap both pandas frames as `datasets.Dataset` objects.
train_ds = Dataset.from_pandas(train_df)
# The training frame already calls its target column `labels`;
# rename the test column so both splits use the same name.
test_ds = Dataset.from_pandas(test_df).rename_column('label', 'labels')

# Bundle the splits so they can be processed together.
dataset = DatasetDict(train=train_ds, test=test_ds)

# Train Model

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the module-level ``tokenizer``.

    Sequences are truncated to at most 512 tokens and padded up to that same
    length (``padding='max_length'``).
    """
    encode_kwargs = {'max_length': 512, 'padding': 'max_length', 'truncation': True}
    return tokenizer(examples['text'], **encode_kwargs)

In [None]:
# Metric objects already loaded through `evaluate.load`, keyed by metric name.
# Loading is repeated work, so each metric is loaded once and reused for every
# evaluation pass instead of being re-loaded on each call.
_loaded_metrics = {}


def _get_metric(name):
    """Return the `evaluate` metric called *name*, loading it only on first use."""
    if name not in _loaded_metrics:
        _loaded_metrics[name] = load(name)
    return _loaded_metrics[name]


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. ``logits``
            must support ``.argmax(-1)``; the arg-max over the last axis is
            used as the predicted class id.

    Returns:
        A dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(-1)

    accuracy_result = _get_metric('accuracy').compute(
        predictions=predictions, references=labels
    )
    f1_result = _get_metric('f1').compute(
        predictions=predictions, references=labels, average='macro'
    )
    return {'accuracy': accuracy_result['accuracy'], 'f1': f1_result['f1']}

In [None]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
checkpoint = 'dbmdz/distilbert-base-turkish-cased'
# One output per class known to the label encoder.
num_labels = len(le.classes_)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
)

In [None]:
# Apply `tokenize_function` to every split of `dataset`, in batched mode.
tokenized_dataset = dataset.map(tokenize_function, batched = True)

In [None]:
# Build the training configuration by unpacking `training_args_dict` as keyword arguments.
training_args = TrainingArguments(**training_args_dict)

In [None]:
# Train on the tokenized `train` split and evaluate on the tokenized `test`
# split; `compute_metrics` supplies accuracy and macro F1 for each evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    compute_metrics=compute_metrics,
)

In [None]:
# Always close the W&B run, even when training fails or is interrupted.
# Otherwise the run stays open in the notebook kernel after an error.
try:
    history = trainer.train()
except BaseException:
    # Covers errors as well as a manual interrupt: mark the run as failed,
    # then let the original exception propagate unchanged.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()