In [None]:
!pip install -q transformers datasets
!pip install accelerate -U -q
!pip install wandb -q

In [None]:
# Delete the ./outs directory (used as the Trainer's output_dir further down) and the
# local ./wandb directory before a fresh run.
!rm -rf ./outs ./wandb

In [None]:
from datasets import Dataset
import pandas as pd
from transformers import Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.optim import AdamW
import torch
import numpy as np
import wandb
import os
from multiprocessing import Pool

# Authenticate with Weights & Biases without embedding the secret in the notebook.
# With no explicit key, `wandb.login()` uses the WANDB_API_KEY environment variable
# or previously stored credentials, and otherwise prompts for the key.
# NOTE: an API key used to be hardcoded in this cell; it should be revoked and rotated.
wandb.login()
os.environ["WANDB_PROJECT"] = "HLT-project"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints

### Dataset

In [None]:
# Read the cleaned training data into a DataFrame.
df = pd.read_csv("cleaned_train.csv")

# Tokenizer for the same `bert-base-uncased` checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize text and keyword, and concatenate them
def tokenize_data(example):
    """Encode one training row as a (text, keyword) pair and attach its label.

    Args:
        example: Mapping-like row providing ``cleaned_text``, ``cleaned_keyword``
            and ``target``.

    Returns:
        The tokenizer output for the pair, truncated/padded to 32 tokens, with an
        extra ``labels`` entry holding ``example["target"]``.
    """
    encoding = tokenizer(
        example["cleaned_text"],
        example["cleaned_keyword"],
        truncation=True,
        padding="max_length",
        max_length=32,
    )
    encoding["labels"] = example["target"]
    return encoding

# Tokenize every training row; each element is one encoded (text, keyword) pair
# together with its label.
tokenized_data = [tokenize_data(row) for _, row in df.iterrows()]

# Wrap the encoded rows in a Hugging Face Dataset and hold out 20% of them.
# The fixed seed makes the split reproducible across kernel restarts, so metrics
# from different runs are computed on the same held-out rows.
tokenized_dataset = Dataset.from_list(tokenized_data)
dataset = tokenized_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)

### Model

In [None]:
# Define evaluation function
def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the argmax
            of ``logits`` over the last axis.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``, each computed
        by the corresponding scikit-learn function with its default settings.
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)

    # Insertion order matches the order in which the metrics are reported.
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(labels, predicted_classes) for name, scorer in scorers.items()}

def objective(config):
    """Fine-tune ``bert-base-uncased`` once with the hyperparameters in ``config``.

    The model is trained on ``dataset["train"]`` and evaluated on the held-out
    ``dataset["test"]`` split, so the reported metrics, early stopping and the
    "best model" selection are not computed on rows the model was trained on.

    Args:
        config: Object exposing ``epochs``, ``batch_size`` and ``lr`` as
            attributes (``wandb.config`` when launched by the sweep agent).

    Returns:
        The ``Trainer`` after training, so callers can reuse it for prediction.
    """
    # Define training arguments
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy` -- confirm against the installed version.
    training_args = TrainingArguments(
        num_train_epochs=config.epochs,
        per_device_train_batch_size=config.batch_size,
        per_device_eval_batch_size=config.batch_size,
        load_best_model_at_end=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        output_dir="./outs",
        report_to="wandb",
        logging_steps=1,
    )

    # Load pre-trained BERT model for sequence classification
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    optimizer = AdamW(model.parameters(), lr=config.lr)

    # Initialize Trainer; the scheduler slot is left as None so the Trainer
    # builds its default one for the supplied optimizer.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        compute_metrics=compute_metrics,
        optimizers=(optimizer, None),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # Run the fine-tuning loop for this hyperparameter configuration
    trainer.train()

    return trainer


In [None]:
# Example sweep configuration
sweep_configuration = {
    # Sample hyperparameter combinations at random.
    "method": "random",
    "name": "sweep",
    # Trials are ranked by the F1 score logged under "eval/f1".
    "metric": {
        "goal": "maximize",
        "name": "eval/f1",
    },
    "parameters": {
        # Discrete choices
        "batch_size": {
            "values": [8, 16, 32, 64],
        },
        "epochs": {
            "values": [3],
        },
        # Continuous range for the learning rate
        "lr": {
            "max": 1e-4,
            "min": 5e-5,
        },
    },
}

def main():
    """Run one sweep trial.

    Starts a W&B run and trains once with the hyperparameters exposed through
    ``wandb.config``.
    """
    # Use the project the sweep is created under ("HLT-project") rather than the
    # leftover tutorial name, so runs are never filed under a different project.
    wandb.init(project="HLT-project")
    objective(wandb.config)

# Register the sweep with W&B under the project.
sweep_id = wandb.sweep(
    sweep=sweep_configuration,
    project="HLT-project",
)

# Let an agent execute `main` for ten trials of that sweep.
wandb.agent(
    sweep_id,
    function=main,
    count=10,
)

### Test

In [None]:
def tokenize_data_test(example):
    """Encode one unlabeled row as a (text, keyword) pair.

    Args:
        example: Mapping-like row providing ``cleaned_text`` and
            ``cleaned_keyword``.

    Returns:
        The tokenizer output for the pair, truncated/padded to 32 tokens. No
        ``labels`` entry is added.
    """
    return tokenizer(
        example["cleaned_text"],
        example["cleaned_keyword"],
        truncation=True,
        padding="max_length",
        max_length=32,
    )

In [None]:
# Read the cleaned test data.
df_test = pd.read_csv("cleaned_test.csv")

# The raw columns are not used below; only their cleaned counterparts are tokenized.
df_test = df_test.drop(columns=["keyword", "location", "text"])

# Encode every test row and show the first encoding.
tokenized_data_test = [tokenize_data_test(row) for _, row in df_test.iterrows()]
print(tokenized_data_test[0])

# Once tokenized, the cleaned text columns are dropped from the frame as well.
df_test = df_test.drop(columns=["cleaned_text", "cleaned_keyword"])

In [None]:
# Smoke-test the classifier on a single free-form sentence.
# `model` is only ever created as a local inside `objective`, so it is taken from
# the notebook-level `trainer` (the same object the prediction cell below uses).
# NOTE(review): `trainer` must already be bound to a fitted Trainer at this point.
model = trainer.model
model.eval()

text = "Replace me by any text you'd like."
# Move the encoded inputs to the device the model's weights live on.
encoded_input = tokenizer(text, return_tensors='pt').to(model.device)
with torch.no_grad():  # inference only, no gradients needed
    output = model(**encoded_input)

In [None]:
# Display the `id` column of the test frame.
df_test["id"]

In [None]:
# Inspect the encoding of the test row at position 10.
tokenized_data_test[10]

In [None]:
# Run inference on the encoded test rows and keep the result: the following cell
# reads it as `prediction_test`.
# NOTE(review): `trainer` must be bound to a fitted Trainer in the notebook namespace.
prediction_test = trainer.predict(tokenized_data_test)

In [None]:
# `prediction_test` is expected to hold the output of `trainer.predict(...)`;
# element 0 is the array of per-row logits.
# Take the argmax for every row at once. Looping over `len(prediction_test)` would
# iterate over the fields of the prediction output rather than over the rows, and
# so label only the first few rows.
df_test["target"] = prediction_test[0].argmax(axis=-1)
prediction_test[0][0]

In [None]:
# Print, for each predicted target, whether it equals 0.0.
for target_value in df_test["target"]:
    is_zero = target_value == 0.0
    print(is_zero)