In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Preprocessing Data

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset
import torch as pt
import numpy as np

# Load the tokenizer for the "bert-base-uncased" checkpoint (the same checkpoint
# name the classification model is loaded from later in the notebook).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Print the tokenizer's special tokens, one per line.
# The loop variable is deliberately not named `id`: at notebook top level that
# would rebind the builtin `id()` for every cell executed afterwards.
for special_token_id in tokenizer.all_special_ids:
    print(tokenizer.convert_ids_to_tokens(special_token_id))

In [None]:
# Show how a single word is split into tokens: encode it, then print each token
# on one line separated by spaces.
# The loop variable avoids the name `id` so the builtin `id()` is not shadowed
# in the kernel's global namespace.
for piece_id in tokenizer.encode("geopolitical"):
    print(tokenizer.convert_ids_to_tokens(piece_id), end=' ')

In [None]:
# Split name -> file name mapping handed to load_dataset; only a 'train' split is defined.
data_files = {'train':'train.csv'}

In [None]:
# Load the competition's training file from the Kaggle input directory and keep
# only the 'train' split of the result.
dataset = load_dataset('/kaggle/input/nlp-getting-started', data_files=data_files)['train']

In [None]:
# Display the loaded dataset as the cell output.
dataset

In [None]:
# Fixed sequence length (in tokens) that preprocess_data pads/truncates every text to.
max_len = 128

In [None]:
def preprocess_data(examples):
    """Tokenize a batch of examples and attach their labels.

    Args:
        examples: a batch mapping with a "text" entry (the raw texts) and a
            "target" entry (the corresponding labels).

    Returns:
        The tokenizer's output for the texts, padded/truncated to ``max_len``
        tokens, with the batch's targets stored under the "labels" key.
    """
    tokenized = tokenizer(
        examples["text"],
        max_length=max_len,
        truncation=True,
        padding="max_length",
    )
    # Store the targets under "labels" alongside the tokenizer output.
    tokenized["labels"] = examples['target']
    return tokenized

In [None]:
# Tokenize the whole dataset in batches; the original columns are dropped so only
# the fields returned by preprocess_data remain.
encoded_train_dataset = dataset.map(
    preprocess_data,
    remove_columns=dataset.column_names,
    batched=True,
)

In [None]:
# Hold out 10% of the encoded examples for evaluation. The seed is fixed so the
# split (and therefore every reported metric) is reproducible across
# Restart & Run All.
# NOTE: this rebinds the name from a single dataset to a train/test pair, so
# re-running this cell on its own fails; re-run the `.map(...)` cell first.
encoded_train_dataset = encoded_train_dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Display the result of the train/test split as the cell output.
encoded_train_dataset

In [None]:
# Sanity check: decode the input_ids of one training example (index 123) back to text.
tokenizer.decode(encoded_train_dataset['train'][123]['input_ids'])

In [None]:
# Peek at the labels of the first 20 training examples.
encoded_train_dataset['train'][:20]['labels']

## Define model

In [None]:
from transformers import BertForSequenceClassification

In [None]:
# Load the pretrained "bert-base-uncased" weights into a sequence-classification model.
# NOTE(review): no label count is passed, so the library default is used — confirm
# it matches the number of distinct values in the 'target' column.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

In [None]:
def classify(text):
    """Predict the class index for a single text with the global ``model``.

    Args:
        text: the input string to classify.

    Returns:
        A tensor holding the argmax of the model's logits along dimension 1
        (one class index per encoded input).

    Side effects:
        Switches ``model`` to evaluation mode.
    """
    # The Trainer leaves the model in training mode after `.train()`; without
    # this, dropout stays active and predictions become non-deterministic.
    model.eval()

    with pt.no_grad():
        # Truncate to the same length used when preparing the training data, so
        # inference sees inputs shaped like training and over-long text cannot
        # exceed the model's maximum sequence length.
        encoding = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_len,
        )
        # Move every input tensor to the device the model lives on.
        encoding = {k: v.to(model.device) for k, v in encoding.items()}

        outputs = model(**encoding)

    return pt.argmax(outputs.logits, dim=1)

## Finetune the model

In [None]:
# Per-device batch size, used for both training and evaluation.
batch_size = 8
# Key of the metric (as returned by compute_metrics) used to pick the best checkpoint.
metric_name = 'f1'
# Number of training epochs; also scales the eval/save interval (epoches*100 steps).
epoches = 3

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate and checkpoint every `epoches * 100` steps,
# and reload the checkpoint with the best `metric_name` when training finishes.
args = TrainingArguments(
    output_dir="bert-finetuned-disaster-english-tweets",
    num_train_epochs=epoches,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="steps",
    eval_steps=epoches * 100,
    save_strategy="steps",
    save_steps=epoches * 100,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    report_to="none",
)

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction

def compute_metrics(p: EvalPrediction):
    """Compute F1, ROC AUC and accuracy for a binary classifier.

    Args:
        p: evaluation output; ``p.predictions`` holds the model outputs (or a
            tuple whose first element does) and ``p.label_ids`` the true labels.
            Assumes two-class logits of shape (n_samples, 2) with class 1 as
            the positive class — TODO confirm if the label set ever changes.

    Returns:
        dict with keys 'f1', 'roc_auc' and 'accuracy'.
    """
    # If predictions come as a tuple, the first element is used.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    logits = np.asarray(logits)

    y_true = p.label_ids
    y_pred = np.argmax(logits, axis=1)

    # ROC AUC is a ranking metric and needs a continuous score, not hard labels.
    # The logit margin (class 1 minus class 0) is monotonic in the softmax
    # probability of class 1, so it yields the same AUC as the probability.
    y_score = logits[:, 1] - logits[:, 0]

    return {
        # Binary F1 on the positive class. Micro-averaged F1 would be identical
        # to accuracy for single-label data and so add no information.
        'f1': f1_score(y_true=y_true, y_pred=y_pred, average='binary'),
        'roc_auc': roc_auc_score(y_true, y_score),
        'accuracy': accuracy_score(y_true, y_pred),
    }

In [None]:
# Wire the model, training configuration, train/eval splits, tokenizer and
# metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    eval_dataset=encoded_train_dataset['test'],
    train_dataset=encoded_train_dataset['train'],
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

## Evaluation

In [None]:
# Evaluate on the split that was passed to the Trainer as eval_dataset.
trainer.evaluate()

In [None]:
# Smoke test: classify one hand-written sentence.
classify("There are new reports on the earthquake near the city of Kansas.")

In [None]:
import pandas as pd

In [None]:
# Load the competition's test file; its 'id' and 'text' columns are used to build the submission.
eval_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
# Placeholder frame for the submission; it is populated in the next cell.
predictions_df = pd.DataFrame()

In [None]:
# Classify every test text, then build the submission frame in one step.
# Collecting the predictions in a list avoids growing a DataFrame with
# pd.concat inside the loop (quadratic copying) and gives the frame a clean
# 0..n-1 index instead of an index where every row is 0.
predicted_targets = [int(classify(text)) for text in eval_df['text']]
predictions_df = pd.DataFrame({
    'id': eval_df['id'].to_numpy(),
    'target': predicted_targets,
})

In [None]:
# Make sure both submission columns are stored as integers.
predictions_df = predictions_df.astype({'target': int, 'id': int})

In [None]:
# Preview the first 20 rows of the submission frame.
print('Predictions')
predictions_df.head(20)

In [None]:
# Write the submission file to the working directory without the index column.
predictions_df.to_csv('submission.csv', index=False)
print('Done!')