In [55]:
!pip install kaggle datasets transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

In [56]:
from pathlib import Path
import kaggle, zipfile

competition = 'nlp-getting-started'
path = Path(f'../data/{competition}')
# Fetch the competition archive through the Kaggle API client.
kaggle.api.competition_download_cli(competition)
# Open the archive in a context manager so its file handle is closed as soon
# as extraction into `path` has finished.
with zipfile.ZipFile(f'{competition}.zip') as archive:
    archive.extractall(path)

nlp-getting-started.zip: Skipping, found more recently modified local copy (use --force to force download)


In [57]:
import pandas as pd

# Read the train and test CSVs from the extracted competition directory.
train_df = pd.read_csv(path / "train.csv")
eval_df = pd.read_csv(path / "test.csv")

In [58]:
# Number of occurrences of each distinct `location` value in the training frame.
location_counts = train_df['location'].value_counts()

In [59]:
from sklearn.metrics import f1_score
import numpy as np

def f1(eval_pred):
    """Compute the F1 metric for one evaluation pass.

    `eval_pred` unpacks into a pair (scores, labels). The predicted class of
    each row is the index of its largest score along axis 1.

    Returns a dict with the single key 'f1'.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return {'f1': f1_score(labels, predicted_classes)}

In [60]:
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint name shared by the tokenizer here and the classification model
# loaded later in the notebook.
model_nm = 'cardiffnlp/twitter-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_nm)

def tok_func(x):
    """Run the module-level `tokenizer` over the "input" field of `x`."""
    return tokenizer(x["input"])

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/cardiffnlp/twitter-roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3a144066ef86f4534c95e09dea42174cee2812491c3559fe86f179a18c514f6a.85077ed383e73e121e0fd562c01aab7bddf297cbe6009eee09ce660ab317ca08
Model config RobertaConfig {
  "_name_or_path": "cardiffnlp/twitter-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute"

In [61]:
def build_dataset(df, format_func):
    """Turn a tweet DataFrame into a tokenized `Dataset`.

    Args:
        df: frame containing at least the columns id, keyword, location and
            text. It is left unmodified.
        format_func: callable applied to every row (``axis=1``) that returns
            the string to tokenize.

    Returns:
        The dataset obtained by mapping `tok_func` over the rows in batches,
        with the id/location/input/keyword/text columns removed.
    """
    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain an 'input' column as a side effect of building the dataset.
    with_input = df.assign(input=df.apply(format_func, axis=1))
    ds = Dataset.from_pandas(with_input)
    return ds.map(tok_func, batched=True, remove_columns=["id", "location", "input", "keyword", "text"])

def build_train_dataset(df, format_func, test_size=0.25, seed=42):
    """Build the tokenized training data and split it into train/test parts.

    Args:
        df: labelled frame; its 'target' column is renamed to 'label'.
        format_func: row formatter forwarded to `build_dataset`.
        test_size: value forwarded to `train_test_split` as `test_size`
            (defaults to the previously hard-coded 0.25).
        seed: seed forwarded to `train_test_split` (defaults to the
            previously hard-coded 42).

    Returns:
        Whatever `train_test_split` returns for the renamed dataset.
    """
    ds = build_dataset(df, format_func).rename_column('target', 'label')
    return ds.train_test_split(test_size=test_size, seed=seed)

In [62]:
def preprocess(text):
    """Mask user mentions and links in `text`.

    The text is split on single spaces. Any token longer than one character
    that starts with '@' becomes '@user', and any token starting with 'http'
    becomes 'http'. The tokens are re-joined with single spaces.
    """
    def _mask(token):
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

def clean(text):
    """Normalise a raw field value before it is placed in the model input.

    Missing values are replaced by the tokenizer's unknown token. Otherwise
    every URL-encoded space ('%20') is turned into a literal space.
    """
    # pd.isna recognises any missing marker (float NaN, None, pd.NA). An
    # identity check against np.nan only matches that one singleton object,
    # so other NaN floats would reach .replace() and raise.
    if pd.isna(text):
        return tokenizer.unk_token
    return text.replace('%20', ' ')

def format_input(r):
    """Render one row as the single string that is fed to the tokenizer.

    The text field is cleaned and then preprocessed; the keyword is cleaned.
    """
    tweet = preprocess(clean(r['text']))
    keyword = clean(r['keyword'])
    # location is interpolated as-is; unlike keyword it does not go through clean()
    return f"KW: {keyword}; LOC: {r['location']}; TEXT1: {tweet}"

In [63]:
# Class index -> readable name, plus the inverse mapping derived from it.
id2label = {0: "NO DISASTER", 1: "DISASTER"}
label2id = {name: idx for idx, name in id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    model_nm,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

loading configuration file https://huggingface.co/cardiffnlp/twitter-roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3a144066ef86f4534c95e09dea42174cee2812491c3559fe86f179a18c514f6a.85077ed383e73e121e0fd562c01aab7bddf297cbe6009eee09ce660ab317ca08
Model config RobertaConfig {
  "_name_or_path": "cardiffnlp/twitter-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "NO DISASTER",
    "1": "DISASTER"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "DISASTER": 1,
    "NO DISASTER": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "pos

In [64]:
from transformers import TrainingArguments, Trainer

# Training hyperparameters consumed by the TrainingArguments cell.
bs = 128  # per-device training batch size
epochs = 3  # number of training epochs
lr = 1e-5  # learning rate

In [65]:
# Evaluation and checkpointing both happen once per epoch so that the best
# checkpoint can be reloaded when training finishes.
args = TrainingArguments('output',
                         learning_rate=lr,
                         warmup_ratio=0.1,
                         lr_scheduler_type='cosine',
                         optim="adamw_torch",
                         fp16=True,
                         evaluation_strategy='epoch',
                         save_strategy='epoch',
                         per_device_train_batch_size=bs,
                         per_device_eval_batch_size=bs*2,
                         num_train_epochs=epochs,
                         weight_decay=0.01,
                         report_to='none',
                         load_best_model_at_end=True,
                         # Pick the "best" checkpoint by the F1 value returned
                         # from compute_metrics instead of the default
                         # (evaluation loss); higher F1 is better.
                         metric_for_best_model='f1',
                         greater_is_better=True)

PyTorch: setting up devices


In [66]:
# Tokenize the training frame and split it into the 'train' and 'test' parts used by the Trainer.
dds = build_train_dataset(train_df, format_input)

  0%|          | 0/8 [00:00<?, ?ba/s]

In [67]:
# Wire the model, arguments, data splits, tokenizer and F1 metric together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokenizer,
    compute_metrics=f1,
)

Using cuda_amp half precision backend


In [68]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

***** Running training *****
  Num examples = 5709
  Num Epochs = 3
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 135


Epoch,Training Loss,Validation Loss,F1
1,No log,0.463037,0.70767
2,No log,0.397057,0.797272
3,No log,0.389073,0.795322


***** Running Evaluation *****
  Num examples = 1904
  Batch size = 256
Saving model checkpoint to output/checkpoint-45
Configuration saved in output/checkpoint-45/config.json
Model weights saved in output/checkpoint-45/pytorch_model.bin
tokenizer config file saved in output/checkpoint-45/tokenizer_config.json
Special tokens file saved in output/checkpoint-45/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1904
  Batch size = 256
Saving model checkpoint to output/checkpoint-90
Configuration saved in output/checkpoint-90/config.json
Model weights saved in output/checkpoint-90/pytorch_model.bin
tokenizer config file saved in output/checkpoint-90/tokenizer_config.json
Special tokens file saved in output/checkpoint-90/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1904
  Batch size = 256
Saving model checkpoint to output/checkpoint-135
Configuration saved in output/checkpoint-135/config.json
Model weights saved in output/checkpoint-135/pytor

TrainOutput(global_step=135, training_loss=0.4696450410065828, metrics={'train_runtime': 183.5497, 'train_samples_per_second': 93.31, 'train_steps_per_second': 0.735, 'total_flos': 607649843466120.0, 'train_loss': 0.4696450410065828, 'epoch': 3.0})

In [69]:
# Tokenize the test frame with the same formatter used for training, then run inference on it.
eval_ds = build_dataset(eval_df, format_input)
predictions = trainer.predict(eval_ds)

  0%|          | 0/4 [00:00<?, ?ba/s]

***** Running Prediction *****
  Num examples = 3263
  Batch size = 256


In [70]:
# Take the highest-scoring class index per row and pair it with the test ids.
output = np.argmax(predictions.predictions, axis=1)
submission_df = pd.DataFrame({'id': eval_df['id'], 'target': output})

In [71]:
def submit(df, message):
    """Write `df` to submission.csv (index omitted) and submit that file.

    The file is sent through the Kaggle API client to the module-level
    `competition`, with `message` as the submission description.
    """
    out_file = 'submission.csv'
    df.to_csv(out_file, index=False)
    kaggle.api.competition_submit(out_file, message, competition)

In [73]:
# Upload the predictions. Kaggle answers with HTTP 400 once the daily
# submission allowance is used up, as in the recorded output.
submit(submission_df, "/shrug")

100%|██████████| 22.2k/22.2k [00:00<00:00, 43.4kB/s]


ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'Date': 'Sat, 18 Nov 2023 17:14:20 GMT', 'Access-Control-Allow-Credentials': 'true', 'Set-Cookie': 'ka_sessionid=f2eda0232d48474e1ebf1a838d25752d; max-age=2626560; path=/, GCLB=CKHotNXJh4rn0QE; path=/; HttpOnly', 'Transfer-Encoding': 'chunked', 'Vary': 'Accept-Encoding', 'Turbolinks-Location': 'https://www.kaggle.com/api/v1/competitions/submissions/submit/nlp-getting-started', 'X-Kaggle-MillisecondsElapsed': '57', 'X-Kaggle-RequestId': '3204ea8ad1d8453ea74231d19e2b8ff5', 'X-Kaggle-ApiVersion': '1.5.15', 'X-Frame-Options': 'SAMEORIGIN', 'Strict-Transport-Security': 'max-age=63072000; includeSubDomains; preload', 'Content-Security-Policy': "object-src 'none'; script-src 'nonce-zFKfuKjG/dYvsQfGYPkkpg==' 'report-sample' 'unsafe-inline' 'unsafe-eval' 'strict-dynamic' https: http:; base-uri 'none'; report-uri https://csp.withgoogle.com/csp/kaggle/20201130; frame-src 'self' https://www.kaggleusercontent.com https://www.youtube.com/embed/ https://polygraph-cool.github.io https://www.google.com/recaptcha/ https://www.docdroid.com https://www.docdroid.net https://kaggle-static.storage.googleapis.com https://kkb-production.jupyter-proxy.kaggle.net https://kkb-production.firebaseapp.com https://kaggle-metastore.firebaseapp.com https://apis.google.com https://content-sheets.googleapis.com/ https://accounts.google.com/ https://storage.googleapis.com https://docs.google.com https://drive.google.com https://calendar.google.com/;", 'X-Content-Type-Options': 'nosniff', 'Referrer-Policy': 'strict-origin-when-cross-origin', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"code":400,"message":"Submission not allowed:  Your team has used its daily Submission allowance (5) today, please try again tomorrow UTC (6.8 hours from now)."}
