In [1]:
!pip install kaggle datasets transformers

Collecting kaggle
  Downloading kaggle-1.5.16.tar.gz (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.6/83.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.16-py3-none-any.whl size=110685 sha256=278378d0ef645ed0bf79b118c09d29ed372a36de4b127175db309635e703b06c
  Stored in directory: /root/.cache/pip/wheels/32/9a/1c/87cb7688472c9240fa865b94c59f8e63c8dd2a8cca1fd4dbb6
Successfully built kaggle
Installing collected packages: kaggle
Successfully installed kaggle-1.5.16
[0m

In [3]:
from pathlib import Path
import kaggle, zipfile

competition = 'nlp-getting-started'
path = Path(f'../data/{competition}')
kaggle.api.competition_download_cli(competition)
zipfile.ZipFile(f'{competition}.zip').extractall(path)

nlp-getting-started.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
!ls {path}

sample_submission.csv  test.csv  train.csv


In [5]:
import pandas as pd

train_df = pd.read_csv(f"{path}/train.csv")
eval_df = pd.read_csv(f"{path}/test.csv")
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [6]:
train_df.describe(include='object')

Unnamed: 0,keyword,location,text
count,7552,5080,7613
unique,221,3341,7503
top,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...
freq,45,104,10


In [46]:
from sklearn.metrics import f1_score
import numpy as np

def f1(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'f1': f1_score(labels, predictions)}

In [8]:
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_nm = 'microsoft/deberta-v3-small'
tokenizer = AutoTokenizer.from_pretrained(model_nm)

def tok_func(x): return tokenizer(x["input"])

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [28]:
def build_dataset(df, format_func):
    df['input'] = df.apply(format_func, axis=1)
    ds = Dataset.from_pandas(df)
    ds = ds.map(tok_func, batched=True, remove_columns=["id", "location", "input", "keyword", "text"])
    return ds

def build_train_dataset(df, format_func):
    ds = build_dataset(df, format_func)
    ds = ds.rename_columns({'target': 'label'})
    dds = ds.train_test_split(0.25, seed=42)
    return dds

In [29]:
def format_input(r): return f"KW: {r['keyword']}; LOC: {r['location']}; INTEXT: {r['text']}"

In [47]:
id2label = {0: "NO DISASTER", 1: "DISASTER"}
label2id = {"NO DISASTER": 0, "DISASTER": 1}
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=2, id2label=id2label, label2id=label2id)

loading configuration file https://huggingface.co/microsoft/deberta-v3-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8e0c12a7672d1d36f647c86e5fc3a911f189d8704e2bc94dde4a1ffe38f648fa.9df96bac06c2c492bc77ad040068f903c93beec14607428f25bf9081644ad0da
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "NO DISASTER",
    "1": "DISASTER"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "DISASTER": 1,
    "NO DISASTER": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
   

In [48]:
from transformers import TrainingArguments, Trainer

bs = 128
epochs = 4
lr = 1e-5

In [51]:
args = TrainingArguments('output', 
                         learning_rate=lr, 
                         warmup_ratio=0.1, 
                         lr_scheduler_type='cosine',
                         optim="adamw_torch",
                         fp16=True, 
                         evaluation_strategy='epoch',
                         save_strategy='epoch',
                         per_device_train_batch_size=bs, 
                         per_device_eval_batch_size=bs*2, 
                         num_train_epochs=epochs, 
                         weight_decay=0.01, 
                         report_to='none',
                         load_best_model_at_end=True)

PyTorch: setting up devices


In [52]:
dds = build_train_dataset(train_df, format_input)

  0%|          | 0/8 [00:00<?, ?ba/s]

In [56]:
trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'], tokenizer=tokenizer, compute_metrics=f1)

Using cuda_amp half precision backend


In [57]:
trainer.train()

***** Running training *****
  Num examples = 5709
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 180


Epoch,Training Loss,Validation Loss,F1
1,No log,0.41474,0.756949
2,No log,0.407614,0.789841
3,No log,0.402578,0.787206
4,No log,0.401934,0.790728


***** Running Evaluation *****
  Num examples = 1904
  Batch size = 256
Saving model checkpoint to output/checkpoint-45
Configuration saved in output/checkpoint-45/config.json
Model weights saved in output/checkpoint-45/pytorch_model.bin
tokenizer config file saved in output/checkpoint-45/tokenizer_config.json
Special tokens file saved in output/checkpoint-45/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1904
  Batch size = 256
Saving model checkpoint to output/checkpoint-90
Configuration saved in output/checkpoint-90/config.json
Model weights saved in output/checkpoint-90/pytorch_model.bin
tokenizer config file saved in output/checkpoint-90/tokenizer_config.json
Special tokens file saved in output/checkpoint-90/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1904
  Batch size = 256
Saving model checkpoint to output/checkpoint-135
Configuration saved in output/checkpoint-135/config.json
Model weights saved in output/checkpoint-135/pytor

TrainOutput(global_step=180, training_loss=0.35693329705132376, metrics={'train_runtime': 209.796, 'train_samples_per_second': 108.849, 'train_steps_per_second': 0.858, 'total_flos': 468430203565464.0, 'train_loss': 0.35693329705132376, 'epoch': 4.0})

In [58]:
eval_ds = build_dataset(eval_df, format_input)
predictions = trainer.predict(eval_ds)
predictions


  0%|          | 0/4 [00:00<?, ?ba/s]

***** Running Prediction *****
  Num examples = 3263
  Batch size = 256


PredictionOutput(predictions=array([[-1.782 ,  1.79  ],
       [-0.2507,  0.3674],
       [-1.086 ,  1.122 ],
       ...,
       [-2.594 ,  2.45  ],
       [-1.157 ,  1.215 ],
       [-1.082 ,  1.166 ]], dtype=float16), label_ids=None, metrics={'test_runtime': 8.3847, 'test_samples_per_second': 389.163, 'test_steps_per_second': 1.55})

In [62]:
output = np.argmax(predictions.predictions, axis=1)

In [70]:
submission_df = pd.DataFrame({'id': eval_df['id'], 'target': output})
submission_df

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [72]:
submission_df.to_csv('submission.csv', index=False)
kaggle.api.competition_submit('submission.csv', 'Initial submission', competition)

100%|██████████| 22.2k/22.2k [00:00<00:00, 38.3kB/s]


Successfully submitted to Natural Language Processing with Disaster Tweets