In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
import os
from pathlib import Path

# Resolve the directory holding the Titanic competition files.
# When the KAGGLE_KERNEL_RUN_TYPE environment variable is set (non-empty) the
# data is read from ../input/titanic; otherwise it is downloaded once through
# the Kaggle API into /root/titanic and unpacked there.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle: path = Path('../input/titanic')
else:
    competition = 'titanic'
    path = Path(f'/root/{competition}')
    if not path.exists():
        # Imported lazily: only needed for the one-off download.
        import zipfile,kaggle
        kaggle.api.competition_download_cli(competition, path=path)
        # The context manager closes the archive handle as soon as extraction
        # finishes instead of leaving it open until garbage collection.
        with zipfile.ZipFile(f'{path}/titanic.zip') as archive:
            archive.extractall(path)

In [30]:
!ls {path}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
gender_submission.csv  test.csv  titanic.zip  train.csv


In [3]:
train_csv = pd.read_csv(path/'train.csv')
train_csv.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [4]:
train_csv.shape

(891, 12)

In [6]:
train_csv.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
def create_input(row):
    """Build the text input for one passenger row.

    Returns the row's 'Name' and 'Sex' values joined by the literal
    ' [SEP] ' marker.
    """
    fields = (row['Name'], row['Sex'])
    return ' [SEP] '.join(fields)

train_csv['input'] = train_csv.apply(create_input, axis=1)

In [15]:
# Dataset is a huggingface wrapper around our data
from datasets import Dataset

# Keep only the text input and the target; the target column is renamed to
# 'labels' (needed for transformers to work).
t = train_csv.loc[:, ['input', 'Survived']]
t = t.rename(columns={'Survived': 'labels'})

# Wrap the two-column frame in a Hugging Face Dataset.
ds = Dataset.from_pandas(t)

In [16]:
ds

Dataset({
    features: ['input', 'labels'],
    num_rows: 891
})

### Tokenizer: Tokenization and Numericalisation

In [17]:
from transformers import AutoTokenizer

model_nm = 'microsoft/deberta-v3-small'

# Tokenizers come from pre-trained models, as the vocab of course depends on the data the model was trained on
tokz = AutoTokenizer.from_pretrained(model_nm)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
tokz.vocab_size, tokz.special_tokens_map

(128000,
 {'bos_token': '[CLS]',
  'eos_token': '[SEP]',
  'unk_token': '[UNK]',
  'sep_token': '[SEP]',
  'pad_token': '[PAD]',
  'cls_token': '[CLS]',
  'mask_token': '[MASK]'})

In [19]:
# You can tokenize some input string
tokz.tokenize("Hello, my name is Lucas")

['▁Hello', ',', '▁my', '▁name', '▁is', '▁Lucas']

In [20]:
# Or you can tokenize + numericalise some input string
tokz.encode("Hello, my name is Lucas")

[1, 5365, 261, 312, 601, 269, 10876, 2]

In [21]:
# Or we can do it in two steps
tokz.convert_tokens_to_ids(tokz.tokenize("Hello, my name is Lucas"))

[5365, 261, 312, 601, 269, 10876]

In [22]:
# Or just call the tokenizer, which gives encodings, as well as some other information
tokz("Hello, my name is Lucas")

{'input_ids': [1, 5365, 261, 312, 601, 269, 10876, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [152]:
tokz.tokenize(train_csv['input'].iloc[0])

['▁Braun', 'd', ',', '▁Mr', '.', '▁Owen', '▁Harris', '[SEP]', '▁male']

In [24]:
def tokenize(ds_row):
    """Run the module-level tokenizer `tokz` over the 'input' field of `ds_row`."""
    text = ds_row['input']
    return tokz(text)

ds = ds.map(tokenize, batched=True)

# Now we have added some fields to the ds
ds



  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['input', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 891
})

In [26]:
# Make splits
dss = ds.train_test_split(test_size=0.2, seed=42)
dss

DatasetDict({
    train: Dataset({
        features: ['input', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 712
    })
    test: Dataset({
        features: ['input', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 179
    })
})

### Same for the test set

In [35]:
def create_dataset(full_path,
                   tokz,
                   train_set=True,
                   test_size=0.25,
                   seed=42):
    """Read a Titanic CSV and turn it into a tokenized Hugging Face dataset.

    Every passenger's text input is "<Name> [SEP] <Sex>", which is then run
    through `tokz`.

    Args:
        full_path: Path of the CSV file. It must provide the 'PassengerId',
            'Name' and 'Sex' columns, plus 'Survived' when `train_set` is true.
        tokz: Tokenizer callable applied to the 'input' column.
        train_set: When true, the 'Survived' column is kept (renamed to
            'labels') and the dataset is split into train and test parts.
        test_size: Forwarded to `train_test_split`; ignored when `train_set`
            is false.
        seed: Seed forwarded to `train_test_split`; ignored when `train_set`
            is false.

    Returns:
        The result of `train_test_split` when `train_set` is true, otherwise
        the single tokenized Dataset.
    """
    df = pd.read_csv(full_path)

    # Vectorised string concatenation instead of a row-wise apply.
    df['input'] = df['Name'] + ' [SEP] ' + df['Sex']

    def tokenize(batch): return tokz(batch['input'])

    columns = ['PassengerId', 'input']
    if train_set:
        df = df.rename(columns={'Survived': 'labels'})
        columns.append('labels')

    # batched=True hands the tokenizer many rows per call rather than one row
    # at a time.
    ds = Dataset.from_pandas(df[columns]).map(tokenize, batched=True)

    if train_set:
        ds = ds.train_test_split(test_size=test_size, seed=seed)
    return ds

In [36]:
ds = create_dataset(path/'train.csv', tokz)
dst = create_dataset(path/'test.csv', tokz, train_set=False)

  0%|          | 0/891 [00:00<?, ?ex/s]

  0%|          | 0/418 [00:00<?, ?ex/s]

### Create the model

In [126]:
from transformers import TrainingArguments

# Hyperparameters referenced by the TrainingArguments below.
bs = 128
epochs = 5
lr = 5e-4

# Training configuration. The warmup ratio and the cosine scheduler are set
# together to get something similar to a one_cycle policy.
args = TrainingArguments(
    output_dir='outputs', # where to store outputs
    learning_rate=lr, # learning rate
    warmup_ratio=0.1, # warmup part of the one_cycle-like schedule
    lr_scheduler_type='cosine', # cosine part of the one_cycle-like schedule
    fp16=True, # use mixed precision
    evaluation_strategy="epoch", # evaluate at the end of each epoch
    per_device_train_batch_size=bs, # train batch size
    per_device_eval_batch_size=bs*2, # eval batch size is twice the train batch size
    num_train_epochs=epochs, # train for number of epochs
    weight_decay=0.01,  # weight decay strength
    report_to='none') # presumably turns off external logging integrations; confirm against the transformers docs

PyTorch: setting up devices


In [127]:
# There are many AutoModelFor... imports, we need the SequenceClassification
from transformers import AutoModelForSequenceClassification


model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=2)

loading configuration file https://huggingface.co/microsoft/deberta-v3-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8e0c12a7672d1d36f647c86e5fc3a911f189d8704e2bc94dde4a1ffe38f648fa.9df96bac06c2c492bc77ad040068f903c93beec14607428f25bf9081644ad0da
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att

In [128]:
from datasets import load_metric

metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `metric`.

    `eval_pred` unpacks into (logits, labels). The predicted class of each
    row is the index of its largest logit along the last axis; predictions
    and labels are then handed to `metric.compute`.
    """
    scores, references = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

In [129]:
from transformers import Trainer

# Bundle the model, the training configuration, both data splits and the
# metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds['train'],  # split the model is fitted on
    eval_dataset=ds['test'],  # split scored during evaluation
    tokenizer=tokz,  # presumably lets the Trainer pad batches; the tokenized rows are not padded beforehand
    compute_metrics=compute_metrics
)

Using cuda_amp half precision backend


In [130]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: PassengerId, input. If PassengerId, input are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 668
  Num Epochs = 5
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 30


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.709044,0.439462
2,No log,0.69412,0.560538
3,No log,0.526671,0.784753
4,No log,0.544553,0.789238
5,No log,0.530099,0.784753


The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: PassengerId, input. If PassengerId, input are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 223
  Batch size = 256
The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: PassengerId, input. If PassengerId, input are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 223
  Batch size = 256
The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: PassengerId, input. If PassengerId, input are not expected by `DebertaV2ForSequenceClassification.forward`,  you ca

TrainOutput(global_step=30, training_loss=0.5927075703938802, metrics={'train_runtime': 3.5523, 'train_samples_per_second': 940.227, 'train_steps_per_second': 8.445, 'total_flos': 17233794503232.0, 'train_loss': 0.5927075703938802, 'epoch': 5.0})

In [131]:
preds = trainer.predict(dst)

The following columns in the test set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: PassengerId, input. If PassengerId, input are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 418
  Batch size = 256


In [138]:
# Predicted class per test row: the index of the largest logit.
# No squeeze() here: it would also drop the row axis when there is exactly
# one prediction, and the argmax over the class axis would then fail.
np.argmax(preds.predictions, axis=-1)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [139]:
# Predicted class per test row: the index of the largest logit.
# No squeeze() here: it would also drop the row axis when there is exactly
# one prediction, and the argmax over the class axis would then fail.
predictions = np.argmax(preds.predictions, axis=-1)

In [143]:
# Two-column frame for the submission file: the test-set passenger ids paired
# with the predicted 'Survived' class.
submission = pd.DataFrame({
    'PassengerId': dst['PassengerId'],
    'Survived': predictions,
})

In [144]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [145]:
submission.to_csv('submission_nlp.csv',index=False)