This notebook is for beginners.

Thanks to Nicholas for [his work](https://www.kaggle.com/nbroad/qa-ner-hybrid-train-nbme).

# Import

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from ast import literal_eval
from itertools import chain
from sklearn.metrics import precision_recall_fscore_support
from tqdm.notebook import tqdm, trange
from sklearn.model_selection import StratifiedKFold

import torch
!pip install transformers
from transformers import AutoModel, AutoTokenizer

Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 11.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 58.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.6.0 p

# Create df

In [None]:

# Mount Google Drive so the files stored there are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Project folder on Drive. This is a relative path, so it only resolves while the
# working directory is /content (the `!pwd` call right after this prints it).
ROOT = "drive/My Drive/tar-project"
!pwd

Mounted at /content/drive
/content


In [None]:
!pwd

# NOTE(review): opening in "w" mode creates/truncates f"{ROOT}/write", but the handle is never
# written to or closed in the visible notebook — presumably a Drive write-access check; confirm or remove.
file_object  = open(f"{ROOT}/write", "w")

/content


In [None]:
from google.colab import drive

def create_train_df(debug = False):
    """Build the merged training frame from the three CSV files under ``ROOT``.

    Steps, in order:
      1. read ``features.csv``, ``patient_notes.csv`` and ``train.csv``;
      2. parse the stringified ``annotation`` / ``location`` lists with ``literal_eval``;
      3. left-merge the notes and the features onto the training rows;
      4. drop rows whose ``annotation`` is the literal string ``"[]"``;
      5. normalise ``feature_text`` ("-OR-" becomes "; ", remaining "-" become " ")
         and lower-case both ``feature_text`` and ``pn_history``;
      6. assign a ``fold`` in 0..4 with ``StratifiedKFold``, stratified on the
         concatenation of ``case_num`` and ``feature_num``.

    Args:
        debug: when true, keep a random 50% sample of the rows (no seed is set,
            so the sample differs between runs).

    Returns:
        The merged ``DataFrame``; its shape is printed as a side effect.
    """
    features_df = pd.read_csv(f"{ROOT}/features.csv")
    notes_df = pd.read_csv(f"{ROOT}/patient_notes.csv")
    train_df = pd.read_csv(f"{ROOT}/train.csv")

    # The CSV stores Python list literals as strings; turn them back into lists.
    for raw_col, parsed_col in (("annotation", "annotation_list"), ("location", "location_list")):
        train_df[parsed_col] = train_df[raw_col].map(literal_eval)

    merged = train_df.merge(notes_df, how = "left").merge(features_df, how = "left")

    # Keep only annotated rows (comment this filter out to train on all samples).
    has_annotation = merged["annotation"] != "[]"
    merged = merged.loc[has_annotation].copy().reset_index(drop = True)

    merged["feature_text"] = [
        text.replace("-OR-", ";-").replace("-", " ").lower()
        for text in merged["feature_text"]
    ]
    merged["pn_history"] = [note.lower() for note in merged["pn_history"]]

    if debug:
        merged = merged.sample(frac = 0.5).reset_index(drop = True)

    merged["stratify_on"] = merged["case_num"].astype(str) + merged["feature_num"].astype(str)
    merged["fold"] = -1
    splitter = StratifiedKFold(n_splits = 5)
    fold_splits = splitter.split(merged["id"], y = merged["stratify_on"])
    for fold_number, (_, valid_rows) in enumerate(fold_splits):
        # Each row appears in exactly one validation split; that split's number is its fold.
        merged.loc[valid_rows, "fold"] = fold_number

    print(merged.shape)
    return merged

df = create_train_df()

(9901, 12)




In [None]:
df.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,annotation_list,location_list,pn_history,feature_text,stratify_on,fold
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],[dad with recent heart attcak],[696 724],hpi: 17yo m presents with palpitations. patien...,family history of mi; family history of myocar...,0,0
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],"[mom with ""thyroid disease]",[668 693],hpi: 17yo m presents with palpitations. patien...,family history of thyroid disorder,1,0
2,00016_002,0,16,2,['chest pressure'],['203 217'],[chest pressure],[203 217],hpi: 17yo m presents with palpitations. patien...,chest pressure,2,0
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']","[intermittent episodes, episode]","[70 91, 176 183]",hpi: 17yo m presents with palpitations. patien...,intermittent symptoms,3,0
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],[felt as if he were going to pass out],[222 258],hpi: 17yo m presents with palpitations. patien...,lightheaded,4,0


In [None]:
# Take the first training row and keep just the fields the next cells work with.
first = df.loc[0]
example = {
    field: first[field]
    for field in ("feature_text", "pn_history", "location_list", "annotation_list")
}
for key, value in example.items():
    print(key)
    print(value)
    print("=" * 100)

feature_text
family history of mi; family history of myocardial infarction
pn_history
hpi: 17yo m presents with palpitations. patient reports 3-4 months of intermittent episodes of "heart beating/pounding out of my chest." 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). of note patient endorses abusing adderall, primarily to study (1-3 times per week). before recent soccer game, took adderrall night before and morning of game. denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. 
pmhx: none
rx: uses friends adderrall
fhx: mom with "thyroid disease," dad with recent heart attcak
all: none
immunizations: up to date
shx: freshmen in college. endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. sexually active with girlfrien

In [None]:
def loc_list_to_ints(loc_list):
    """Convert location strings into a flat list of integer ``(start, end)`` pairs.

    Each item of ``loc_list`` is a string such as ``"696 724"``. A single string may
    hold several spans separated by ``;`` (e.g. ``"70 91;176 183"``), and every span
    becomes its own tuple. Empty fragments, as produced by a trailing ``;`` or
    stray whitespace, are skipped instead of raising.

    Args:
        loc_list: iterable of location strings.

    Returns:
        List of ``(start, end)`` integer tuples in input order; ``[]`` for empty input.

    Raises:
        ValueError: if a non-empty fragment is not exactly two integers.
    """
    spans = []
    for loc_str in loc_list:
        for fragment in loc_str.split(";"):
            if not fragment.strip():
                # Nothing between two separators: there is no span to parse.
                continue
            start, end = fragment.split()
            spans.append((int(start), int(end)))
    return spans

# Sanity check: slicing the note with the parsed span should give back the annotated text.
print(example["location_list"])
example_loc_ints = loc_list_to_ints(example["location_list"])[0]
print(example_loc_ints)
span_start, span_end = example_loc_ints
print(example["pn_history"][span_start:span_end])

['696 724']
(696, 724)
dad with recent heart attcak


nice.

# Tokenizer

In [None]:
# Checkpoint identifier shared by the tokenizer here and by the model backbone later on.
# NOTE(review): the inline comment suggests this must point to an offline copy at submission time — confirm.
MODEL_NAME = "bert-base-uncased" # we can't connect to the internet for the submission
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, )

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
def tokenize_and_add_labels_test(tokenizer, example):
    """Tokenize a note paired with the fixed string 'the' and attach per-token labels.

    Same labelling procedure as the feature-text version, except that the first
    segment is always the literal 'the' rather than ``example["feature_text"]``.

    Args:
        tokenizer: callable tokenizer that supports ``return_offsets_mapping`` and
            whose result exposes ``sequence_ids()``.
        example: mapping with ``"pn_history"`` (the note text) and
            ``"location_list"`` (location strings for ``loc_list_to_ints``).

    Returns:
        The tokenizer output with three extra entries: ``"location_int"``,
        ``"sequence_ids"`` and ``"labels"``. A label is -100 where the sequence id
        is None or 0, 1.0 where the token's offsets lie entirely inside one
        annotated span, and 0.0 otherwise.
    """
    encoding = tokenizer(
        'the',
        example["pn_history"],
        truncation = "only_second",
        max_length = 416, # max length is 406
        padding = "max_length",
        return_offsets_mapping = True
    )
    encoding["location_int"] = loc_list_to_ints(example["location_list"])
    encoding["sequence_ids"] = encoding.sequence_ids()

    labels = [0.0] * len(encoding["input_ids"])
    token_info = zip(encoding["sequence_ids"], encoding["offset_mapping"])
    for position, (seq_id, (tok_start, tok_end)) in enumerate(token_info):
        if seq_id is None or seq_id == 0:
            # Not a note token: mark it so it can be excluded later.
            labels[position] = -100
            continue
        inside_span = any(
            span_start <= tok_start and tok_end <= span_end
            for span_start, span_end in encoding["location_int"]
        )
        if inside_span:
            labels[position] = 1.0
    encoding["labels"] = labels

    return encoding

In [None]:
# Print every field of the encoding produced for the running example.
tokenized_inputs = tokenize_and_add_labels_test(tokenizer, example)
for key, value in tokenized_inputs.items():
    print(key)
    print(value)
    print("=" * 100)

input_ids
[101, 1996, 102, 6522, 2072, 1024, 2459, 7677, 1049, 7534, 2007, 14412, 23270, 10708, 1012, 5776, 4311, 1017, 1011, 1018, 2706, 1997, 23852, 4178, 1997, 1000, 2540, 6012, 1013, 9836, 2041, 1997, 2026, 3108, 1012, 1000, 1016, 2420, 3283, 2076, 1037, 4715, 2208, 2018, 2019, 2792, 1010, 2021, 2023, 2051, 2018, 3108, 3778, 1998, 2371, 2004, 2065, 2002, 2020, 2183, 2000, 3413, 2041, 1006, 2106, 2025, 4558, 9530, 18436, 2791, 1007, 1012, 1997, 3602, 5776, 2203, 5668, 2229, 8273, 7741, 5587, 21673, 2140, 1010, 3952, 2000, 2817, 1006, 1015, 1011, 1017, 2335, 2566, 2733, 1007, 1012, 2077, 3522, 4715, 2208, 1010, 2165, 5587, 2121, 7941, 2140, 2305, 2077, 1998, 2851, 1997, 2208, 1012, 23439, 2460, 2791, 1997, 3052, 1010, 22939, 8458, 16610, 2483, 1010, 9016, 2015, 1010, 10720, 2015, 1010, 14978, 1010, 16342, 1010, 3431, 1999, 3637, 1010, 3431, 1999, 4432, 1013, 4994, 1010, 21419, 29025, 2078, 1010, 3431, 1999, 6812, 2884, 2030, 24471, 3981, 2854, 14243, 1012, 7610, 2232, 2595, 1024, 390

In [None]:
def tokenize_and_add_labels(tokenizer, example):
    """Tokenize a (feature text, patient note) pair and attach per-token labels.

    Args:
        tokenizer: callable tokenizer that supports ``return_offsets_mapping`` and
            whose result exposes ``sequence_ids()``.
        example: mapping with ``"feature_text"``, ``"pn_history"`` and
            ``"location_list"`` (location strings for ``loc_list_to_ints``).

    Returns:
        The tokenizer output with three extra entries: ``"location_int"``,
        ``"sequence_ids"`` and ``"labels"``. A label is -100 where the sequence id
        is None or 0 (everything that is not a note token), 1.0 where the token's
        offsets lie entirely inside one annotated span, and 0.0 otherwise.
    """
    encoding = tokenizer(
        example["feature_text"],
        example["pn_history"],
        truncation = "only_second",
        max_length = 416, # max length is 406
        padding = "max_length",
        return_offsets_mapping = True
    )
    encoding["location_int"] = loc_list_to_ints(example["location_list"])
    encoding["sequence_ids"] = encoding.sequence_ids()

    labels = [0.0] * len(encoding["input_ids"])
    token_info = zip(encoding["sequence_ids"], encoding["offset_mapping"])
    for position, (seq_id, (tok_start, tok_end)) in enumerate(token_info):
        if seq_id is None or seq_id == 0:
            # Not a note token: mark it so the loss can ignore it.
            labels[position] = -100
            continue
        inside_span = any(
            span_start <= tok_start and tok_end <= span_end
            for span_start, span_end in encoding["location_int"]
        )
        if inside_span:
            labels[position] = 1.0
    encoding["labels"] = labels

    return encoding

In [None]:
# Print every field of the labelled encoding for the running example.
tokenized_inputs = tokenize_and_add_labels(tokenizer, example)
for key, value in tokenized_inputs.items():
    print(key)
    print(value)
    print("=" * 100)

input_ids
[101, 2155, 2381, 1997, 2771, 1025, 2155, 2381, 1997, 2026, 24755, 25070, 1999, 14971, 7542, 102, 6522, 2072, 1024, 2459, 7677, 1049, 7534, 2007, 14412, 23270, 10708, 1012, 5776, 4311, 1017, 1011, 1018, 2706, 1997, 23852, 4178, 1997, 1000, 2540, 6012, 1013, 9836, 2041, 1997, 2026, 3108, 1012, 1000, 1016, 2420, 3283, 2076, 1037, 4715, 2208, 2018, 2019, 2792, 1010, 2021, 2023, 2051, 2018, 3108, 3778, 1998, 2371, 2004, 2065, 2002, 2020, 2183, 2000, 3413, 2041, 1006, 2106, 2025, 4558, 9530, 18436, 2791, 1007, 1012, 1997, 3602, 5776, 2203, 5668, 2229, 8273, 7741, 5587, 21673, 2140, 1010, 3952, 2000, 2817, 1006, 1015, 1011, 1017, 2335, 2566, 2733, 1007, 1012, 2077, 3522, 4715, 2208, 1010, 2165, 5587, 2121, 7941, 2140, 2305, 2077, 1998, 2851, 1997, 2208, 1012, 23439, 2460, 2791, 1997, 3052, 1010, 22939, 8458, 16610, 2483, 1010, 9016, 2015, 1010, 10720, 2015, 1010, 14978, 1010, 16342, 1010, 3431, 1999, 3637, 1010, 3431, 1999, 4432, 1013, 4994, 1010, 21419, 29025, 2078, 1010, 3431, 19

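BERT needs two inputs: `input_ids` and `attention_mask`.

`labels` is 1.0 for every note token that lies inside an annotation span and 0.0 for the other note tokens; tokens outside the note (special tokens, feature text, padding) are marked -100.

So we can train this as a per-token binary classification: does this token express the feature? -> 1 or 0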

# Dataset

In [None]:
class NBMEData(torch.utils.data.Dataset):
    """Training/validation dataset that tokenizes and labels one row per item.

    Args:
        data: ``DataFrame`` whose rows provide the fields read by
            ``tokenize_and_add_labels``.
        tokenizer: tokenizer passed through to ``tokenize_and_add_labels``.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, labels, offset_mapping, sequence_ids)`` as arrays."""
        # Samplers hand out positions 0..len-1, so index by position rather than by
        # label; this is identical for a freshly reset index and still correct when
        # the frame keeps a non-default index.
        example = self.data.iloc[idx]
        tokenized = tokenize_and_add_labels(self.tokenizer, example)

        input_ids = np.array(tokenized["input_ids"]) # for input BERT
        attention_mask = np.array(tokenized["attention_mask"]) # for input BERT
        labels = np.array(tokenized["labels"]) # for calculate loss and cv score

        offset_mapping = np.array(tokenized["offset_mapping"]) # for calculate cv score
        # Cast to float so the ids can be batched as a tensor; None entries become NaN.
        sequence_ids = np.array(tokenized["sequence_ids"]).astype("float16") # for calculate cv score

        return input_ids, attention_mask, labels, offset_mapping, sequence_ids

# Model

In [None]:
class NBMEModel(torch.nn.Module):
    """Pretrained backbone followed by dropout and a one-logit-per-token linear head."""

    def __init__(self):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(MODEL_NAME) # BERT model
        self.dropout = torch.nn.Dropout(p = 0.2)
        # Size the head from the backbone's own config so that changing MODEL_NAME to a
        # checkpoint with a different width keeps working; 768 (BERT-base) is the fallback.
        hidden_size = getattr(self.backbone.config, "hidden_size", 768)
        self.classifier = torch.nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one logit per token; the trailing size-1 dimension is squeezed away."""
        # Index 0 of the backbone output is last_hidden_state.
        last_hidden_state = self.backbone(input_ids = input_ids, attention_mask = attention_mask)[0]
        logits = self.classifier(self.dropout(last_hidden_state)).squeeze(-1)
        return logits

# Training

In [None]:
# Run configuration: `fold` selects which fold is held out for validation.
fold = 0
BATCH_SIZE = 16
EPOCHS = 3
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = NBMEModel().to(DEVICE)
# NOTE(review): `criterion` is not referenced by the training loop in this notebook, which
# builds its own BCEWithLogitsLoss(reduction = "none") so masked positions can be dropped.
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-5)

# Split on the precomputed "fold" column; the index is reset so rows are numbered 0..n-1
# for the datasets' integer lookups.
train = df.loc[df["fold"] != fold].reset_index(drop = True)
valid = df.loc[df["fold"] == fold].reset_index(drop = True)
train_ds = NBMEData(train, tokenizer)
valid_ds = NBMEData(valid, tokenizer)

#reducing size, for debugging

#train_part_ds = torch.utils.data.random_split(train_ds, [1000, len(train_ds) - 1000])[0]
#valid_part_ds = torch.utils.data.random_split(valid_ds, [1000, len(valid_ds) - 1000])[0]
#train_ds = train_part_ds
#valid_ds = valid_part_ds

# Training batches are shuffled and the last partial batch is dropped; validation keeps
# every sample in order and uses a batch twice as large.
train_dl = torch.utils.data.DataLoader(train_ds, batch_size = BATCH_SIZE, pin_memory = True, shuffle = True, drop_last = True)
valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size = BATCH_SIZE * 2, pin_memory = True, shuffle = False, drop_last = False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
class AverageMeter(object):
    """Keeps the most recent value together with a weighted running sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every tracked statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n = 1):
        """Record ``val`` as the latest value, weighted as ``n`` observations."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``; element-wise for NumPy arrays."""
    return 1 / (1 + np.exp(-z))

def get_location_predictions(preds, offset_mapping, sequence_ids, test = False):
    """Turn per-token logits into predicted character spans, one list per sample.

    A token is positive when ``sigmoid(logit) > 0.5``. Consecutive positive tokens
    of the second segment (sequence id 1) are merged into a single span running from
    the first token's start offset to the last token's end offset.

    Tokens whose sequence id is not 1 are ignored. That covers ``None``, ``0`` and
    also ``NaN``, which is what ``None`` turns into once the ids have been cast to a
    float array. A span that is still open when the tokens run out is emitted too,
    so a prediction reaching the last note token is not lost.

    Args:
        preds: per-sample sequences of token logits.
        offset_mapping: per-sample sequences of ``(start, end)`` character offsets.
        sequence_ids: per-sample sequences of sequence ids aligned with the tokens.
        test: when true, each sample's spans are rendered as one string of
            ``"start end"`` items joined by ``"; "``; otherwise a list of
            ``(start, end)`` tuples is returned per sample.

    Returns:
        A list with one entry per sample (string or list of tuples, see ``test``).
    """
    all_predictions = []
    for pred, offsets, seq_ids in zip(preds, offset_mapping, sequence_ids):
        probs = sigmoid(pred)
        spans = []
        start_idx = None
        end_idx = None
        for p, o, s_id in zip(probs, offsets, seq_ids):
            # `!= 1` is true for None, 0 and NaN alike, so every non-note token is skipped.
            if s_id != 1:
                continue
            if p > 0.5:
                if start_idx is None:
                    start_idx = o[0]
                end_idx = o[1]
            elif start_idx is not None:
                spans.append((start_idx, end_idx))
                start_idx = None
        if start_idx is not None:
            # The sequence ended while a span was still open: close it.
            spans.append((start_idx, end_idx))
        if test:
            all_predictions.append("; ".join(f"{span_start} {span_end}" for span_start, span_end in spans))
        else:
            all_predictions.append(spans)
    return all_predictions

def calculate_char_CV(predictions, offset_mapping, sequence_ids, labels):
    """Character-level precision, recall and F1 of predicted spans against token labels.

    For each sample two 0/1 vectors are built, both as long as the largest offset
    value in that sample: the reference vector marks the characters of every token
    labelled 1 (tokens whose sequence id is None or 0 are skipped), and the prediction
    vector marks the characters inside each predicted ``(start, end)`` span. The
    vectors of all samples are concatenated and scored with
    ``precision_recall_fscore_support(..., average = "binary")``.

    Args:
        predictions: per-sample lists of ``(start, end)`` spans.
        offset_mapping: per-sample token offsets.
        sequence_ids: per-sample sequence ids aligned with the tokens.
        labels: per-sample token labels aligned with the tokens.

    Returns:
        Dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    reference_chars = []
    predicted_chars = []
    for sample_spans, sample_offsets, sample_seq_ids, sample_labels in zip(
        predictions, offset_mapping, sequence_ids, labels
    ):
        num_chars = max(chain.from_iterable(sample_offsets))

        char_labels = np.zeros((num_chars))
        for offset, s_id, label in zip(sample_offsets, sample_seq_ids, sample_labels):
            skipped = s_id is None or s_id == 0
            if not skipped and int(label) == 1:
                char_labels[offset[0]:offset[1]] = 1

        char_preds = np.zeros((num_chars))
        for span_start, span_end in sample_spans:
            char_preds[span_start:span_end] = 1

        reference_chars.extend(char_labels)
        predicted_chars.extend(char_preds)

    precision, recall, f1, _ = precision_recall_fscore_support(
        reference_chars, predicted_chars, average = "binary"
    )
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

def compute_metrics(p):
    """Token-level precision, recall and F1, ignoring positions labelled -100.

    Args:
        p: pair ``(predictions, y_true)``. ``y_true`` must support ``astype(int)``;
            a prediction counts as positive when its value is greater than 0.5.

    Returns:
        Dict with the keys ``"token_precision"``, ``"token_recall"`` and ``"token_f1"``.
    """
    predictions, y_true = p
    int_labels = y_true.astype(int)

    # Flatten both sides, dropping every position whose label is -100.
    kept_pred = [
        int(score > 0.5)
        for row_scores, row_labels in zip(predictions, int_labels)
        for score, lab in zip(row_scores, row_labels)
        if lab != -100
    ]
    kept_true = [lab for row_labels in int_labels for lab in row_labels if lab != -100]

    precision, recall, f1, _ = precision_recall_fscore_support(kept_true, kept_pred, average = "binary")
    return {
        "token_precision": precision,
        "token_recall": recall,
        "token_f1": f1
    }

In [None]:
# Mean loss per epoch for both splits, plus the best validation loss seen so far
# (used to decide when to overwrite the checkpoint).
history = {"train": [], "valid": []}
best_loss = np.inf

# Element-wise loss, built once: positions labelled -100 (everything outside the
# patient note) are filtered out before the mean is taken.
loss_fct = torch.nn.BCEWithLogitsLoss(reduction = "none")

for epoch in range(EPOCHS):
    #training
    model.train()
    train_loss = AverageMeter()
    pbar = tqdm(train_dl)
    for batch in pbar:
        optimizer.zero_grad()
        input_ids = batch[0].to(DEVICE)
        attention_mask = batch[1].to(DEVICE)
        labels = batch[2].to(DEVICE)
        logits = model(input_ids, attention_mask)
        loss = loss_fct(logits, labels)
        loss = torch.masked_select(loss, labels > -1).mean() # we should calculate at "pn_history"; labels at "feature_text" are -100 < -1
        loss.backward()
        optimizer.step()
        train_loss.update(val = loss.item(), n = len(input_ids))
        pbar.set_postfix(Loss = train_loss.avg)
    print(epoch, train_loss.avg)
    history["train"].append(train_loss.avg)

    #evaluation
    model.eval()
    valid_loss = AverageMeter()
    with torch.no_grad():
        # Dedicated progress bar for validation: the running loss is shown on this bar,
        # not on the already finished training bar.
        valid_pbar = tqdm(valid_dl)
        for batch in valid_pbar:
            input_ids = batch[0].to(DEVICE)
            attention_mask = batch[1].to(DEVICE)
            labels = batch[2].to(DEVICE)
            logits = model(input_ids, attention_mask)
            loss = loss_fct(logits, labels)
            loss = torch.masked_select(loss, labels > -1).mean()
            valid_loss.update(val = loss.item(), n = len(input_ids))
            valid_pbar.set_postfix(Loss = valid_loss.avg)
    print(epoch, valid_loss.avg)
    history["valid"].append(valid_loss.avg)

    # save model
    if valid_loss.avg < best_loss:
        best_loss = valid_loss.avg
        torch.save(model.state_dict(), f"{ROOT}/nbme.pth")

  0%|          | 0/62 [00:00<?, ?it/s]

0 0.17666456339941394


  0%|          | 0/32 [00:00<?, ?it/s]

0 0.1018117475794819


  0%|          | 0/62 [00:00<?, ?it/s]

1 0.10133300993473418


  0%|          | 0/32 [00:00<?, ?it/s]

1 0.09339132755234097


  0%|          | 0/62 [00:00<?, ?it/s]

2 0.08858563335391532


  0%|          | 0/32 [00:00<?, ?it/s]

2 0.0760340005751475


# Evaluation

In [None]:
# Reload the best checkpoint from the location the training loop saves it to
# (f"{ROOT}/nbme.pth"); loading a bare "nbme.pth" would look in the working directory.
model.load_state_dict(torch.load(f"{ROOT}/nbme.pth", map_location = DEVICE))

model.eval()
preds = []
offsets = []
seq_ids = []
lbls = []
with torch.no_grad():
    for batch in tqdm(valid_dl):
        input_ids = batch[0].to(DEVICE)
        attention_mask = batch[1].to(DEVICE)
        labels = batch[2].to(DEVICE)
        offset_mapping = batch[3]
        sequence_ids = batch[4]
        logits = model(input_ids, attention_mask)
        preds.append(logits.cpu().numpy())
        offsets.append(offset_mapping.numpy())
        seq_ids.append(sequence_ids.numpy())
        lbls.append(labels.cpu().numpy())

# Stack the per-batch arrays so that each has one row per validation sample.
preds = np.concatenate(preds, axis = 0)
offsets = np.concatenate(offsets, axis = 0)
seq_ids = np.concatenate(seq_ids, axis = 0)
lbls = np.concatenate(lbls, axis = 0)

# Convert logits to character spans, then score them at character level.
location_preds = get_location_predictions(preds, offsets, seq_ids, test = False)
score = calculate_char_CV(location_preds, offsets, seq_ids, lbls)
print(score)

  0%|          | 0/32 [00:00<?, ?it/s]

{'precision': 0.6522353473124964, 'recall': 0.676277543366151, 'f1': 0.664038898639123}


# Inference

In [None]:
def create_test_df():
    """Build the merged inference frame from the CSV files under ``ROOT``.

    Reads ``features.csv``, ``patient_notes.csv`` and ``test.csv``, left-merges notes
    and features onto the test rows, and applies the same text preprocessing as
    ``create_train_df``: "-OR-" becomes "; ", remaining "-" become " ", and both
    ``feature_text`` and ``pn_history`` are lower-cased. Keeping the two functions in
    step means the model sees the same kind of text at inference as in training,
    whichever tokenizer is configured.

    Returns:
        The merged ``DataFrame``; its shape is printed as a side effect.
    """
    feats = pd.read_csv(f"{ROOT}/features.csv")
    notes = pd.read_csv(f"{ROOT}/patient_notes.csv")
    test = pd.read_csv(f"{ROOT}/test.csv")

    merged = test.merge(notes, how = "left")
    merged = merged.merge(feats, how = "left")

    def process_feature_text(text):
        return text.replace("-OR-", ";-").replace("-", " ")

    merged["feature_text"] = [process_feature_text(x).lower() for x in merged["feature_text"]]
    # NOTE(review): predicted offsets refer to the lower-cased note; assumes lower-casing
    # never changes string length for these notes — confirm.
    merged["pn_history"] = [x.lower() for x in merged["pn_history"]]

    print(merged.shape)
    return merged

In [None]:
class NBMETestData(torch.utils.data.Dataset):
    """Inference dataset: tokenizes one (feature text, patient note) row per item.

    Args:
        data: ``DataFrame`` with ``"feature_text"`` and ``"pn_history"`` columns.
        tokenizer: callable tokenizer that supports ``return_offsets_mapping`` and
            whose result exposes ``sequence_ids()``.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, offset_mapping, sequence_ids)`` as arrays."""
        # Samplers hand out positions 0..len-1, so index by position rather than by
        # label; this is identical for a default index and still correct otherwise.
        example = self.data.iloc[idx]
        tokenized = self.tokenizer(
            example["feature_text"],
            example["pn_history"],
            truncation = "only_second",
            max_length = 416,
            padding = "max_length",
            return_offsets_mapping = True
        )
        tokenized["sequence_ids"] = tokenized.sequence_ids()

        input_ids = np.array(tokenized["input_ids"])
        attention_mask = np.array(tokenized["attention_mask"])
        offset_mapping = np.array(tokenized["offset_mapping"])
        # Cast to float so the ids can be batched as a tensor; None entries become NaN.
        sequence_ids = np.array(tokenized["sequence_ids"]).astype("float16")

        return input_ids, attention_mask, offset_mapping, sequence_ids

In [None]:
# Build the inference frame and a loader that keeps the rows in their original order,
# so predictions can be written back to `test` row by row.
test = create_test_df()
test_ds = NBMETestData(test, tokenizer)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size = BATCH_SIZE * 2, pin_memory = True, shuffle = False, drop_last = False)

# Uses the `model` object currently in memory (the evaluation cell loads the saved checkpoint into it).
model.eval()
preds = []
offsets = []
seq_ids = []
with torch.no_grad():
    for batch in tqdm(test_dl):
        input_ids = batch[0].to(DEVICE)
        attention_mask = batch[1].to(DEVICE)
        offset_mapping = batch[2]
        sequence_ids = batch[3]
        logits = model(input_ids, attention_mask)
        preds.append(logits.cpu().numpy())
        offsets.append(offset_mapping.numpy())
        seq_ids.append(sequence_ids.numpy())

# Stack the per-batch arrays so that each has one row per test sample.
preds = np.concatenate(preds, axis = 0)
offsets = np.concatenate(offsets, axis = 0)
seq_ids = np.concatenate(seq_ids, axis = 0)

# With test = True each sample's spans come back as one "start end; start end" string.
location_preds = get_location_predictions(preds, offsets, seq_ids, test = True)
test["location"] = location_preds
test[["id", "location"]].to_csv("submission.csv", index = False)
# Read the file back to display what was actually written.
pd.read_csv("submission.csv").head()

(5, 6)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,location
0,00016_000,668 671; 696 699; 712 720
1,00016_001,668 693; 694 699
2,00016_002,203 217
3,00016_003,70 91
4,00016_004,29 32
