# Compound challenge
The idea is to generate a trainingset from lowercase version of all all permutation of sub-words from the given compound nouns. Train a model for classification. <br>
In the API we eliminate stopwords from the input, lowercase and remove whitespaces. The encoded string is then processed by the classification model.

## Load pandas library for input of icd data

In [1]:
# !pip install -r requirements.txt
# !pip install -U tensorflow-gpu
import pandas as pd
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


### Load icd data from csv file

In [2]:
df = pd.read_csv('icd.csv', delimiter=';',header=None)

In [3]:
df

Unnamed: 0,0,1
0,Arterienriss,I77.2
1,Harnblaseninfektion,N30.9
2,Klaviculafraktur,S42.00
3,Ovarialzyste,N83.2
4,Schädelprellung,S00.95
5,Schenkelhalsfraktur,S72.00
6,Zungengrundkarzinom,C01


In [4]:
label_to_no = {x[1]:i for i,x in enumerate(df.iloc())} 
print(label_to_no)
no_to_label = {i:x[1] for i,x in enumerate(df.iloc())}

{'I77.2 ': 0, 'N30.9 ': 1, 'S42.00 ': 2, 'N83.2 ': 3, 'S00.95 ': 4, 'S72.00 ': 5, 'C01': 6}


## Load libraries to split German compounds and for creation of permutations 

In [5]:
from itertools import permutations
from compound_split import char_split
import spacy

### 1) Load German language model for spacy
### 2) Split each Compound into the 5 most probable sub-words
### 3) Generate a dataset containing all permutations of the splitted sub-words with the icd label

In [6]:
from datasets import load_dataset, Dataset

nlp = spacy.load("de_core_news_sm")

X = {}

for i in df.iloc():
    ## Generate the first 5 compund splits
    text = [x for x in [' '.join(w[1:]) for w in char_split.split_compound(i[0])][:5]]
    label = i[1]
    for t in text:
        setofwords=[x.lemma_.lower() for x in nlp(t) if not x.is_stop]
        perms = list(permutations(setofwords))
        sperms = list(''.join(p) for p in perms)
        if "text" in X:
            X["text"] += [str(sp) for sp in sperms]
            X["label"] += [int(label_to_no[label]) for sp in sperms]
        else:
            X["text"] = [str(sp) for sp in sperms]
            X["label"] = [int(label_to_no[label]) for sp in sperms]

X_data = Dataset.from_dict(X) # Dataset.from_dict(X)
print(X_data)

Dataset({
    features: ['text', 'label'],
    num_rows: 70
})


### Load transformer libary from huggingface to use distilbert (German) word model as encoder and decoder (with logits on the last layer)

In [7]:
from transformers import AutoTokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-german-cased')

In [9]:
def preprocess_function(x):
    return tokenizer(x["text"], truncation=True, max_length=10, padding='max_length')

X_train = X_data.map(preprocess_function,batched=True)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 71.92ba/s]


In [10]:
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-german-cased", num_labels=7)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of the model checkpoint at distilbert-base-german-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_

In [11]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=X_train,
    eval_dataset=X_train,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [13]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 70
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 100


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=100, training_loss=1.051928253173828, metrics={'train_runtime': 2.9922, 'train_samples_per_second': 467.879, 'train_steps_per_second': 33.42, 'total_flos': 3622478412000.0, 'train_loss': 1.051928253173828, 'epoch': 20.0})

In [14]:
import torch
token = tokenizer("riss der arterie im rechten arm", max_length=10, padding='max_length',truncation=True,return_tensors='pt')
labels = torch.tensor([1]).unsqueeze(0)
labels = labels.to(device='cuda')
token = token.to(device='cuda')
token

{'input_ids': tensor([[  102,  9710,   573,   125, 26266, 27533,   223,  7504, 27316,   103]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [15]:
model.eval()
output = model(**token,labels=labels)
print(output)
print(torch.softmax(output.logits,1))
no_to_label[(np.argmax(output.logits.detach().cpu().numpy()))]

SequenceClassifierOutput(loss=tensor(1.8028, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7663,  0.2142, -0.4264, -0.7119,  0.3918, -0.4243, -0.1656]],
       device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[0.2863, 0.1648, 0.0869, 0.0653, 0.1969, 0.0871, 0.1128]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)


'I77.2 '

In [16]:
torch.save(model,'./icd.pt')