# Compound challenge
The idea is to generate a training set from the lowercased permutations of the sub-words of the given compound nouns, and to train a classification model on it. <br>
In the API we remove stop words from the input, lowercase it and strip all whitespace. The encoded string is then processed by the classification model.

## Load pandas library for input of icd data

In [None]:
# !pip install -r requirements.txt
# !pip install -U tensorflow-gpu
import pandas as pd
import torch
import numpy as np

### Load icd data from csv file

In [None]:
# Read the semicolon-separated ICD table. The file has no header row, so the
# columns are addressed by their integer positions (0, 1, ...).
df = pd.read_csv("icd.csv", sep=";", header=None)

In [None]:
# Show the loaded table in the notebook output.
df

In [None]:
# Column 1 of the table holds the label of each row. Number the distinct
# labels densely (0, 1, 2, ...) in order of first appearance, so that a label
# occurring on several rows gets exactly one class id and the ids stay
# contiguous.
label_to_no = {label: no for no, label in enumerate(dict.fromkeys(df[1]))}
print(label_to_no)
# Inverse mapping: class id -> label.
no_to_label = {no: label for label, no in label_to_no.items()}

## Load libraries to split German compounds and for creation of permutations 

In [None]:
from itertools import permutations
from compound_split import char_split
import spacy

### 1) Load German language model for spacy
### 2) Generate the 5 most probable splits of each compound into sub-words
### 3) Generate a dataset containing all permutations of the split sub-words together with the ICD label

In [None]:
from datasets import load_dataset, Dataset

nlp = spacy.load("de_core_news_sm")

# Column-oriented training data: one text sample and one class id per entry.
X = {"text": [], "label": []}

for row in df.iloc():
    compound, label = row[0], row[1]
    label_no = int(label_to_no[label])
    # Generate the first 5 compound splits. The first element of every split
    # result is dropped and the remaining parts are joined with spaces so that
    # spaCy can tokenize them.
    candidate_splits = [
        " ".join(parts[1:]) for parts in char_split.split_compound(compound)
    ][:5]
    for split_text in candidate_splits:
        # Lowercased lemmas of the sub-words, with stop words removed.
        words = [tok.lemma_.lower() for tok in nlp(split_text) if not tok.is_stop]
        if not words:
            # permutations() of an empty sequence yields a single empty tuple,
            # which would add an empty string as a training sample.
            continue
        # Every ordering of the sub-words, concatenated without whitespace.
        samples = ["".join(perm) for perm in permutations(words)]
        X["text"].extend(samples)
        X["label"].extend([label_no] * len(samples))

X_data = Dataset.from_dict(X)
print(X_data)

### Load the transformers library from Hugging Face to use the German DistilBERT model as encoder with a classification head (logits on the last layer)

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load the pretrained tokenizer of the German DistilBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-german-cased')

In [None]:
def preprocess_function(x):
    """Tokenize the "text" entry of a batch.

    Each sample is truncated or padded to a fixed length of 10 token ids.

    Args:
        x: Batch handed in by ``Dataset.map``; only its "text" entry is read.

    Returns:
        The tokenizer output for the batch.
    """
    texts = x["text"]
    encoded = tokenizer(texts, padding='max_length', truncation=True, max_length=10)
    return encoded


# Tokenize the whole dataset batch-wise.
X_train = X_data.map(preprocess_function, batched=True)

In [None]:
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Size the classification head from the label mapping instead of hard-coding
# the number of classes, so it covers every class id present in the data.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-german-cased",
    num_labels=len(no_to_label),
)

# Pads the samples of a batch to a common length when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Training configuration for the Hugging Face Trainer; outputs go to ./results.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [None]:
# Assemble the trainer from the model, its configuration and the tokenized data.
# NOTE(review): the evaluation set is the training set itself, so evaluation
# results would not measure generalization.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=X_train,
    eval_dataset=X_train,
)

In [None]:
# Run the training loop.
trainer.train()

In [None]:
import torch

# Tokenize one example query with the same length settings that were used for
# the training data.
# NOTE(review): the training samples are lowercased lemmas joined without
# whitespace and without stop words; this raw sentence is not preprocessed
# that way -- confirm this is intended.
token = tokenizer(
    "riss der arterie im rechten arm",
    max_length=10,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)
# Dummy target of shape (1, 1) so that the forward pass also returns a loss.
labels = torch.tensor([1]).unsqueeze(0)
# Move the inputs to the device the model lives on instead of assuming that a
# GPU is present.
labels = labels.to(device=model.device)
token = token.to(device=model.device)
token

In [None]:
# Put the model into evaluation mode.
model.eval()
# A prediction needs no gradients, so skip building the autograd graph.
with torch.no_grad():
    output = model(**token, labels=labels)
print(output)
print(torch.softmax(output.logits, 1))
# Map the highest-scoring class id back to its label.
no_to_label[int(output.logits.argmax().item())]

In [None]:
# Serialize the whole model object to ./icd.pt.
# NOTE(review): this pickles the full model object rather than only its
# weights; loading it needs the same class definitions to be importable --
# confirm this is what the consumer of the file expects.
torch.save(model,'./icd.pt')