# Compound challenge
The idea is to generate a trainingset from lowercase of all all permutation of sub-words from the given compound nouns. Train a model for classification. <br>
In the API we eliminate stopwords from the input, lowercase and remove whitespaces. The encoded string is then processed by the classification model.

## Load pandas library for input of icd data

In [14]:
# !pip install -q -r requirements.txt
import pandas as pd

### Load icd data from csv file

In [11]:
df = pd.read_csv('icd.csv', delimiter=';',header=None)

In [12]:
df

Unnamed: 0,0,1
0,Arterienriss,I77.2
1,Harnblaseninfektion,N30.9
2,Klaviculafraktur,S42.00
3,Ovarialzyste,N83.2
4,Schädelprellung,S00.95
5,Schenkelhalsfraktur,S72.00
6,Zungengrundkarzinom,C01


## Load libraries to split German compunds and for creation of permutations 

In [35]:
from itertools import permutations
from compound_split import char_split
import spacy

  from .autonotebook import tqdm as notebook_tqdm


### 1) Load German language model for spacy
### 2) Split each Compound into the 5 most probable sub-words
### 3) Generate a dataset containing all permutations of the splitted sub-words with the icd label

In [53]:
nlp = spacy.load("de_core_news_sm")

X = []

for i in df.iloc():
    # print(i[0])
    ## Generate the first 5 compund splits
    text = [x for x in [' '.join(w[1:]) for w in char_split.split_compound(i[0])][:5]]
    label = i[1]
    # print(text)
    for t in text:
        setofwords=[x.lemma_.lower() for x in nlp(t) if not x.is_stop]
        perms = list(permutations(setofwords))
        sperms = list(''.join(p) for p in perms)
        X += [[sp,label]  for sp in sperms]
print(X)

[['arterieriss', 'I77.2 '], ['rissarterie', 'I77.2 '], ['arterienriss', 'I77.2 '], ['erienrissart', 'I77.2 '], ['arterienriss', 'I77.2 '], ['ienrissarter', 'I77.2 '], ['arterienriss', 'I77.2 '], ['rienrissarte', 'I77.2 '], ['arterienriss', 'I77.2 '], ['nrissarterie', 'I77.2 '], ['harnblaseinfektion', 'N30.9 '], ['infektionharnblase', 'N30.9 '], ['harnblaseninfektion', 'N30.9 '], ['blaseninfektionharn', 'N30.9 '], ['harnblaseninfektion', 'N30.9 '], ['nblaseninfektionhar', 'N30.9 '], ['harnblaseninfektion', 'N30.9 '], ['ninfektionharnblase', 'N30.9 '], ['harnblaseninfektion', 'N30.9 '], ['ionharnblaseninfekt', 'N30.9 '], ['klaviculafraktur', 'S42.00 '], ['frakturklavicula', 'S42.00 '], ['klaviculafraktur', 'S42.00 '], ['turklaviculafrak', 'S42.00 '], ['klaviculafraktur', 'S42.00 '], ['iculafrakturklav', 'S42.00 '], ['klaviculafraktur', 'S42.00 '], ['viculafrakturkla', 'S42.00 '], ['klaviculafraktur', 'S42.00 '], ['lafrakturklavicu', 'S42.00 '], ['ovarialzyste', 'N83.2 '], ['rialzysteova'