```bash
python -m spacy download nl_core_news_lg
python -m spacy download da_core_news_lg
python -m spacy download de_core_news_lg
python -m spacy download it_core_news_lg
python -m spacy download es_core_news_lg
```

In [None]:
import pandas as pd
import langdetect
import spacy
import numpy as np

In [None]:
def preprocess(data):
    """Normalise the ``text`` column of *data* and drop unusable rows.

    Whitespace runs and literal backslash-n sequences are collapsed to a
    single space, leading/trailing spaces, hyphens, full stops and commas are
    stripped, and texts that end up empty become NaN. Every row that contains
    a NaN in any column is then removed.

    Args:
        data: DataFrame with a ``text`` column. It is modified in place.

    Returns:
        The same DataFrame object, after cleaning.
    """
    # Build the cleaned column and assign it back explicitly. Calling
    # ``replace(..., inplace=True)`` on ``data.text`` is a chained in-place
    # update that is not guaranteed to reach the frame (it is silently lost
    # under pandas copy-on-write).
    text = data["text"].replace(r'\s+|\\n', ' ', regex=True)
    text = text.str.strip(" -.,")
    # No separate newline replacement is needed: ``\s+`` above already
    # consumed every newline.
    data["text"] = text.mask(text == "", np.nan)
    data.dropna(inplace=True)
    return data

# Load each split, clean it, and move the frame's index into a regular
# column via reset_index().
train_data = preprocess(pd.read_csv("data/train_data.csv")).reset_index()
test_data = preprocess(pd.read_csv("data/test_data.csv")).reset_index()


In [None]:
# Language names as they appear in the training data -> two-letter codes.
language_mapping = {
    "Deutsch": "de",
    "Nederlands": "nl",
    "dansk": "da",
    "español": "es",
    "italiano": "it",
}
# Recode only the language column: a frame-wide replace would also rewrite
# any text or label cell that happens to equal one of the language names.
train_data["language"] = train_data["language"].replace(language_mapping)

# Write one plain training file per language code.
for lang in language_mapping.values():
    train_data.loc[train_data.language == lang, ["index", "text", "label"]].to_csv(
        f"corpus/train/{lang}/{lang}_plain.csv", index=False
    )

In [None]:
# Fixed seed so langdetect gives the same answer on every run.
langdetect.DetectorFactory.seed = 21
def language_detector(row):
    """Return a supported language code for ``row["text"]``.

    The candidates reported by ``langdetect.detect_langs`` are examined in
    the order returned. "no" and "hr" are folded into "da" and "af" into
    "nl"; the first candidate that is one of the ``language_mapping`` values
    wins.

    Args:
        row: Mapping-like record with a ``"text"`` entry.

    Returns:
        A language code from ``language_mapping``; "da" when no candidate is
        supported or when detection fails (the row is printed in both cases).
    """
    aliases = {"no": "da", "hr": "da", "af": "nl"}
    supported = set(language_mapping.values())

    try:
        candidates = langdetect.detect_langs(row["text"])
    except Exception:
        # Best-effort fallback for undetectable text. ``Exception`` rather
        # than a bare ``except`` so Ctrl-C still interrupts a long apply().
        print(row)
        return "da"

    for candidate in candidates:
        code = aliases.get(candidate.lang, candidate.lang)
        if code in supported:
            return code

    # The fallback sits after the loop so that lower-ranked candidates get a
    # chance before defaulting, and so an empty candidate list cannot fall
    # through and return None.
    print("Language?:", ", ".join(candidate.lang for candidate in candidates))
    print(row)
    return "da"

# Detect a language code for every test row (row-wise apply).
test_data["language"] = test_data.apply(language_detector, axis=1)

In [None]:
# Write one plain test file per language code.
for lang in language_mapping.values():
    is_lang = test_data.language == lang
    out_path = f"corpus/test/{lang}/{lang}_plain.csv"
    test_data.loc[is_lang, ["index", "text"]].to_csv(out_path, index=False)

In [None]:
# Also store both complete frames, not split by language.
train_data.to_csv("corpus/nolang_dif/train_data.csv", index=False)
test_data.to_csv("corpus/nolang_dif/test_data.csv", index=False)

In [None]:
# spaCy pipeline name for each language code; every entry follows the
# "<code>_core_news_lg" pattern.
spacy_packages = {
    code: f"{code}_core_news_lg" for code in ("nl", "da", "de", "it", "es")
}

def noent(nlp, row):
    """Return *row* with every entity token replaced by its entity label.

    Args:
        nlp: Callable that parses a text into a document exposing ``sents``;
            each sentence yields tokens with ``text``, ``ent_type`` and
            ``ent_type_``.
        row: The text to process.

    Returns:
        Tokens joined by single spaces and sentences joined by " SENTSEP ",
        or None when processing fails (the offending input is printed).
    """
    try:
        doc = nlp(row)
        return " SENTSEP ".join(
            # A token with a non-zero ent_type is shown as its label instead
            # of its surface text.
            " ".join(tok.ent_type_ if tok.ent_type else tok.text for tok in sent)
            for sent in doc.sents
        )
    except Exception:
        # Best-effort: report the input that could not be processed and carry
        # on. ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt/SystemExit still stop a long run.
        print(row)
        return None

def make_noent(df, nlp, path):
    """Replace ``df["text"]`` by its ``noent`` form and save the frame.

    The frame is modified in place and written, without its index, to
    ``path`` with "_noent.csv" appended.
    """
    def _mask_entities(text):
        return noent(nlp, text)

    out_file = path + "_noent.csv"
    df["text"] = df["text"].apply(_mask_entities)
    df.to_csv(out_file, index=False)


def pos(nlp, row):
    """Return the ``pos_`` tag sequence of *row*.

    Args:
        nlp: Callable that parses a text into a document exposing ``sents``;
            each sentence yields tokens with a ``pos_`` attribute.
        row: The text to process.

    Returns:
        Tags joined by single spaces and sentences joined by " SENTSEP ", or
        None when processing fails (the offending input is printed).
    """
    try:
        doc = nlp(row)
        return " SENTSEP ".join(
            " ".join(tok.pos_ for tok in sent) for sent in doc.sents
        )
    except Exception:
        # Best-effort: report the input that could not be processed and carry
        # on. ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt/SystemExit still stop a long run.
        print(row)
        return None

def make_pos(df, nlp, path):
    """Replace ``df["text"]`` by its ``pos`` form and save the frame.

    The frame is modified in place and written, without its index, to
    ``path`` with "_pos.csv" appended.
    """
    def _tag(text):
        return pos(nlp, text)

    out_file = path + "_pos.csv"
    df["text"] = df["text"].apply(_tag)
    df.to_csv(out_file, index=False)


In [None]:
# For every language: load its spaCy pipeline once, then derive the
# entity-masked and POS-tag variants of the plain train/test files.
# NOTE: this rebinds the notebook-level train_data / test_data names to the
# per-language frames.
for language, spacy_pack in spacy_packages.items():
    train_prefix = f"corpus/train/{language}/{language}"
    test_prefix = f"corpus/test/{language}/{language}"

    train_data = pd.read_csv(train_prefix + "_plain.csv")
    test_data = pd.read_csv(test_prefix + "_plain.csv")

    nlp = spacy.load(spacy_pack)

    print(f"ready to work '{language}'")
    # Each helper overwrites the "text" column, so hand it a fresh copy.
    make_noent(train_data.copy(), nlp, train_prefix)
    make_noent(test_data.copy(), nlp, test_prefix)

    print("work work")
    make_pos(train_data.copy(), nlp, train_prefix)
    make_pos(test_data.copy(), nlp, test_prefix)
