<img align="right" width="400" src="https://www.fhnw.ch/de/++theme++web16theme/assets/media/img/fachhochschule-nordwestschweiz-fhnw-logo.svg" alt="FHNW Logo">


# Data Augmentation Back Translation with Transformers

by Fabian Märki

## Summary
The aim of this notebook is to show how Hugging Face translation models can be used for back translation.

### Sources
- https://amitness.com/back-translation/
- https://github.com/huggingface/transformers/issues/9994

### Libraries/Models
- https://huggingface.co
- https://huggingface.co/models?language=de&pipeline_tag=translation&sort=downloads&search=Helsinki-NLP

<a href="https://colab.research.google.com/github/markif/2021_HS_DAS_NLP_Notebooks/blob/master/08_b_Data_Augmentation-Back-Translation-Transformers.ipynb">
  <img align="left" src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
!pip install 'fhnw-nlp-utils>=0.1.6,<0.2'

from fhnw.nlp.utils.processing import parallelize_dataframe
from fhnw.nlp.utils.processing import is_iterable
from fhnw.nlp.utils.storage import download
from fhnw.nlp.utils.storage import save_dataframe
from fhnw.nlp.utils.storage import load_dataframe

import numpy as np
import pandas as pd
from datetime import datetime

import tensorflow as tf

print("Tensorflow version:", tf.__version__)
print("GPU is", "available" if tf.config.list_physical_devices("GPU") else "NOT AVAILABLE")

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Tensorflow version: 2.5.1
GPU is available


In [2]:
%%time
download("https://drive.google.com/uc?id=19AFeVnOfX8WXU4_3rM7OFoNTWWog_sb_", "data/german_doctor_reviews_tokenized.parq")
data = load_dataframe("data/german_doctor_reviews_tokenized.parq")
data.shape

CPU times: user 7.86 s, sys: 1.21 s, total: 9.07 s
Wall time: 5.16 s


(350087, 10)

In [3]:
data.head(3)

Unnamed: 0,text_original,rating,text,label,sentiment,token_clean,text_clean,token_lemma,token_stem,token_clean_stopwords
0,Ich bin franzose und bin seit ein paar Wochen ...,2.0,Ich bin franzose und bin seit ein paar Wochen ...,positive,1,"[ich, bin, franzose, und, bin, seit, ein, paar...",ich bin franzose und bin seit ein paar wochen ...,"[franzose, seit, paar, wochen, muenchen, zahn,...","[franzos, seit, paar, woch, muench, ., zahn, s...","[franzose, seit, paar, wochen, muenchen, ., za..."
1,Dieser Arzt ist das unmöglichste was mir in me...,6.0,Dieser Arzt ist das unmöglichste was mir in me...,negative,-1,"[dieser, arzt, ist, das, unmöglichste, was, mi...",dieser arzt ist das unmöglichste was mir in me...,"[arzt, unmöglichste, leben, je, begegnen, unfr...","[arzt, unmog, leb, je, begegnet, unfreund, ,, ...","[arzt, unmöglichste, leben, je, begegnet, unfr..."
2,Hatte akute Beschwerden am Rücken. Herr Magura...,1.0,Hatte akute Beschwerden am Rücken. Herr Magura...,positive,1,"[hatte, akute, beschwerden, am, rücken, ., her...",hatte akute beschwerden am rücken . herr magur...,"[akut, beschwerden, rücken, magura, erste, arz...","[akut, beschwerd, ruck, ., magura, erst, arzt,...","[akute, beschwerden, rücken, ., magura, erste,..."


In [4]:
data = data.drop(["token_clean", "token_lemma", "token_stem", "token_clean_stopwords", "text_clean"], axis=1)

In [5]:
data.head(3)

Unnamed: 0,text_original,rating,text,label,sentiment
0,Ich bin franzose und bin seit ein paar Wochen ...,2.0,Ich bin franzose und bin seit ein paar Wochen ...,positive,1
1,Dieser Arzt ist das unmöglichste was mir in me...,6.0,Dieser Arzt ist das unmöglichste was mir in me...,negative,-1
2,Hatte akute Beschwerden am Rücken. Herr Magura...,1.0,Hatte akute Beschwerden am Rücken. Herr Magura...,positive,1


In [6]:
# only keep negative text (the class with fewer samples)
data_augm = data[data["label"] == "negative"]

data_augm.head(3)

Unnamed: 0,text_original,rating,text,label,sentiment
1,Dieser Arzt ist das unmöglichste was mir in me...,6.0,Dieser Arzt ist das unmöglichste was mir in me...,negative,-1
13,1. Termin:<br />\n1 Stunde Wartezimmer + 2 min...,6.0,. Termin Stunde Wartezimmer minütige Behandlu...,negative,-1
19,"Eine sehr unfreundliche Ärztin, so etwas habe ...",6.0,"Eine sehr unfreundliche Ärztin, so etwas habe ...",negative,-1


In [7]:
#data_augm = data_augm.reset_index(drop=True)

In [8]:
!pip install torch sentencepiece mosestokenizer

Collecting mosestokenizer
  Downloading mosestokenizer-1.1.0.tar.gz (37 kB)
Collecting docopt
  Downloading docopt-0.6.2.tar.gz (25 kB)
Collecting openfile
  Downloading openfile-0.0.7-py3-none-any.whl (2.4 kB)
Collecting uctools
  Downloading uctools-1.3.0.tar.gz (4.6 kB)
Collecting toolwrapper
  Downloading toolwrapper-2.1.0.tar.gz (3.2 kB)
Building wheels for collected packages: mosestokenizer, docopt, uctools, toolwrapper
  Building wheel for mosestokenizer (setup.py) ... [?25ldone
[?25h  Created wheel for mosestokenizer: filename=mosestokenizer-1.1.0-py3-none-any.whl size=49117 sha256=a57c65d1aa2b9dd6bf64a46e87cd44c91867d1a0ae92c2acc107a15801831b08
  Stored in directory: /root/.cache/pip/wheels/66/2f/b1/451aa60031d370c81c754ed7b793d208e69a19b29c2376e6e4
  Building wheel for docopt (setup.py) ... [?25ldone
[?25h  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13724 sha256=8a30c05ac488cde24df8e0591600ef20788c8d16e9a23f5b2b4ae21812728859
  Stored in dir

In [9]:
def gpu_empty_cache():
    """Releases the GPU memory cached by PyTorch (it seems to fill up after a while)

    Does nothing if PyTorch has no CUDA device available.
    """

    import torch

    # Ask torch itself (not TensorFlow) whether CUDA is usable: the cache that
    # gets cleared belongs to torch, and the two frameworks can disagree about
    # GPU availability (e.g. a CPU-only torch build next to a GPU TensorFlow).
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
def get_gpu_device_number():
    """Provides the number of the GPU device

    Returns
    -------
    int
        The GPU device number (0) or -1 if no GPU is usable
    """

    import torch

    # The models in this notebook are PyTorch models, so availability is
    # decided by torch rather than by TensorFlow.
    return 0 if torch.cuda.is_available() else -1

def get_compute_device():
    """Provides the device for the computation

    Returns
    -------
    str
        The GPU device with number ("cuda:0") or "cpu" if no GPU is usable
    """

    import torch

    # The returned string is passed to torch's `.to(...)`, so torch must be the
    # one confirming that CUDA is usable; TensorFlow seeing a GPU does not
    # guarantee that the installed torch build can use it.
    return "cuda:0" if torch.cuda.is_available() else "cpu"

### Back Translation

You can repeat the following steps for several target languages.

In [10]:
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
from transformers import MarianMTModel, MarianTokenizer

# Language pair for the round trip: lang_from -> lang_to -> lang_from.
lang_from = "de"
lang_to = "es"
# Device string on which both translation models are placed.
compute_device = get_compute_device()

# Alternative: Facebook FSMT (wmt19) models.
# NOTE(review): verify that a wmt19 model exists for the chosen language pair.
#orig2dest_model_name = "facebook/wmt19-"+lang_from+"-"+lang_to
#orig2dest_tokenizer = FSMTTokenizer.from_pretrained(orig2dest_model_name)
#orig2dest_model = FSMTForConditionalGeneration.from_pretrained(orig2dest_model_name).to(compute_device)

#dest2orig_model_name = "facebook/wmt19-"+lang_to+"-"+lang_from
#dest2orig_tokenizer = FSMTTokenizer.from_pretrained(dest2orig_model_name)
#dest2orig_model = FSMTForConditionalGeneration.from_pretrained(dest2orig_model_name).to(compute_device)


# Forward direction (original language -> pivot language), Helsinki-NLP Marian model.
orig2dest_model_name = "Helsinki-NLP/opus-mt-"+lang_from+"-"+lang_to
orig2dest_tokenizer = MarianTokenizer.from_pretrained(orig2dest_model_name)
orig2dest_model = MarianMTModel.from_pretrained(orig2dest_model_name).to(compute_device)

# Backward direction (pivot language -> original language).
dest2orig_model_name = "Helsinki-NLP/opus-mt-"+lang_to+"-"+lang_from
dest2orig_tokenizer = MarianTokenizer.from_pretrained(dest2orig_model_name)
dest2orig_model = MarianMTModel.from_pretrained(dest2orig_model_name).to(compute_device)

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/818k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.47M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/304M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/818k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.47M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/304M [00:00<?, ?B/s]

In [11]:
# see: https://github.com/huggingface/transformers/issues/9994

def back_translate_transformers(texts, num_return_sequences=2):
    """Creates paraphrases of texts by translating them to the pivot language and back

    Uses the module-level models/tokenizers (orig2dest_* and dest2orig_*) and
    samples `num_return_sequences` translations in each direction.

    Parameters
    ----------
    texts : list of str
        The texts in the original language
    num_return_sequences : int
        The number of sampled translations per input and direction (each text
        yields at most num_return_sequences**2 back translations)

    Returns
    -------
    list of list of str
        For every input text the distinct, lower-cased back translations
        (empty results and results equal to the lower-cased input are removed)
    """

    # Nothing to translate (the tokenizer cannot build a batch from an empty list)
    if not texts:
        return []

    generate_kwargs = {"num_beams": 1, "do_sample": True, "num_return_sequences": num_return_sequences}

    # Translate the texts from the original language to the pivot language
    #tokenized_texts = orig2dest_tokenizer.prepare_seq2seq_batch(texts, return_tensors="pt").to(compute_device)
    tokenized_texts = orig2dest_tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(compute_device)
    dest_token_ids = orig2dest_model.generate(tokenized_texts["input_ids"], attention_mask=tokenized_texts["attention_mask"], top_p=0.7, **generate_kwargs)

    # Decode and re-tokenize with the tokenizer of the reverse model: feeding
    # the generated ids directly would rely on both models sharing the same
    # vocabulary and would pass padding through without an attention mask.
    dest_texts = orig2dest_tokenizer.batch_decode(dest_token_ids, skip_special_tokens=True)
    tokenized_dest_texts = dest2orig_tokenizer(dest_texts, return_tensors="pt", padding=True, truncation=True).to(compute_device)

    # Translate from the pivot language back to the original language
    orig_token_ids = dest2orig_model.generate(tokenized_dest_texts["input_ids"], attention_mask=tokenized_dest_texts["attention_mask"], top_p=0.8, **generate_kwargs)

    # Every text produced num_return_sequences pivot translations, each of which
    # produced num_return_sequences back translations (in input order).
    candidates_per_text = num_return_sequences ** 2

    # Decode and deduplicate back-translations and assign to original text indices
    back_translations = [set() for _ in range(len(texts))]
    for i, token_ids in enumerate(orig_token_ids):
        back_translations[i // candidates_per_text].add(dest2orig_tokenizer.decode(token_ids, skip_special_tokens=True).lower())

    # Remove back translations that are empty or equal to the original text
    return [[bt for bt in s if bt and bt != t] for s, t in zip(back_translations, map(str.lower, texts))]

Give it a try...

In [12]:
back_translate_transformers(["Hallo zusammen! Wie geht es euch heute?", "NLP ist grossartig, oder?"])

[['hallo, ihr. wie geht es ihnen heute?',
  '- hi allerseits.',
  'hallo, wie geht es ihnen heute?',
  "hallo, wie geht's euch heute?"],
 ['der nip ist cool, was?',
  '- ist die nlp toll?',
  '- ist die nlp cool?',
  '- die ntp ist toll, oder?']]

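Do the actual back translation. The following code periodically saves intermediate results so the run can be resumed after a crash (set `last_stored` to the last batch number that was printed as saved).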

In [13]:
%%time

batch_size = 5
save_every_n_elements = 50
translations = []
last_stored = -1 #8409

# Number of batches between two intermediate saves; at least 1 so that a save
# interval smaller than the batch size does not end in a modulo by zero.
save_every_n_batches = max(1, save_every_n_elements // batch_size)

# set to the last stored index for recovery
if last_stored >= 0:
    data_trans = load_dataframe("data/german_doctor_reviews_augmented_tmp.parq")
    translations = data_trans.to_dict("records")
    print("Loaded", len(translations))

# g is the running batch number, df the rows belonging to that batch
for g, df in data_augm.groupby(np.arange(len(data_augm)) // batch_size):
    if g <= last_stored:
        # already contained in the intermediate file loaded above
        print("Skip", g)
        continue

    gpu_empty_cache()
    back_trans = back_translate_transformers(df["text_original"].to_list())

    # back_trans holds one list of paraphrases per row of df (same order)
    for (index, row), row_translations in zip(df.iterrows(), back_trans):
        row_dict = row.to_dict()
        for trans in row_translations:
            # copy of the original row with the text replaced by the paraphrase
            translations.append({**row_dict, "text": trans})

    if (g + 1) % save_every_n_batches == 0:
        print(datetime.now(), "save ", g, len(translations))

        save_dataframe(pd.DataFrame(translations), "data/german_doctor_reviews_augmented_tmp.parq")


save_dataframe(pd.DataFrame(translations), "data/german_doctor_reviews_augmented_tmp.parq")

2021-09-30 16:14:28.581990 save  9 199
2021-09-30 16:15:27.466250 save  19 398
2021-09-30 16:16:09.477912 save  29 598
2021-09-30 16:16:43.926394 save  39 795
2021-09-30 16:17:29.490674 save  49 994
2021-09-30 16:18:11.488057 save  59 1193
2021-09-30 16:18:56.427819 save  69 1392
2021-09-30 16:19:32.898552 save  79 1591
2021-09-30 16:20:17.333373 save  89 1791
2021-09-30 16:21:08.328024 save  99 1991
2021-09-30 16:21:52.241590 save  109 2191
2021-09-30 16:22:36.931620 save  119 2390
2021-09-30 16:23:21.854317 save  129 2590
2021-09-30 16:24:10.502879 save  139 2790
2021-09-30 16:24:53.395143 save  149 2990
2021-09-30 16:25:46.638448 save  159 3188
2021-09-30 16:26:25.059373 save  169 3388
2021-09-30 16:27:10.109560 save  179 3588
2021-09-30 16:27:55.302234 save  189 3786
2021-09-30 16:28:36.984998 save  199 3985
2021-09-30 16:29:26.156602 save  209 4185
2021-09-30 16:30:25.026910 save  219 4383
2021-09-30 16:31:11.615555 save  229 4582
2021-09-30 16:32:01.401634 save  239 4780
2021-09-

2021-09-30 18:39:06.346339 save  1909 38053
2021-09-30 18:39:57.109840 save  1919 38250
2021-09-30 18:40:41.954329 save  1929 38449
2021-09-30 18:41:28.670173 save  1939 38648
2021-09-30 18:42:14.958391 save  1949 38848
2021-09-30 18:43:09.035417 save  1959 39048
2021-09-30 18:44:01.762429 save  1969 39245
2021-09-30 18:44:54.184229 save  1979 39444
2021-09-30 18:45:36.878488 save  1989 39644
2021-09-30 18:46:15.549642 save  1999 39844
2021-09-30 18:47:14.063875 save  2009 40043
2021-09-30 18:47:54.741755 save  2019 40243
2021-09-30 18:48:43.455659 save  2029 40443
2021-09-30 18:49:34.293123 save  2039 40643
2021-09-30 18:50:16.677172 save  2049 40842
2021-09-30 18:50:58.932706 save  2059 41042
2021-09-30 18:51:45.110636 save  2069 41242
2021-09-30 18:52:31.293945 save  2079 41442
2021-09-30 18:53:21.071741 save  2089 41642
2021-09-30 18:54:02.232859 save  2099 41842
2021-09-30 18:54:51.160808 save  2109 42042
2021-09-30 18:55:25.335439 save  2119 42242
2021-09-30 18:56:06.727222 save 

2021-09-30 20:56:24.851881 save  3779 75333
2021-09-30 20:57:09.037428 save  3789 75532
2021-09-30 20:57:43.227928 save  3799 75729
2021-09-30 20:58:24.623654 save  3809 75927
2021-09-30 20:59:02.765248 save  3819 76126
2021-09-30 20:59:46.589289 save  3829 76326
2021-09-30 21:00:37.486070 save  3839 76526
2021-09-30 21:01:14.095785 save  3849 76725
2021-09-30 21:02:01.980156 save  3859 76924
2021-09-30 21:02:39.746068 save  3869 77124
2021-09-30 21:03:21.514918 save  3879 77324
2021-09-30 21:04:03.151929 save  3889 77524
2021-09-30 21:04:50.331329 save  3899 77721
2021-09-30 21:05:23.015141 save  3909 77921
2021-09-30 21:05:57.540023 save  3919 78120
2021-09-30 21:06:39.737192 save  3929 78320
2021-09-30 21:07:21.753547 save  3939 78520
2021-09-30 21:08:09.140478 save  3949 78720
2021-09-30 21:08:54.659712 save  3959 78920
2021-09-30 21:09:36.719817 save  3969 79120
2021-09-30 21:10:23.147968 save  3979 79320
2021-09-30 21:11:08.736805 save  3989 79520
2021-09-30 21:11:50.835074 save 

2021-09-30 23:11:45.126091 save  5629 112213
2021-09-30 23:12:34.985776 save  5639 112412
2021-09-30 23:13:20.037377 save  5649 112612
2021-09-30 23:14:03.288232 save  5659 112812
2021-09-30 23:14:45.681058 save  5669 113012
2021-09-30 23:15:37.469963 save  5679 113212
2021-09-30 23:16:37.050771 save  5689 113411
2021-09-30 23:17:11.315244 save  5699 113608
2021-09-30 23:17:52.832229 save  5709 113808
2021-09-30 23:18:41.298598 save  5719 114004
2021-09-30 23:19:27.865085 save  5729 114204
2021-09-30 23:20:21.590696 save  5739 114404
2021-09-30 23:21:21.007563 save  5749 114603
2021-09-30 23:22:04.696356 save  5759 114803
2021-09-30 23:22:51.526262 save  5769 115002
2021-09-30 23:23:37.535647 save  5779 115202
2021-09-30 23:24:27.508904 save  5789 115399
2021-09-30 23:25:15.037610 save  5799 115599
2021-09-30 23:26:01.409787 save  5809 115799
2021-09-30 23:26:41.011530 save  5819 115998
2021-09-30 23:27:29.521209 save  5829 116198
2021-09-30 23:28:22.947242 save  5839 116398
2021-09-30

In [14]:
save_data = pd.DataFrame(translations)

In [15]:
save_data.head(3)

Unnamed: 0,text_original,rating,text,label,sentiment
0,Dieser Arzt ist das unmöglichste was mir in me...,6.0,"dieser arzt ist das unmöglichste, das ich je i...",negative,-1
1,Dieser Arzt ist das unmöglichste was mir in me...,6.0,"dieser arzt ist das unmöglichste, was ich jema...",negative,-1
2,Dieser Arzt ist das unmöglichste was mir in me...,6.0,dieser arzt ist am wenigsten unmöglich in mein...,negative,-1


In [16]:
save_dataframe(save_data, "data/german_doctor_reviews_augmented_translated_"+lang_to+".parq")

Load all the back translated text and perform normalization of the augmented data.

In [17]:
import glob

# All back-translated files (one per pivot language, e.g. "..._es.parq").
# Sorted because glob returns matches in arbitrary order.
files = sorted(glob.glob("data/german_doctor_reviews*augmented_trans*_[a-z][a-z].parq"))
print(files)

if not files:
    # pd.concat would otherwise fail with a much less helpful message
    raise FileNotFoundError("No back translated files found (data/german_doctor_reviews*augmented_trans*_[a-z][a-z].parq)")

dataframes = [load_dataframe(file) for file in files]

# ignore_index avoids duplicate index labels when several files are combined
data_aug = pd.concat(dataframes, ignore_index=True)

['data/german_doctor_reviews_augmented_translated_es.parq']


In [18]:
data_aug.shape

(131629, 5)

In [19]:
data_aug.head(3)

Unnamed: 0,text_original,rating,text,label,sentiment
0,Dieser Arzt ist das unmöglichste was mir in me...,6.0,"dieser arzt ist das unmöglichste, das ich je i...",negative,-1
1,Dieser Arzt ist das unmöglichste was mir in me...,6.0,"dieser arzt ist das unmöglichste, was ich jema...",negative,-1
2,Dieser Arzt ist das unmöglichste was mir in me...,6.0,dieser arzt ist am wenigsten unmöglich in mein...,negative,-1


In [20]:
from fhnw.nlp.utils.normalize import tokenize
from fhnw.nlp.utils.normalize import tokenize_stem
from fhnw.nlp.utils.normalize import tokenize_lemma
from fhnw.nlp.utils.normalize import normalize
from fhnw.nlp.utils.normalize import normalize_df
from fhnw.nlp.utils.text import clean_text_df
from fhnw.nlp.utils.text import join_tokens_df

In [21]:
!pip install 'spacy>=3.0.5'
!pip install nltk

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')
nltk.download('stopwords')

import spacy
!python3 -m spacy download de_core_news_lg

# German spaCy pipeline; passed as `lemmanizer` to normalize_df further below.
nlp = spacy.load("de_core_news_lg")

# Snowball stemmer for German; passed as `stemmer` to normalize_df further below.
stemmer = SnowballStemmer("german")
# Empty set for the steps that must not remove any stopwords.
empty_stopwords = set()
# NOTE: this rebinds the name `stopwords` from the imported nltk corpus module
# to a plain set of German stopwords; the nltk module is no longer reachable
# under that name after this line.
stopwords = set(stopwords.words("german"))
# Value handed to parallelize_dataframe as `n_cores` in the cells below.
n_cores = 4

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


2021-10-01 00:28:04.728499: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_lg')


In [22]:
%%time

# Make sure the back translation did not introduce malformed text: move the raw
# back translation to "text_tmp", write the cleaned version (punctuation kept)
# to "text" and drop the temporary column again.
data_aug = data_aug.rename(columns={"text": "text_tmp"})
data_aug = parallelize_dataframe(data_aug, clean_text_df, n_cores=n_cores, field_read="text_tmp", field_write="text", keep_punctuation=True)
data_aug = data_aug.drop(columns=["text_tmp"], errors="ignore")

CPU times: user 725 ms, sys: 416 ms, total: 1.14 s
Wall time: 2.82 s


In [23]:
%%time
save_dataframe(data_aug, "data/german_doctor_reviews_augmented_tokenized_tmp.parq")

CPU times: user 6.1 s, sys: 112 ms, total: 6.21 s
Wall time: 6.2 s


In [24]:
%%time
data_aug = parallelize_dataframe(data_aug, normalize_df, n_cores=n_cores, field_read="text", field_write="token_clean", stopwords=empty_stopwords, stemmer=None, lemmanizer=None, lemma_with_ner=False)

CPU times: user 4.29 s, sys: 656 ms, total: 4.95 s
Wall time: 31 s


In [25]:
%%time
save_dataframe(data_aug, "data/german_doctor_reviews_augmented_tokenized_tmp.parq")

CPU times: user 10.3 s, sys: 204 ms, total: 10.5 s
Wall time: 10.5 s


In [26]:
%%time
data_aug = parallelize_dataframe(data_aug, join_tokens_df, n_cores=n_cores, field_read="token_clean", field_write="text_clean", stopwords=empty_stopwords)

CPU times: user 31.2 s, sys: 1.18 s, total: 32.4 s
Wall time: 33.7 s


In [27]:
%%time
save_dataframe(data_aug, "data/german_doctor_reviews_augmented_tokenized_tmp.parq")

CPU times: user 14.7 s, sys: 312 ms, total: 15 s
Wall time: 14.9 s


In [28]:
%%time
data_aug = parallelize_dataframe(data_aug, normalize_df, n_cores=n_cores, field_read="token_clean", field_write="token_lemma", stopwords=stopwords, stemmer=None, lemmanizer=nlp, lemma_with_ner=False)

CPU times: user 1min, sys: 6.34 s, total: 1min 6s
Wall time: 3min 47s


In [29]:
%%time
save_dataframe(data_aug, "data/german_doctor_reviews_augmented_tokenized_tmp.parq")

CPU times: user 15.9 s, sys: 308 ms, total: 16.2 s
Wall time: 16.2 s


In [30]:
%%time
data_aug = parallelize_dataframe(data_aug, normalize_df, n_cores=n_cores, field_read="token_clean", field_write="token_stem", stopwords=stopwords, stemmer=stemmer, lemmanizer=None, lemma_with_ner=False)

CPU times: user 33 s, sys: 1.59 s, total: 34.6 s
Wall time: 47.5 s


In [31]:
%%time
save_dataframe(data_aug, "data/german_doctor_reviews_augmented_tokenized_tmp.parq")

CPU times: user 18.4 s, sys: 384 ms, total: 18.7 s
Wall time: 18.7 s


In [32]:
%%time
data_aug = parallelize_dataframe(data_aug, normalize_df, n_cores=n_cores, field_read="token_clean", field_write="token_clean_stopwords", stopwords=stopwords, stemmer=None, lemmanizer=None, lemma_with_ner=False)

CPU times: user 34.2 s, sys: 1.45 s, total: 35.6 s
Wall time: 42.1 s


In [33]:
%%time
save_dataframe(data_aug, "data/german_doctor_reviews_augmented_tokenized_tmp.parq")

CPU times: user 20.8 s, sys: 396 ms, total: 21.1 s
Wall time: 21.1 s


In [34]:
# Keep only rows whose "token_lemma" entry has more than one element.
data_aug = data_aug[data_aug["token_lemma"].map(len) > 1 ]

In [35]:
data_aug.head(3)

Unnamed: 0,text_original,rating,label,sentiment,text,token_clean,text_clean,token_lemma,token_stem,token_clean_stopwords
0,Dieser Arzt ist das unmöglichste was mir in me...,6.0,negative,-1,"dieser arzt ist das unmöglichste, das ich je i...","[dieser, arzt, ist, das, unmöglichste, ,, das,...","dieser arzt ist das unmöglichste , das ich je ...","[arzt, unmöglichste, je, leben, triefen, böswi...","[arzt, unmog, ,, je, leb, getroff, ,, boswill,...","[arzt, unmöglichste, ,, je, leben, getroffen, ..."
1,Dieser Arzt ist das unmöglichste was mir in me...,6.0,negative,-1,"dieser arzt ist das unmöglichste, was ich jema...","[dieser, arzt, ist, das, unmöglichste, ,, was,...","dieser arzt ist das unmöglichste , was ich jem...","[arzt, unmöglichste, jemals, leben, kennen, ve...","[arzt, unmog, ,, jemal, leb, kannt, ,, versaut...","[arzt, unmöglichste, ,, jemals, leben, kannte,..."
2,Dieser Arzt ist das unmöglichste was mir in me...,6.0,negative,-1,dieser arzt ist am wenigsten unmöglich in mein...,"[dieser, arzt, ist, am, wenigsten, unmöglich, ...",dieser arzt ist am wenigsten unmöglich in mein...,"[arzt, wenig, unmöglich, leben, finden, unfreu...","[arzt, wenig, unmog, leb, find, ,, unfreund, ,...","[arzt, wenigsten, unmöglich, leben, finden, ,,..."


In [37]:
#%%time
#save_dataframe(data_aug, "data/german_doctor_reviews_augmented_tokenized.parq")