In [1]:
!pip install 'fhnw-nlp-utils>=0.2.8,<0.3'

from fhnw.nlp.utils.processing import Preprocessor
from fhnw.nlp.utils.processing import parallelize_dataframe
from fhnw.nlp.utils.storage import download
from fhnw.nlp.utils.storage import load_dataframe

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.


In [2]:
# Source (Google Drive) and local target of the parquet file used throughout the notebook.
DATA_URL = "https://drive.google.com/uc?id=17nFv7PKC6YJttZT4D1txk4CCUgduq3pc"
DATA_PATH = "data/german_doctor_reviews_original.parq"

# Fetch the file to the local path, then read it into the dataframe `data`.
download(DATA_URL, DATA_PATH)
data = load_dataframe(DATA_PATH)

Here are two functions that preprocess text (as placeholders for any other text preprocessing function).
The function **tokenize** only accesses one field of a row (i.e. it processes a single column), whereas **tokenize_by_row** could potentially access all fields of a row (as an example of a more advanced function that needs several columns per row for its computation).

In contrast to the previous version, there is no longer any need for a function that handles the dataframe (i.e. no **tokenize_df** or the like), since this is handled internally.

In [3]:
def tokenize(text, stopwords, language="english"):
    """Tokenizes a text, lowercases the tokens and removes stopwords

    Parameters
    ----------
    text : str, iterable
        The text either as string or iterable of tokens (in this case tokenization is not applied)
    stopwords : set
        A set of stopwords to remove from the tokens. Tokens are lowercased
        before the lookup, so the entries are expected to be lowercase as well.
    language : str, optional
        The language handed to NLTK's ``word_tokenize`` when ``text`` is a
        string. Defaults to "english" (NLTK's own default), so existing
        callers get unchanged results; pass e.g. "german" for German text.

    Returns
    -------
    list
        The lowercased tokens that are not contained in ``stopwords``

    Raises
    ------
    TypeError
        If ``text`` is neither a string nor an iterable
    """
    from nltk.tokenize import word_tokenize
    from fhnw.nlp.utils.processing import is_iterable

    if isinstance(text, str):
        word_tokens = word_tokenize(text, language=language)
    elif is_iterable(text):
        # Already tokenized: use the tokens as they are.
        word_tokens = text
    else:
        raise TypeError("Only string or iterable (e.g. list) is supported. Received a "+ str(type(text)))

    # Lowercase every token exactly once and reuse it for both the stopword
    # lookup and the result.
    lowered_tokens = (word.lower() for word in word_tokens)
    return [token for token in lowered_tokens if token not in stopwords]

def tokenize_by_row(row, stopwords, language="english"):
    """Tokenizes the text of a row, lowercases the tokens and removes stopwords

    Parameters
    ----------
    row : a row of the pandas Dataframe (as a Series)
        Only the field "text_original" is read here
    stopwords : set
        A set of stopwords to remove from the tokens. Tokens are lowercased
        before the lookup, so the entries are expected to be lowercase as well.
    language : str, optional
        The language handed to NLTK's ``word_tokenize``. Defaults to
        "english" (NLTK's own default), so existing callers get unchanged
        results; pass e.g. "german" for German text.

    Returns
    -------
    list
        The lowercased tokens that are not contained in ``stopwords``
    """
    from nltk.tokenize import word_tokenize

    word_tokens = word_tokenize(row["text_original"], language=language)

    # Lowercase every token exactly once and reuse it for both the stopword
    # lookup and the result.
    lowered_tokens = (word.lower() for word in word_tokens)
    return [token for token in lowered_tokens if token not in stopwords]

In [5]:
import nltk
# Import the corpus reader under an alias so that the name `stopwords` only
# ever refers to the set built below and never to the reader.
from nltk.corpus import stopwords as nltk_stopwords

# Download the NLTK packages 'punkt' and 'stopwords'.
nltk.download('punkt')
nltk.download('stopwords')

# German stopwords as a set; this is the value passed as `stopwords=` to the
# tokenize functions in the cells below.
stopwords = set(nltk_stopwords.words("german"))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


The first example uses a function (i.e. **tokenize**) that accesses a single field of a row (i.e. one column).

In [6]:
%%time
data_test = data.head(50000)
data_test = parallelize_dataframe(data_test, tokenize, stopwords=stopwords, field_read="text_original", field_write="tokenized")
data_test.head(3)

CPU times: user 1.28 s, sys: 170 ms, total: 1.45 s
Wall time: 6.06 s


Unnamed: 0,text_original,rating,tokenized
0,Ich bin franzose und bin seit ein paar Wochen ...,2.0,"[franzose, seit, paar, wochen, muenchen, ., za..."
1,Dieser Arzt ist das unmöglichste was mir in me...,6.0,"[arzt, unmöglichste, leben, je, begegnet, ,, u..."
2,Hatte akute Beschwerden am Rücken. Herr Magura...,1.0,"[akute, beschwerden, rücken, ., herr, magura, ..."


The second example uses a function (i.e. **tokenize_by_row**) that could potentially access several fields of a row. Note that the parameter *field_read* is not provided in this example: whether or not it is given acts as the switch between the *single field* and the *complete row* mode.

In [7]:
%%time
data_test = data.head(50000)
data_test = parallelize_dataframe(data_test, tokenize_by_row, stopwords=stopwords, field_write="tokenized")
data_test.head(3)

CPU times: user 1.82 s, sys: 250 ms, total: 2.07 s
Wall time: 7.42 s


Unnamed: 0,text_original,rating,tokenized
0,Ich bin franzose und bin seit ein paar Wochen ...,2.0,"[ich, bin, franzose, und, bin, seit, ein, paar..."
1,Dieser Arzt ist das unmöglichste was mir in me...,6.0,"[dieser, arzt, ist, das, unmöglichste, was, mi..."
2,Hatte akute Beschwerden am Rücken. Herr Magura...,1.0,"[hatte, akute, beschwerden, am, rücken, ., her..."


This can be wrapped into a generic Preprocessor *Estimator* that can be used in a scikit-learn [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html). 

During inference (i.e. running live data in production) it can be handy to build a complete *processing pipeline*. This generic Preprocessor allows you to re-use the same functions/setup that were used during training without further modification (simple to use and less error-prone).

In [8]:
%%time

preprocessor = Preprocessor(tokenize, stopwords=stopwords, field_read="text_original", field_write="tokenized")
data_test = data.head(50000)
data_test = preprocessor.transform(data_test)
data_test.head(3)

CPU times: user 1.37 s, sys: 239 ms, total: 1.61 s
Wall time: 6.44 s


Unnamed: 0,text_original,rating,tokenized
0,Ich bin franzose und bin seit ein paar Wochen ...,2.0,"[franzose, seit, paar, wochen, muenchen, ., za..."
1,Dieser Arzt ist das unmöglichste was mir in me...,6.0,"[arzt, unmöglichste, leben, je, begegnet, ,, u..."
2,Hatte akute Beschwerden am Rücken. Herr Magura...,1.0,"[akute, beschwerden, rücken, ., herr, magura, ..."


In [9]:
%%time

preprocessor = Preprocessor(tokenize_by_row, stopwords=stopwords, field_write="tokenized")
data_test = data.head(50000)
data_test = preprocessor.transform(data_test)
data_test.head(3)

CPU times: user 1.96 s, sys: 329 ms, total: 2.29 s
Wall time: 7.58 s


Unnamed: 0,text_original,rating,tokenized
0,Ich bin franzose und bin seit ein paar Wochen ...,2.0,"[ich, bin, franzose, und, bin, seit, ein, paar..."
1,Dieser Arzt ist das unmöglichste was mir in me...,6.0,"[dieser, arzt, ist, das, unmöglichste, was, mi..."
2,Hatte akute Beschwerden am Rücken. Herr Magura...,1.0,"[hatte, akute, beschwerden, am, rücken, ., her..."
