In [1]:
import os
import numpy as np
import pandas as pd

from transformers import pipeline, set_seed
import spacy

NLP = spacy.load('pl_core_news_sm')

In [2]:
# example of text extending with language model:

generator = pipeline('text-generation', model='flax-community/papuGaPT2')
generator('Największym polskim poetą był', max_len=5)[0]['generated_text']

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Największym polskim poetą był Krzysztof Kamil Baczyński i jego teksty były tłumaczone na różne języki, m.in. na angielski, francuski, węgierski, polski, niemiecki czy rosyjski. Baczyński przetłumaczył też wiele innych utworów napisanych przez Ba'

In [3]:
RANDOM_STATE = 42
DATA_FOLDER = "data/"

#input data files
INPUT_FILE_X = "X_train.csv"
INPUT_FILE_y = "y_train.csv"


# output data files (after preprosessing and splitting)
X_TRAIN_OUTPUT = "X_train_augmented.csv"
Y_TRAIN_OUTPUT = "y_train_augmented.csv"

X_train = pd.read_csv(os.path.join(DATA_FOLDER, INPUT_FILE_X), index_col=0)
y_train = pd.read_csv(os.path.join(DATA_FOLDER, INPUT_FILE_y), index_col=0).iloc[:,0]

In [4]:
def augment_training_data(X, y, generator, target_counts="equal"):
    
    """
    Function augments text data in the following manner:
     - for each text in X
     - generates follow-up text - takes original as start point and 
             generate some words being a contiunuation of tex with language model `generator`
     - finally we use all orignal texts and artificially extended ones
     
    Params
    -----
     - X: array with strings: array with text data to augment
     - y: array with ints: labels of texts
     - generator: language model form HuggingFace; used to generate text
     - target_counts: dictionary - specifies the final amount of each class after augmentation
     
    Returns
     - Tuple of augmented X and y
    """
    
    
    y_counts = np.unique(y, return_counts=True)
    
    if target_counts == "equal": 
        most_frequent_frequency = np.max(y_counts[1])
        target_counts = {label:most_frequent_frequency for label in y_counts[0]}
    
    augmented_X = []
    augmented_y = []
    
    for label, counts in zip(*y_counts):
        
        current_class_texts = list(X[y==label])
        n = len(current_class_texts)
        
        augmented_X.extend(current_class_texts)
        augmented_y.extend([label]*n)
        
        if counts != target_counts[label]:
            
            i = 0          
            while i + n < target_counts[label]:
                augmented_X.append(generator(current_class_texts[i % n][0])[0]['generated_text'])
                augmented_y.append(label)
                i += 1
               
    return np.array(augmented_X), np.array(augmented_y)

In [15]:
X_train_augmented, y_train_augmented = augment_training_data(X_train, y_train, 
                          generator, 
                          target_counts={0: np.sum(y_train==0), 
                                         1: 1000, 
                                         2: 1000})

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [16]:
X_train_augmented_2 = pd.DataFrame(X_train_augmented, columns=["text"])
y_train_augmented_2 = pd.DataFrame(y_train_augmented, columns=["label"])

In [17]:
def lemmatize(x):
    doc = NLP(x) 
    return " ".join([token.lemma_ for token in doc])

X_train_augmented_2["text_lemmatized"] = X_train_augmented_2.text.apply(lemmatize)

In [18]:
X_train_augmented_2.to_csv(os.path.join(DATA_FOLDER, X_TRAIN_OUTPUT))
y_train_augmented_2.to_csv(os.path.join(DATA_FOLDER, Y_TRAIN_OUTPUT))

Komentarz:

Tak po fakcie myślę, że dobrze byłoby też dogenerować trochę zer, ponieważ model może się teraz uczyć, że mowa nienawiści to ta, która jest sztuczna.