# Pretraining with and without sentiment
***
Research question: If we train a model solely on facts, without any notion of good or bad, can the pretraining still alleviate the gap observed for random BERT?

In [8]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from datasets import Dataset

from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import transformers
from datasets import load_dataset
import datasets
import torch

from ailignment.datasets.util import get_accuracy_metric
from ailignment.datasets.moral_stories import make_action_classification_dataframe, get_random_value_dataset
import ailignment.datasets.moral_stories_clustered as msc
from ailignment.training import sequence_classification

# Widen pandas' column display to 400 characters per cell.
pd.options.display.max_colwidth = 400

## Preparing the dataset
***
Steps:
1. Extract sentences from wikitext using spacy
2. Run sentiment analyzer on the sentences
3. Split the sentences into two groups: those with a lot of sentiment and those without any; discard the rest.
4. Save both corpora

In [22]:
# NOTE(review): `min_sentence_len` is not read by any cell visible in this notebook section.
min_sentence_len = 150

# Load the raw WikiText-2 corpus (all splits).
dataset = load_dataset("wikitext", name="wikitext-2-raw-v1")
# merge splits into one single dataset?
# datasets.concatenate_datasets(list(dataset.values()))

Reusing dataset wikitext (/home/kiehne/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20)


  0%|          | 0/3 [00:00<?, ?it/s]

In [23]:
# Get rid of short lines: a row survives only if its whitespace-stripped text
# has at least `min_token` characters (the threshold counts characters, not tokens).
min_token = 1500


def _is_long_enough(example):
    """Return True when the stripped "text" field reaches the length threshold."""
    stripped = example["text"].strip()
    return len(stripped) >= min_token


dataset = dataset.filter(_is_long_enough)

Loading cached processed dataset at /home/kiehne/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20/cache-b7c067b1c46e7b00.arrow
Loading cached processed dataset at /home/kiehne/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20/cache-4426706ea9c19aa2.arrow
Loading cached processed dataset at /home/kiehne/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20/cache-891f737d3c0c97a7.arrow


In [24]:
import spacy

# Load spaCy's small English pipeline and append a 'sentencizer' component;
# `extract_sentences` below reads the sentence spans from `doc.sents`.
# NOTE(review): en_core_web_sm presumably already assigns sentence boundaries
# through its parser — confirm the extra sentencizer is actually needed.
nlp=spacy.load('en_core_web_sm')
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7fba9c40bd00>

In [25]:
# Grab the test split.
# NOTE(review): `data` is not read by any later cell visible here (`extract_sentences`
# uses its own parameter of the same name) — possibly a debugging leftover.
data = dataset["test"]

def extract_sentences(data):
    """Split every text of a batch into sentences using the spaCy pipeline `nlp`.

    `data` is a batch of a huggingface dataset with a "text" column. All
    sentences found in the batch are returned as one flat list under the key
    "sentences", so the output can have a different number of rows than the
    input batch.
    """
    found = []
    for doc in nlp.pipe(data["text"], batch_size=32):
        for sentence in doc.sents:
            found.append(sentence.text)
    return {"sentences": found}

In [26]:
# Replace the "text" column of every split by the sentences extracted from it.
sentences = dataset.map(
    extract_sentences,
    batched=True,
    remove_columns=["text"],
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [34]:
# run this cell to add bookcorpus sentences
bookcorpus = load_dataset("bookcorpus")
# Use the same column name as the wikitext sentences.
books = bookcorpus["train"].rename_column("text", "sentences")
# Keep only entries that are at least 25 characters long.
books = books.filter(lambda row: len(row["sentences"]) >= 25)

Reusing dataset bookcorpus (/home/kiehne/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/44662c4a114441c35200992bea923b170e6f13f2f0beb7c14e43759cec498700)


  0%|          | 0/1 [00:00<?, ?it/s]

In [53]:
# NOTE(review): this repeats the length filter already applied at the end of the
# previous cell; re-applying the same predicate should leave `books` unchanged.
books = books.filter(lambda x: len(x["sentences"])>=25)

  0%|          | 0/74005 [00:00<?, ?ba/s]

In [60]:
# Inspect the filtered bookcorpus sentences.
# NOTE(review): this cell originally displayed `a`, which no visible cell defines, so it
# failed on a fresh kernel; `books` matches the recorded output (single "sentences" column) — confirm.
books

Dataset({
    features: ['sentences'],
    num_rows: 59313984
})

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Off-the-shelf sentiment classifier used to score every sentence.
# The checkpoint name gets its own variable so that `model` only ever refers to the loaded model.
sentiment_checkpoint = "finiteautomata/bertweet-base-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(sentiment_checkpoint)

emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji


In [8]:
# tokenize everything...
def convert_hf_dataset(tokenizer, data, padding="max_length", text_column="sentences"):
    """Tokenize one text column of a huggingface dataset.

    Args:
        tokenizer: callable tokenizer; it is invoked on a batch (list) of
            strings with `padding`, `truncation=True` and
            `return_token_type_ids=True`.
        data: object offering `map(function, batched=True)`, e.g. a
            `datasets.Dataset` or `datasets.DatasetDict`.
        padding: forwarded unchanged to the tokenizer.
        text_column: name of the column holding the raw strings. Defaults to
            "sentences", the column produced earlier in this notebook.

    Returns:
        Whatever `data.map` returns for the batched tokenization function.
    """
    def tok(samples):
        # `samples` is a batch: a mapping from column name to a list of values.
        return tokenizer(samples[text_column], padding=padding,
                         truncation=True, return_token_type_ids=True)

    return data.map(tok, batched=True)

In [9]:
# Tokenize the extracted sentences of every split with the sentiment model's tokenizer.
tok_sents = convert_hf_dataset(tokenizer, sentences)

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/89 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

In [10]:
# The Trainer is only used for batched prediction with the sentiment model;
# `output_dir` is a throw-away location required by TrainingArguments.
training_args = TrainingArguments(
    per_device_eval_batch_size=128,
    output_dir="/data/kiehne/results/shuffled_values/trash/",
)
trainer = Trainer(args=training_args, model=model)

In [11]:
neutral_sents = datasets.DatasetDict()
sentiment_sents = datasets.DatasetDict()
# A sentence is kept as neutral if P(neutral) >= neutral_threshold and as
# sentiment-laden if P(neutral) < 1 - neutral_threshold; anything in between is dropped.
neutral_threshold = 0.95
# Class indices into the classifier output.
# NOTE(review): assumes the label order NEG/NEU/POS — confirm against model.config.id2label.
neg = 0; neu=1; pos=2

for split in sentences.keys():
    # Raw predictions for every sentence of this split, turned into class probabilities.
    r = trainer.predict(tok_sents[split])
    scores = torch.softmax(torch.from_numpy(r.predictions), dim=1, dtype=torch.float32)

    # Compute both keep-masks once, as plain Python bools indexed by row.
    # The previous predicates were written as `i > scores[i, neu] >= ...`, a chained
    # comparison that also required `i > probability`; that is always False for
    # row 0, so the first sentence of every split was silently dropped.
    neutral_prob = scores[:, neu]
    is_neutral = (neutral_prob >= neutral_threshold).tolist()
    has_sentiment = (neutral_prob < (1 - neutral_threshold)).tolist()

    ns = sentences[split].filter(lambda x, i: is_neutral[i], with_indices=True)
    ss = sentences[split].filter(lambda x, i: has_sentiment[i], with_indices=True)
    neutral_sents[split] = ns
    sentiment_sents[split] = ss

print("Neutral:", neutral_sents)
print("With Sentiment:", sentiment_sents)
    


The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentences.
***** Running Prediction *****
  Num examples = 10680
  Batch size = 256


  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentences.
***** Running Prediction *****
  Num examples = 88215
  Batch size = 256


  0%|          | 0/89 [00:00<?, ?ba/s]

  0%|          | 0/89 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentences.
***** Running Prediction *****
  Num examples = 9309
  Batch size = 256


  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

Neutral: DatasetDict({
    test: Dataset({
        features: ['sentences'],
        num_rows: 4097
    })
    train: Dataset({
        features: ['sentences'],
        num_rows: 30354
    })
    validation: Dataset({
        features: ['sentences'],
        num_rows: 3561
    })
})
With Sentiment: DatasetDict({
    test: Dataset({
        features: ['sentences'],
        num_rows: 240
    })
    train: Dataset({
        features: ['sentences'],
        num_rows: 2352
    })
    validation: Dataset({
        features: ['sentences'],
        num_rows: 200
    })
})


In [12]:
# Persist both corpora so the pretraining section can load them from disk.
for corpus, target_dir in (
    (neutral_sents, "/data/kiehne/results/shuffled_values/wiki-neutral/"),
    (sentiment_sents, "/data/kiehne/results/shuffled_values/wiki-sentiment/"),
):
    corpus.save_to_disk(target_dir)

# Running BERT pretraining
***
