# Анализ тональности (эмоциональной окраски) твитов.

(Kaggle kernel)

Импорт и установка необходимых библиотек.

In [None]:
!pip install '/kaggle/input/simple-transformers-pypi/seqeval-0.0.12-py3-none-any.whl' -q
!pip install '/kaggle/input/simple-transformers-pypi/simpletransformers-0.22.1-py3-none-any.whl' -q

In [None]:
import re
import gc
import time
import string
import random
import datetime
import itertools
import collections
import h5py
import typing
import nltk
import json
import sklearn
import transformers
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup,  RobertaModel, RobertaTokenizer
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.multiprocessing import Pipe, Process
from torch.utils import data
import allennlp
from allennlp.modules.elmo import Elmo, batch_to_ids
from allennlp.commands.elmo import ElmoEmbedder
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import spacy
import warnings
from sklearn.utils import shuffle
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences as pad

warnings.filterwarnings("ignore")

In [None]:
!mkdir data

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [None]:
def debug_memory():
    """Print peak memory usage and a census of live torch tensors.

    Writes to stdout: first ``maxrss`` as reported by
    ``resource.getrusage`` for the current process, then one line per
    distinct ``(device, dtype, shape)`` combination together with the
    number of tensors the garbage collector currently tracks for it.
    """
    import collections
    import gc
    import resource

    import torch

    def _is_tensor(obj):
        # gc.get_objects() returns arbitrary objects; some raise from the
        # attribute lookups that isinstance() performs. Treat those as
        # "not a tensor" instead of aborting the whole report.
        try:
            return torch.is_tensor(obj)
        except Exception:
            return False

    print('maxrss = {}'.format(
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
    census = collections.Counter(
        (str(obj.device), obj.dtype, tuple(obj.shape))
        for obj in gc.get_objects()
        if _is_tensor(obj)
    )
    # torch.dtype objects define no ordering, so sorting the raw keys raises
    # TypeError as soon as two tensors share a device but differ in dtype.
    # Order by the dtype's string form instead.
    ordered = sorted(
        census.items(),
        key=lambda entry: (entry[0][0], str(entry[0][1]), entry[0][2]),
    )
    for key, count in ordered:
        print('{}\t{}'.format(key, count))

In [None]:
# Helper functions needed for training spaCy models.
# Credit: https://www.kaggle.com/rohitsingh9990/ner-training-using-spacy-ensemble
def save_model(output_dir, nlp, new_model_name):
    """Write a pipeline to ``../working/<output_dir>``.

    Args:
        output_dir: Sub-directory (relative to ``../working``) to save into.
            When ``None`` nothing is written.
        nlp: Object exposing a ``meta`` mapping and ``to_disk(path)``; in
            this notebook it is the spaCy pipeline built by ``train``.
        new_model_name: Value stored under ``nlp.meta["name"]`` before
            the pipeline is written.
    """
    import os  # local import: the notebook imports os only in a later cell

    # The guard has to run before the path is formatted; once formatted the
    # value is always a string and a None check could never trigger.
    if output_dir is None:
        return
    target_dir = f'../working/{output_dir}'
    os.makedirs(target_dir, exist_ok=True)
    nlp.meta["name"] = new_model_name
    nlp.to_disk(target_dir)
    print("Saved model to", target_dir)

        
        
def train(train_data, output_dir, n_iter=20, model=None):
    """
    Load the model, set up the pipeline and train the entity recognizer.

    Args:
        train_data: List of ``(text, annotations)`` pairs where
            ``annotations["entities"]`` holds ``[start, end, label]`` items.
            The list is shuffled in place on every iteration.
        output_dir: Directory name handed to ``save_model`` after training;
            also the path passed to ``spacy.load`` when ``model`` is given.
        n_iter: Number of passes over ``train_data``.
        model: When not ``None`` an existing pipeline is loaded and training
            is resumed; otherwise a blank English pipeline is created.

    NOTE(review): ``tqdm``, ``minibatch`` and ``compounding`` are not imported
    in the visible part of the notebook -- confirm they are imported elsewhere
    (presumably ``from tqdm import tqdm`` and ``from spacy.util import
    minibatch, compounding``).
    """
    if model is not None:
        # NOTE(review): loads from `output_dir` while the message below reports
        # `model` -- confirm which of the two is the intended source path.
        nlp = spacy.load(output_dir)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")
    
    # register every entity label that occurs in the training data
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # a fresh pipeline gets begin_training(); a loaded one resume_training()
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()

        for itn in tqdm(range(n_iter)):
            # in-place shuffle: the caller's list order changes as well
            random.shuffle(train_data)
            # batch sizes come from compounding(4.0, 500.0, 1.001)
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))    
            losses = {}  # passed to nlp.update for every batch of this iteration
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,   # dropout - make it harder to memorise data
                    losses=losses, 
                )
            
            print("Losses", losses)
    # the pipeline is always stored under the fixed name 'st_ner'
    save_model(output_dir, nlp, 'st_ner')
    
    
def get_model_out_path(sentiment:str) -> str:
    """
    Return the model directory that belongs to a sentiment label.

    'positive' and 'negative' have dedicated directories; every other value
    falls back to the neutral model directory.
    """
    dedicated = (
        ('positive', 'models/model_pos'),
        ('negative', 'models/model_neg'),
    )
    for label, path in dedicated:
        if sentiment == label:
            return path
    return 'models/model_neu'

def get_training_data(train_df, sentiment:str) -> list:
    """
    Build spaCy-style NER examples for the rows of one sentiment.

    Every matching row yields ``(text, {"entities": [[start, end,
    'selected_text']]})`` where ``start`` is the result of
    ``text.find(selected_text)`` (so it is -1 when the fragment does not
    occur in the text) and ``end`` is ``start + len(selected_text)``.
    """
    examples = []
    for _, record in train_df.iterrows():
        if not (record.sentiment == sentiment):
            continue
        tweet = str(record.text)
        fragment = str(record.selected_text)
        begin = tweet.find(fragment)
        span = [begin, begin + len(fragment), 'selected_text']
        examples.append((tweet, {"entities": [span]}))
    return examples

def print_metric(data, batch, epoch, start, end, metric, typ):
    """Print one progress line for a training batch or a validation epoch.

    Args:
        data: Metric value to display.
        batch: Batch counter; printed as ``batch - 1`` when ``typ`` is
            ``"Train"``.
        epoch: Epoch index; printed as ``epoch + 1`` when ``typ`` is
            ``"Val"``.
        start: Start timestamp.
        end: End timestamp; ``end - start`` is rounded to one decimal and
            printed with an ``s`` suffix.
        metric: Name of the metric.
        typ: ``"Train"`` or ``"Val"``. Any other value prints the line
            without a prefix.
    """
    # Start from an empty prefix: previously any other `typ` left the prefix
    # unbound and the print below raised UnboundLocalError.
    prefix = ""
    if typ == "Train":
        prefix = "BATCH " + str(batch - 1) + " "
    elif typ == "Val":
        prefix = "\nEPOCH " + str(epoch + 1) + " "

    # Named `elapsed` rather than `time` so the time module is not shadowed.
    elapsed = np.round(end - start, 1)
    # The "%s" markers are printed literally, exactly as before.
    elapsed_text = "Time: %s{}%s s".format(elapsed)
    body = "{} {}: {}{}{}".format(typ, metric, "%s", data, "%s")

    print(prefix + body + "  " + elapsed_text)
    
    
def find_all(input_str, search_str):
    """Return the start offsets of every (possibly overlapping) occurrence
    of ``search_str`` in ``input_str``, in ascending order."""
    positions = []
    cursor = 0
    limit = len(input_str)
    while cursor < limit:
        hit = input_str.find(search_str, cursor)
        if hit < 0:
            break
        positions.append(hit)
        # Advance by one character only, so overlapping matches are found.
        cursor = hit + 1
    return positions

def do_qa_train(train):
    """Convert training rows into SQuAD-style question-answering records.

    Rows are read positionally: ``row[0]`` is the id, ``row[1]`` the
    context, ``row[2]`` the answer and ``row[-1]`` the question. Rows in
    which any of context, answer or question is not exactly ``str`` are
    printed and skipped. Only the first occurrence of the answer in the
    (un-lowercased) context is recorded; context and answer text are
    stored lowercased.
    """
    records = []
    for row in train:
        qid, context, answer, question = row[0], row[1], row[2], row[-1]

        all_text = (
            type(context) is str
            and type(answer) is str
            and type(question) is str
        )
        if not all_text:
            print(context, type(context))
            print(answer, type(answer))
            print(question, type(question))
            continue

        hits = find_all(context, answer)
        if hits:
            answers = [{'answer_start': hits[0], 'text': answer.lower()}]
        else:
            answers = []

        qa_entry = {'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}
        records.append({'context': context.lower(), 'qas': [qa_entry]})

    return records

def do_qa_test(test):
    """Convert test rows into SQuAD-style question-answering records.

    Rows are read positionally: ``row[0]`` is the id, ``row[1]`` the context
    and ``row[-1]`` the question. Rows whose context or question is not a
    string are printed and skipped. Because test rows carry no answer, each
    record gets a single placeholder answer (``'__None__'`` at offset
    1000000). The context is stored lowercased.
    """
    records = []
    for row in test:
        qid, context, question = row[0], row[1], row[-1]
        if not isinstance(context, str) or not isinstance(question, str):
            # There is no `answer` for test rows; the original also tried to
            # print one here and died with NameError instead of skipping.
            print(context, type(context))
            print(question, type(question))
            continue
        placeholder = {'answer_start': 1000000, 'text': '__None__'}
        qa_entry = {'question': question, 'id': qid, 'is_impossible': False, 'answers': [placeholder]}
        records.append({'context': context.lower(), 'qas': [qa_entry]})
    return records




In [None]:
def basic_cleaning(text):
    """Remove ``http(s)://www.<...>.com`` URLs, then drop every character
    that is neither an ASCII letter nor whitespace.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'https?://www\.\S+\.com', '', text)
    # Inside a character class '|' is a literal pipe, not alternation; the
    # previous class `[^A-Za-z|\s]` therefore let '|' survive the cleaning.
    text = re.sub(r'[^A-Za-z\s]', '', text)
    return text

def clean(df):
    """Run ``basic_cleaning`` over the 'text' and 'selected_text' columns.

    Both columns are cast to ``str`` first. The frame is modified in place
    and also returned.
    """
    for column in ('text', 'selected_text'):
        as_text = df[column].astype(str)
        df[column] = as_text.apply(basic_cleaning)
    return df

In [None]:
def format_time(elapsed):
    '''
    Render a duration given in seconds as text.

    The value is rounded to whole seconds and formatted the way
    ``str(datetime.timedelta)`` does, e.g. ``1:01:01``.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
sns.set_style("darkgrid")

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Импортируем данные для обучения модели.
train_df = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/train.csv")
test_df = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/test.csv")

In [None]:
df = train_df

In [None]:
df.head()

In [None]:
# Уникальные эмоциональные окраски текста:
df['sentiment'].unique()

In [None]:
# Bar chart with the number of tweets for each sentiment label.
fig = plt.figure(figsize=(16,10))
# Call the public seaborn entry point and pass the data by keyword:
# `sns.categorical` is an internal module path, and newer seaborn releases
# no longer accept the data vector as a positional argument.
sns.countplot(x=df['sentiment'])
plt.title("Distribution of number of tweets prior to sentiment");

Наблюдается некоторый дисбаланс классов.

In [None]:
groups = df.groupby('sentiment')['text']

In [None]:
# One KDE curve per sentiment over the tweet length in characters.
fig = plt.figure(figsize=(16,10))
for sentiment in df['sentiment'].unique():
    # str() leaves real strings untouched and turns NaN into 'nan'.
    tweet_lengths = groups.get_group(sentiment).apply(lambda item: len(str(item)))
    sns.distplot(
        tweet_lengths,
        hist=False,
        kde=True,
        bins=20,
        hist_kws={'edgecolor':'black'},
        kde_kws={'linewidth': 4},
        label=sentiment,
    );
plt.title('Distribution KDE of number of characters in tweets with respect to sentiment')
plt.xlabel('Lengths of tweets')
plt.ylabel('Density');

In [None]:
sentiments = df['sentiment'].unique()

In [None]:
# Histogram + KDE of the number of whitespace-separated words per tweet,
# one panel per sentiment, all sharing the same y-range.
fig, ax = plt.subplots(1, 3, figsize=(12, 6))
for i, sentiment in enumerate(sentiments):
    panel = ax[i]
    words_per_tweet = groups.get_group(sentiment).str.split().str.len()
    sns.distplot(words_per_tweet, ax=panel, hist=True, kde=True)
    panel.set_ylim(0, 0.06)
    panel.set_title(sentiment)

Как видим, пока что не было замечено никаких аномалий в данных.

In [None]:
# Punctuation characters recognised by the analysis below.
puncts = set(string.punctuation)

# 'punct' holds, for every tweet, the list of punctuation characters in it.
df['punct'] = df['text'].apply(lambda row: [ch for ch in str(row) if ch in puncts])
groups_1 = df.groupby('sentiment')['punct']

# KDE of the number of punctuation characters per tweet, one panel per sentiment.
fig, ax = plt.subplots(1, 3, figsize=(40, 10))
for axis, sentiment in zip(ax, df['sentiment'].unique()):
    punct_per_tweet = groups_1.get_group(sentiment).apply(len)
    sns.distplot(
        punct_per_tweet,
        hist=False,
        kde=True,
        bins=20,
        hist_kws={'edgecolor':'black'},
        kde_kws={'linewidth': 4},
        ax=axis,
    )
    axis.set_xlabel("Number of punctuations", fontsize=18)
    axis.set_ylabel("Density", fontsize=18)
    axis.set_title(sentiment.capitalize(), fontsize=22)

Легко заметить, что в текстах с позитивной тональностью встречается больше знаков пунктуации.

# **Попробуем найти закономерности в том, какие элементы чаще всего встречаются в твитах: проведем частотный анализ по знакам пунктуации и токенам.**

In [None]:
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.token_indexers import SingleIdTokenIndexer

# the token indexer is responsible for mapping tokens to integers
token_indexer = SingleIdTokenIndexer()

def tokenizer(x: str):
    """Split ``x`` into a list of token strings.

    Uses AllenNLP's ``SpacyWordSplitter`` with the 'en_core_web_sm' model
    and POS tagging switched off.
    """
    splitter = SpacyWordSplitter(language='en_core_web_sm', pos_tags=False)
    return [token.text for token in splitter.split_words(x)]

In [None]:
df['tokens'] = df['text'].apply(lambda row:  str(row).split())

In [None]:
groups2 = df.groupby("sentiment")

## Распределение знаков пунктуации:

In [None]:
# Frequency of each punctuation character, one bar chart per sentiment.
fig, ax = plt.subplots(1, 3, figsize=(22, 10))
for axis, sentiment in zip(ax, sentiments):
    punct_lists = groups2.get_group(sentiment)['punct'].tolist()
    punct_freq = collections.Counter(itertools.chain.from_iterable(punct_lists))
    # most_common() with no argument returns every entry, most frequent first.
    x, y = zip(*punct_freq.most_common())
    axis.bar(x, y)
    axis.set_xlabel("Punctuation signs")
    axis.set_ylabel("Number of occurences")
    axis.set_title(sentiment.capitalize())

# Распределение токенов (разбиение по пробелам):

In [None]:
# The 20 most frequent whitespace-separated tokens, one bar chart per sentiment.
fig, ax = plt.subplots(1, 3, figsize=(22, 10))
for axis, sentiment in zip(ax, sentiments):
    token_lists = groups2.get_group(sentiment)['tokens'].tolist()
    token_freq = collections.Counter(itertools.chain.from_iterable(token_lists))
    x, y = zip(*token_freq.most_common()[:20])
    axis.bar(x, y)
    # Vertical bars: tokens run along x and counts along y. The two labels
    # were swapped before.
    axis.set_xlabel("Words")
    axis.set_ylabel("Number of occurences")
    axis.set_title(sentiment.capitalize())

# Распределение слов:

In [None]:
df['tokens_no_punct'] = df['text'].apply(lambda row: tokenizer(basic_cleaning(str(row))))

In [None]:
groups3 = df.groupby("sentiment")

In [None]:
def preprocess_tweets(dataframe: pd.core.frame.DataFrame, n:int = 3, column: str = 'text') -> list:
    """Tokenize one column and keep only long, punctuation-free tokens.

    Every value of ``dataframe[column]`` is converted with ``str`` and passed
    to ``tokenizer``. A token is kept when it has at least ``n`` characters
    and shares no character with the module-level ``puncts`` set. Returns
    one token list per row.
    """
    def keep_long_clean_tokens(row):
        return [
            token
            for token in tokenizer(str(row))
            if len(token) >= n and len(puncts.intersection(token)) == 0
        ]

    return dataframe[column].apply(keep_long_clean_tokens).tolist()

## При наименьшей длине слова, равной 3 символам:

In [None]:
# The 20 most frequent tokens of at least 3 characters, per sentiment
# (horizontal bars: counts along x, words along y).
fig, ax = plt.subplots(1, 3, figsize=(22, 10))
for axis, sentiment in zip(ax, sentiments):
    token_lists = preprocess_tweets(groups3.get_group(sentiment), n=3)
    word_freq = collections.Counter(itertools.chain.from_iterable(token_lists))
    x, y = zip(*word_freq.most_common()[:20])
    sns.barplot(x=y, y=list(x), ax=axis)
    axis.set_xlabel("Number of occurences")
    axis.set_ylabel("Words")
    axis.set_title(sentiment.capitalize())

## При наименьшей длине слова, равной 4 символам:

In [None]:
# The 20 most frequent tokens of at least 4 characters, per sentiment
# (horizontal bars: counts along x, words along y).
fig, ax = plt.subplots(1, 3, figsize=(22, 10))
for axis, sentiment in zip(ax, sentiments):
    token_lists = preprocess_tweets(groups3.get_group(sentiment), n=4)
    word_freq = collections.Counter(itertools.chain.from_iterable(token_lists))
    x, y = zip(*word_freq.most_common()[:20])
    sns.barplot(x=y, y=list(x), ax=axis)
    axis.set_xlabel("Number of occurences")
    axis.set_ylabel("Words")
    axis.set_title(sentiment.capitalize())

## При наименьшей длине слова, равной 5 символам:

In [None]:
# The 20 most frequent tokens of at least 5 characters, per sentiment
# (horizontal bars: counts along x, words along y).
fig, ax = plt.subplots(1, 3, figsize=(22, 10))
for axis, sentiment in zip(ax, sentiments):
    token_lists = preprocess_tweets(groups3.get_group(sentiment), n=5)
    word_freq = collections.Counter(itertools.chain.from_iterable(token_lists))
    x, y = zip(*word_freq.most_common()[:20])
    sns.barplot(x=y, y=list(x), ax=axis)
    axis.set_xlabel("Number of occurences")
    axis.set_ylabel("Words")
    axis.set_title(sentiment.capitalize())

Можно видеть, что при увеличении минимальной длины слов становится заметнее разница в эмоциональной окраске наиболее часто встречающихся слов.

Теперь, когда мы взглянули на данные, предобработаем их и построим модель для решения задачи.

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Pairs each input sample with its class label.

    ``x`` is stored untouched; ``y`` is converted to an int64 tensor. Items
    are returned as ``(x[index], y[index][0])``, so ``y`` has to provide at
    least one value per sample along a second axis.
    """

    def __init__(self, x, y ):
        self.sentences = x
        self.target = torch.tensor(y).to(dtype=torch.long)

    def __len__(self) -> int:
        # The number of samples is taken from the label tensor.
        return len(self.target)

    def __getitem__(self, index:int) -> tuple:
        label_row = self.target[index]
        return (self.sentences[index], label_row[0])
    
        

In [None]:
class ELMOSentenceClassifier(nn.Module):
    """
    A simple classifier built on top of ELMo embeddings.

    The three ELMo representations are averaged and passed through two
    linear layers (1024 -> 512 -> 3) with no activation in between.
    """
    def __init__(self, dropout=0.5):
        
        super(ELMOSentenceClassifier, self).__init__()
        # NOTE(review): `dropout` is stored but never applied in forward();
        # the Elmo module below is created with dropout=0.
        self.dropout = dropout
        # Pinned to CPU. The GPU selection
        # `torch.device("cuda:0" if torch.cuda.is_available() else "cpu")`
        # was disabled because it reportedly did not work on the Kaggle GPU.
        self.device = torch.device("cpu")
        # The pretrained ELMo options and weights are referenced by URL.
        options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
        # The third positional argument is 3 and forward() reads three entries
        # of 'elmo_representations'.
        self.elmo = Elmo(options_file, weight_file, 3, dropout=0)

        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 3)
        
        
        
    def forward(self, sentences):
        """
        Input:
        ::param sentences - a batch of tokenized sentences (list of lists of
            tokens), converted to character ids with `batch_to_ids`.

        Returns the first element along dim 0 of the softmax output.
        """
        x = batch_to_ids(sentences) 
        x = x.to(self.device)
        x = self.elmo(x)
        # Element-wise mean of the three ELMo representations, written out by
        # hand (the original author could not get torch.mean to work here).
        x = (x['elmo_representations'][0] + x['elmo_representations'][1] + x['elmo_representations'][2])/3
        x = self.fc1(x)
        x = self.fc2(x)
        # NOTE(review): softmax runs over dim 0, which is presumably the batch
        # dimension rather than the class dimension -- TODO confirm. In
        # addition, train_model feeds this output to nn.CrossEntropyLoss,
        # which expects unnormalised logits.
        x = F.softmax(x, dim=0)
        # NOTE(review): only the first element along dim 0 is returned, so
        # presumably all other sentences of the batch are dropped -- confirm
        # the intended output shape.
        return x[0]


In [None]:
def train_model(model, train_tuple, test_tuple, n_epochs,  learning_rate=1e-4, param_dict=None):
    """Train ``model`` with Adam and cross-entropy, evaluating after each epoch.

    Args:
        model: ``nn.Module`` called with a batch of features and returning the
            scores handed to ``nn.CrossEntropyLoss``.
        train_tuple: ``(x, y)`` pair used to build the training
            ``CustomDataset``.
        test_tuple: ``(x, y)`` pair used to build the evaluation
            ``CustomDataset``.
        n_epochs: Number of passes over the training data.
        learning_rate: Learning rate of the Adam optimizer.
        param_dict: Keyword arguments for both ``DataLoader`` objects.
            ``None`` selects ``batch_size=64, shuffle=True, num_workers=1``.

    Returns:
        ``(model, train_losses, test_losses)`` where both loss lists hold one
        mean loss (a Python float) per epoch.
    """
    # Build the default per call instead of sharing one mutable default dict.
    if param_dict is None:
        param_dict = {'batch_size': 64, 'shuffle': True, 'num_workers': 1}

    # Training stays on the CPU; GPU selection is deliberately disabled here.
    device = torch.device("cpu")
    print(f"starting training on device: {device}")

    train_dataloader = data.DataLoader(CustomDataset(*train_tuple), **param_dict)
    test_dataloader = data.DataLoader(CustomDataset(*test_tuple), **param_dict)

    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    start = time.time()
    train_losses = []
    test_losses = []
    for epoch in range(n_epochs):
        print(f"[{time.time() - start}] On epoch {epoch}")

        model.train()
        running_loss = 0.0
        for step, (local_features, local_target) in enumerate(train_dataloader, start=1):
            optimizer.zero_grad()
            # Tensor.to() returns a new tensor; the result has to be kept.
            local_target = local_target.to(device)
            outputs = model(local_features)
            loss = criterion(outputs, local_target)
            loss.backward()
            optimizer.step()
            # .item() detaches the value; summing the loss tensors themselves
            # would keep every batch's autograd graph alive.
            running_loss += loss.item()
            if step % 10 == 0:
                print(f"[{time.time() - start}] Epoch {epoch}, step {step}/{len(train_dataloader)}, loss: {running_loss/step}")
        train_losses.append(running_loss / len(train_dataloader))

        model.eval()
        test_loss = 0.0
        with torch.no_grad():
            for local_features, local_target in test_dataloader:
                local_target = local_target.to(device)
                outputs = model(local_features)
                test_loss += criterion(outputs, local_target).item()
        test_losses.append(test_loss / len(test_dataloader))
        print(f"[{time.time() - start}] Test loss: {test_losses[-1]}")

    # The original returned the misspelt name `test_lossss` (NameError).
    return model, train_losses, test_losses

In [None]:
"""model, train_losses, test_losses = train_model(model=ELMOSentenceClassifier(dropout=0.0),
                    train_tuple=(x_train, y_train),
                    test_tuple=(x_test, y_test),
                   n_epochs=10,
                   )"""

# Модель BERT требует специфичной предобработки данных.

# Longest tweet, measured in whitespace-separated tokens (shown as cell output).
df['tokens'].apply(lambda row: len(row)).max()

bertTokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Single source of truth for the encoding settings. The dict used to be
# defined but ignored, with the same values repeated inline at the call site.
tokenizer_param_dict={
    "add_special_tokens": True, # Add '[CLS]' and '[SEP]'
    "max_length": 106,  # Pad & truncate all sentences.
    "pad_to_max_length": True,
    "return_attention_mask": True,   # Construct attn. masks.
    "return_tensors": 'pt'     # Return pytorch tensors.
}

sent_ids = []
sent_masks = []

# Encode every tweet; str() also covers non-string cells such as NaN.
for sentence in df['text']:
    encoded_dict = bertTokenizer.encode_plus(str(sentence), **tokenizer_param_dict)
    sent_ids.append(encoded_dict['input_ids'])
    sent_masks.append(encoded_dict['attention_mask'])

# Each encoding is returned as its own tensor; join them along dim 0.
sent_ids = torch.cat(sent_ids, dim=0)
sent_masks = torch.cat(sent_masks, dim=0)

# Map the sentiment strings to integer class ids; the reshape makes the
# labels a single-column 2-D array, so `labels` has one column.
labels = sklearn.preprocessing.OrdinalEncoder().fit_transform(df['sentiment'].to_numpy().reshape(-1,1))

labels = torch.tensor(labels).long()

# Random 90 % / 10 % train / validation split.
dataset = torch.utils.data.TensorDataset(sent_ids, sent_masks, labels)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

batch_size = 32


train_dataloader = torch.utils.data.DataLoader(
            train_dataset,  # The training samples.
            sampler = torch.utils.data.RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )
validation_dataloader = torch.utils.data.DataLoader(
            val_dataset, # The validation samples.
            sampler = torch.utils.data.SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 3, # One output per sentiment class.
    output_attentions = False,
    output_hidden_states = False,
)
# Use the GPU when there is one and fall back to the CPU otherwise; an
# unconditional `model.cuda()` raises on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(),
                  lr = 1e-4, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )


# The BERT paper recommends 2-4 fine-tuning epochs.
epochs = 4

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

def one_hot(y, num_classes=3):
    """Return a one-hot float vector for the class index stored in ``y[0]``.

    Args:
        y: Indexable whose first element is an integer class index.
        num_classes: Length of the returned vector. Defaults to 3, the
            value that used to be hard-coded.

    Returns:
        1-D ``numpy`` array of zeros with a single 1 at position ``y[0]``.
    """
    encoded = np.zeros(num_classes)
    encoded[y[0]] = 1
    return encoded

Следуя рекомендациям авторов оригинальной статьи о BERT, выберем размер минибатча, равный 32.

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Make sure the model lives on the same device the batches are copied to.
model.to(device)
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Per-epoch statistics: losses, validation score and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0

    # train() only switches the mode (dropout etc.); it performs no training.
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` holds three tensors: [0] input ids, [1] attention masks,
        # [2] labels. Copy each of them to the training device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Gradients accumulate by default, so clear them before each step.
        model.zero_grad()

        # Because labels are supplied, the first output is the loss and the
        # second the logits. Indexing (instead of tuple unpacking) works for
        # both the tuple and the ModelOutput return types.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs[0]

        total_train_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Average loss over all training batches.
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode: dropout layers behave differently than in training.
    model.eval()

    total_eval_loss = 0
    all_predictions = []
    all_label_ids = []

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No compute graph is needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
        loss, logits = outputs[0], outputs[1]

        total_eval_loss += loss.item()

        # Move logits and labels to the CPU and keep them for the epoch score.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        all_predictions.append(np.argmax(logits, axis=1))
        all_label_ids.append(label_ids.reshape(-1))

    # Weighted F1 over the whole validation set. Averaging per-batch F1
    # scores (as before) depends on the batch composition and is not the F1
    # of the validation set.
    avg_val_accuracy = sklearn.metrics.f1_score(np.concatenate(all_label_ids),
                                                np.concatenate(all_predictions),
                                                average='weighted')
    print("  F1-score: {0:.2f}".format(avg_val_accuracy))

    # Average loss over all validation batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch. 'Valid. Accur.' keeps its
    # historical key but holds the weighted F1 score.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

# Теперь попробуем другой подход: обучим две NER-модели (для позитивных и негативных твитов); для нейтральных (neutral) будем возвращать само предложение целиком.

In [None]:
groups = df.groupby('sentiment')

In [None]:
df_pos = groups.get_group('positive')
df_neg = groups.get_group('negative')
df_neu = groups.get_group('neutral')

In [None]:
arr = np.array([1, 2, 3 ,4 , 5])

In [None]:
set([1, 3, 4]).intersection(set([1]))

In [None]:
def calculate_jaccard_score(sents, masks, targets):
    """
    Mean Jaccard similarity between predicted and reference token selections.

    For every sentence, the tokens picked by the predicted mask and the tokens
    picked by the target mask are turned into sets A and B and the score
    |A & B| / |A | B| is computed. Two empty selections count as a perfect
    match (score 1).

    A mask may be longer than its sentence (the extra positions are ignored)
    or shorter (tokens without a mask entry count as "not selected"), and the
    mask rows do not have to share one length.

    ::param sents - sequence of initial sentences, tokenized.
    ::param masks - predicted masks, one per sentence; non-zero means "selected".
    ::param targets - reference masks, one per sentence; non-zero means "selected".
    ::return mean of the per-sentence scores; NaN when there are no sentences.
    """
    assert len(sents) == len(masks) and len(targets) == len(masks), f"Input arrays lengths do not match, got {len(sents)}, {len(masks)}, {len(targets)}"

    def _selected_tokens(tokens, mask):
        # Convert each row on its own so ragged mask lists are accepted, and
        # let zip() stop at the shorter of tokens / flags.
        flags = np.asarray(mask, dtype=bool)
        return {token for token, flag in zip(tokens, flags) if flag}

    dists = []
    for sentence, source_mask, target_mask in zip(sents, masks, targets):
        source_set = _selected_tokens(sentence, source_mask)
        target_set = _selected_tokens(sentence, target_mask)
        denominator = len(source_set | target_set)
        if denominator == 0:
            # Nothing selected on either side: treat as full agreement.
            dists.append(1)
        else:
            dists.append(len(source_set & target_set) / denominator)

    if not dists:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return np.float64('nan')
    return np.mean(dists)

In [None]:
def make_int_mask(source_array,target_array, max_len=106):
    """
    Build a 0/1 mask of length max_len over the positions of source_array.

    Position i is 1 when source_array has an element at index i and that
    element is contained in target_array; every other position is 0.
    """
    source_len = len(source_array)
    return [
        int(pos < source_len and source_array[pos] in target_array)
        for pos in range(max_len)
    ]
        

In [None]:
def prepair_dataloaders(dataframe, test_size=0.1, batch_size=32, max_len=54):
    """
    Tokenize tweets with the roberta-base tokenizer and build train/validation loaders.

    Every row yields three tensors: the token ids, the attention mask, and a
    0/1 target mask (see make_int_mask) marking which whitespace-separated
    words of `text` also occur among the words of `selected_text`.

    NOTE(review): the target mask is indexed by whitespace word position while
    the model inputs are indexed by tokenizer position - confirm that this
    alignment is intended.

    ::param dataframe - frame with `text` and `selected_text` columns.
    ::param test_size - fraction of the samples held out for validation.
    ::param batch_size - batch size used by both loaders.
    ::param max_len - max_length handed to the tokenizer and length of the target mask.
    ::return (train_dataloader, validation_dataloader, tokenizer)
    """
    robertaTokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

    sent_ids = []
    sent_masks = []
    targets = []
    for sentence, target in zip(dataframe['text'], dataframe['selected_text']):
        # Coerce once so the tokenizer and the word mask see the same string;
        # a non-string cell (e.g. a missing value) would otherwise fail on .split().
        sentence, target = str(sentence), str(target)
        encoded_dict = robertaTokenizer.encode_plus(sentence, add_special_tokens=True, max_length=max_len, return_attention_mask=True, pad_to_max_length=True, return_tensors='pt')
        sent_ids.append(encoded_dict['input_ids'])
        sent_masks.append(encoded_dict['attention_mask'])
        targets.append(make_int_mask(sentence.split(), target.split(), max_len=max_len))

    dataset = torch.utils.data.TensorDataset(
        torch.cat(sent_ids, dim=0),
        torch.cat(sent_masks, dim=0),
        torch.tensor(targets).long(),
    )

    # Random train/validation split; the validation part takes the remainder.
    train_size = int((1-test_size) * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    print('{:>5,} training samples'.format(train_size))
    print('{:>5,} validation samples'.format(val_size))

    # Training batches are drawn in random order, validation batches sequentially.
    train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            sampler = torch.utils.data.RandomSampler(train_dataset),
            batch_size = batch_size
        )
    validation_dataloader = torch.utils.data.DataLoader(
            val_dataset,
            sampler = torch.utils.data.SequentialSampler(val_dataset),
            batch_size = batch_size
        )

    return train_dataloader, validation_dataloader, robertaTokenizer

Сначала тренируем модель для позитивных примеров.

In [None]:
# Build the train/validation loaders (and the tokenizer) from the positive tweets only.
train_dataloader, validation_dataloader, tokenizer = prepair_dataloaders(df_pos)

In [None]:
# Load pretrained roberta-base with a token-classification head (attentions and
# hidden states are not returned) and move it to the GPU.
model = transformers.RobertaForTokenClassification.from_pretrained('roberta-base', output_attentions=False, output_hidden_states=False )
model.cuda()

In [None]:
# Number of fine-tuning epochs (the original paper suggests 2-4).
epochs = 4

# One scheduler step is taken per batch, so the schedule spans
# [number of batches] x [number of epochs] steps (not the number of samples).
total_steps = len(train_dataloader) * epochs

# AdamW with learning rate 2e-5 and epsilon 1e-8.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Linear learning-rate schedule without warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# Fine-tune `model` on `train_dataloader` and, after every epoch, score it on
# `validation_dataloader` with the mean Jaccard score of the selected words.
# Adapted from the Hugging Face `run_glue.py` example:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Per-epoch statistics: losses, validation score and timings.
training_stats = []

# Start of the whole run, used for the final timing report.
total_t0 = time.time()

for epoch_i in range(epochs):
    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0

    # Switch to training mode (this only changes the mode, e.g. enables dropout).
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Progress update every 40 batches.
        if step and step % 40 == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` holds (input ids, attention masks, labels).
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Gradients accumulate across backward passes, so clear them first.
        model.zero_grad()

        # Because labels are supplied, the first two outputs are the loss and
        # the logits; slicing keeps this working if further outputs follow.
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)[:2]

        total_train_loss += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode: dropout layers behave differently from training.
    model.eval()

    # Running sums over the validation batches; the "accuracy" is the Jaccard score.
    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            loss, outputs = model(b_input_ids,
                                  token_type_ids=None,
                                  attention_mask=b_input_mask,
                                  labels=b_labels)[:2]

        total_eval_loss += loss.item()

        # Highest-scoring label per position, as a NumPy array on the CPU.
        predictions = torch.argmax(outputs, dim=2).cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        # Decode every sequence back to lower-cased whitespace-separated words.
        source_sents = [
            [word.lower() for word in tokenizer.decode(ids, skip_special_tokens=True).split()]
            for ids in b_input_ids.tolist()
        ]
        total_eval_accuracy += calculate_jaccard_score(source_sents, predictions, label_ids)

    # Both averages are means of per-batch values.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Jaccard score: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch ('Valid. Accur.' holds the Jaccard score).
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

И проделаем то же самое с негативным классом.

In [None]:
# Keep the model that was just fine-tuned on the positive tweets under its own name.
model_pos = model

In [None]:
# Rebuild the train/validation loaders (and the tokenizer) from the negative tweets only.
train_dataloader, validation_dataloader, tokenizer = prepair_dataloaders(df_neg)

In [None]:
# Rebind `model` to a fresh pretrained roberta-base token classifier for the
# negative class and move it to the GPU.
model = transformers.RobertaForTokenClassification.from_pretrained('roberta-base', output_attentions=False, output_hidden_states=False )
model.cuda()

In [None]:
# Number of fine-tuning epochs (the original paper suggests 2-4).
epochs = 4

# One scheduler step is taken per batch, so the schedule spans
# [number of batches] x [number of epochs] steps (not the number of samples).
total_steps = len(train_dataloader) * epochs

# AdamW with learning rate 2e-5 and epsilon 1e-8.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Linear learning-rate schedule without warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# Fine-tune `model` on `train_dataloader` and, after every epoch, score it on
# `validation_dataloader` with the mean Jaccard score of the selected words.
# Adapted from the Hugging Face `run_glue.py` example:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Per-epoch statistics: losses, validation score and timings.
training_stats = []

# Start of the whole run, used for the final timing report.
total_t0 = time.time()

for epoch_i in range(epochs):
    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0

    # Switch to training mode (this only changes the mode, e.g. enables dropout).
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Progress update every 40 batches.
        if step and step % 40 == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` holds (input ids, attention masks, labels).
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Gradients accumulate across backward passes, so clear them first.
        model.zero_grad()

        # Because labels are supplied, the first two outputs are the loss and
        # the logits; slicing keeps this working if further outputs follow.
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)[:2]

        total_train_loss += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode: dropout layers behave differently from training.
    model.eval()

    # Running sums over the validation batches; the "accuracy" is the Jaccard score.
    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            loss, outputs = model(b_input_ids,
                                  token_type_ids=None,
                                  attention_mask=b_input_mask,
                                  labels=b_labels)[:2]

        total_eval_loss += loss.item()

        # Highest-scoring label per position, as a NumPy array on the CPU.
        predictions = torch.argmax(outputs, dim=2).cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        # Decode every sequence back to lower-cased whitespace-separated words.
        source_sents = [
            [word.lower() for word in tokenizer.decode(ids, skip_special_tokens=True).split()]
            for ids in b_input_ids.tolist()
        ]
        total_eval_accuracy += calculate_jaccard_score(source_sents, predictions, label_ids)

    # Both averages are means of per-batch values.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Jaccard score: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch ('Valid. Accur.' holds the Jaccard score).
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
# Keep the model that was just fine-tuned on the negative tweets under its own name.
model_neg = model

In [None]:
def clf(text ,sentiment):
    """
    Extract the sentiment-bearing words of a tweet.

    ::param text - the tweet text (converted with str(), so non-string values do not crash).
    ::param sentiment - "positive" selects model_pos, "negative" selects model_neg;
                        any other value is treated as neutral.
    ::return for positive/negative tweets: the whitespace-separated words of `text`
             whose position the selected model labels non-zero, joined by single
             spaces; otherwise the stripped text itself.
    """
    text = str(text)
    if sentiment == "positive":
        tagger = model_pos
    elif sentiment == 'negative':
        tagger = model_neg
    else:
        # Neutral (or unknown) sentiment: the whole tweet is the answer.
        return text.strip()

    input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0)
    # Run on whatever device the model lives on instead of assuming CUDA.
    input_ids = input_ids.to(next(tagger.parameters()).device)

    with torch.no_grad():
        # Without labels no loss is computed and the logits are the first output.
        logits = tagger(input_ids)[0]

    # Label predicted for every position of the single sequence in the batch.
    predictions = torch.argmax(logits, dim=2)[0].cpu().tolist()

    # Word i is kept when position i was labelled non-zero; zip() stops at the
    # shorter of the two sequences.
    kept_words = [word for word, label in zip(text.split(), predictions) if label]
    return " ".join(kept_words).strip()

In [None]:
# Spot check on row 12 of `df`: predicted selection, reference selection, sentiment.
(lambda i: clf(df['text'][i],df['sentiment'][i] ))(12), df['selected_text'][12], df['sentiment'][12]

In [None]:
# Load the competition test set.
test_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')

In [None]:
# Predicted selection for every test row, in row order.
preds = [
    clf(text, sentiment)
    for text, sentiment in zip(test_df['text'], test_df['sentiment'])
]

In [None]:
# Display the text of the second test row.
test_df.iloc[1]['text']

In [None]:
# Load the sample submission, used as the template for the output file.
sample = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv')

In [None]:
# Fill in the predictions.
# NOTE(review): assumes the rows of `sample` are in the same order as `test_df` - confirm.
sample.loc[:, 'selected_text'] = preds

In [None]:
# Write the submission file without the index column.
sample.to_csv("submission.csv", index=False)

## Попробуем другую модель из семейства BERT — RoBERTa от Facebook AI.
Также немного улучшим класс датасета.

In [None]:
class TweetDataset(torch.utils.data.Dataset):
    """
    Dataset that turns tweets into fixed-length inputs for sentiment classification.

    Parameters:
    ::param data - data frame with `text` and `sentiment` columns
    ::param tokenizer - tokenizer applied to the text (its `encode` method is used)

    The module-level constant MAXLEN fixes the length of every returned sequence.
    """
    def __init__(self, data, tokenizer):
        self.data = data
        self.text = data.text
        self.tokenizer = tokenizer
        self.sentiment = data.sentiment
        # Class index of every sentiment label (used for the one-hot target).
        self.sentiment_dict = {"positive": 0, "neutral": 1, "negative": 2}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i:int) -> tuple:
        """
        Return (one-hot sentiment, token ids, attention mask) for the i-th row.

        Each of the three tensors has a leading dimension of 1.
        """
        # presumably RoBERTa's sequence-start / sequence-end token ids - confirm against the tokenizer
        start, finish = 0, 2
        pg, tg = 'post', 'post'
        # Positional access, so the frame does not need a freshly reset index.
        tweet = str(self.text.iloc[i]).strip()
        tweet_ids = self.tokenizer.encode(tweet)

        attention_mask_idx = len(tweet_ids) - 1
        # `start` is an int, so it has to be wrapped in a list before concatenation.
        # NOTE(review): attention_mask_idx is computed before the start id is
        # prepended - confirm the intended mask span for this case.
        if start not in tweet_ids: tweet_ids = [start] + tweet_ids
        # Pad (with id 1) or cut at the end so that the row has exactly MAXLEN ids.
        tweet_ids = pad([tweet_ids], maxlen=MAXLEN, value=1, padding=pg, truncating=tg)

        # Attend to positions 1 .. attention_mask_idx-1; position 0 and everything
        # from attention_mask_idx on stay masked out.
        attention_mask = np.zeros(MAXLEN)
        attention_mask[1:attention_mask_idx] = 1
        attention_mask = attention_mask.reshape((1, -1))
        # Both arrays are indexed as (1, MAXLEN): address the last *position* of
        # the single row; indexing with [-1] alone would overwrite the whole row.
        if finish not in tweet_ids: tweet_ids[0, -1], attention_mask[0, -1] = finish, start

        sentiment = [self.sentiment_dict[self.sentiment.iloc[i]]]
        sentiment = torch.FloatTensor(to_categorical(sentiment, num_classes=3))
        return sentiment, torch.LongTensor(tweet_ids), torch.LongTensor(attention_mask)

In [None]:
class Roberta(nn.Module):
    """
    Sentiment classifier: pretrained RoBERTa encoder + dropout + linear layer + softmax.

    The encoder is loaded from the name stored in the module-level variable
    `model`; DROP_RATE, ROBERTA_UNITS, OUTPUT_UNITS and MAXLEN are module-level
    constants as well.
    """
    def __init__(self):
        super().__init__()
        self.softmax = nn.Softmax(dim=1)
        self.drop = nn.Dropout(DROP_RATE)
        self.roberta = RobertaModel.from_pretrained(model)
        self.dense = nn.Linear(ROBERTA_UNITS, OUTPUT_UNITS)

    def forward(self, inp, att):
        """
        Return class probabilities (each row sums to 1) for a batch of token ids.

        ::param inp - token ids; flattened to rows of MAXLEN ids
        ::param att - attention mask with one entry per token id
        """
        inp = inp.view(-1, MAXLEN)
        # Flatten the mask the same way as the ids so that both have one row per
        # sequence, instead of relying on broadcasting of an extra dimension.
        att = att.reshape(-1, MAXLEN)
        # The second encoder output is used as the sentence feature (presumably the
        # pooled representation - confirm); slicing tolerates additional outputs.
        _, self.feat = self.roberta(inp, att)[:2]
        return self.softmax(self.dense(self.drop(self.feat)))

In [None]:
# Hyper-parameters of the RoBERTa sentiment classifier.
EPOCHS = 20        # number of training epochs
SPLIT = 0.8        # fraction of the shuffled data used for training; the rest is validation
MAXLEN = 48        # fixed length of every token-id sequence and attention mask
DROP_RATE = 0.3    # dropout probability applied before the dense layer
# Seed NumPy's global random generator.
np.random.seed(42)

OUTPUT_UNITS = 3   # number of sentiment classes
BATCH_SIZE = 256   # training batch size
LR = (4e-5, 1e-2)  # learning rates: (RoBERTa encoder, dense head)
ROBERTA_UNITS = 768  # input width of the dense head (presumably roberta-base's hidden size - confirm)
VAL_BATCH_SIZE = 384  # validation batch size
MODEL_SAVE_PATH = 'sentiment_model.pt'  # where the trained weights are written

In [None]:
# From here on `model` is the *name* of the pretrained checkpoint (a string);
# the same name is used to load the matching tokenizer.
model = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model)

In [None]:
def cel(inp, target):
    """
    Summed cross-entropy between predicted class probabilities and one-hot targets.

    ::param inp - (batch, classes) tensor of class probabilities, i.e. already
                  softmax-normalised, which is what Roberta.forward returns
    ::param target - (batch, classes) tensor; the arg-max of each row is the true class
    ::return scalar tensor: the mean negative log-likelihood times the batch size

    nn.CrossEntropyLoss applies log-softmax itself, so feeding it probabilities
    would normalise twice; the log of the probabilities is passed to NLLLoss instead.
    """
    _, labels = target.max(dim=1)
    # Clamp so that a probability of exactly 0 does not turn into -inf.
    log_probs = torch.log(inp.clamp_min(1e-12))
    return nn.NLLLoss()(log_probs, labels)*len(inp)

def accuracy(inp, target):
    """
    Count the rows whose highest-scoring column is the same in `inp` and `target`.

    Returns a float tensor holding the number of matching rows (a count, not a ratio).
    """
    _, predicted = inp.max(axis=1)
    _, expected = target.max(axis=1)
    hits = predicted == expected
    return hits.float().sum(axis=0)

In [None]:
def train_roberta():
    """
    Train the Roberta sentiment classifier and save its weights and metrics.

    Reads the competition train.csv, shuffles it, uses the first SPLIT fraction
    for training and the remainder for validation, and trains for EPOCHS epochs.
    Afterwards the state dict is written to MODEL_SAVE_PATH and every per-epoch
    metric value is saved to its own file named
    `<val_loss_|train_loss_|val_acc_|train_acc_><epoch index>.pt`.
    The recorded training loss/accuracy of an epoch are those of its last batch.
    """
    size = 1
    torch.manual_seed(42)
    train_df = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')

    train_df = shuffle(train_df)
    split = int(SPLIT*len(train_df))
    val_df = train_df[split:].reset_index(drop=True)
    train_df = train_df[:split].reset_index(drop=True)

    val_set = TweetDataset(val_df, tokenizer)
    train_set = TweetDataset(train_df, tokenizer)

    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=VAL_BATCH_SIZE,
                                             sampler=torch.utils.data.RandomSampler(val_set))
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=BATCH_SIZE,
                                               sampler=torch.utils.data.RandomSampler(train_set))

    # Fall back to the CPU when no GPU is present instead of failing.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # The network is moved to the device once; it never leaves it afterwards.
    network = Roberta().to(device)
    # Separate learning rates for the freshly initialised head and the encoder.
    optimizer = optim.Adam([{'params': network.dense.parameters(), 'lr': LR[1]*size},
                            {'params': network.roberta.parameters(), 'lr': LR[0]*size}])

    val_losses, val_accuracies = [], []
    train_losses, train_accuracies = [], []

    start = time.time()
    for epoch in range(EPOCHS):
        network.train()

        # Batches are numbered from 1 in the log output.
        for batch, train_batch in enumerate(train_loader, start=1):
            train_targ, train_in, train_att = (part.to(device) for part in train_batch)
            # Drop the dimension of size 1 that the dataset puts in front of each target.
            train_targ = train_targ.squeeze(dim=1)

            train_preds = network(train_in, train_att)
            # cel() and accuracy() return batch totals; divide to get per-sample values.
            train_loss = cel(train_preds, train_targ)/len(train_in)
            train_accuracy = accuracy(train_preds, train_targ)/len(train_in)

            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            end = time.time()
            acc = np.round(train_accuracy.item(), 3)
            print(f"Training accuracy on batch {batch}: {acc}, time: {np.round(end - start, 1)}s")

        val_loss, val_accuracy, val_points = 0, 0, 0

        network.eval()
        with torch.no_grad():
            for val_batch in val_loader:
                targ, val_in, val_att = (part.to(device) for part in val_batch)
                targ = targ.squeeze(dim=1)

                val_points += len(targ)
                pred = network(val_in, val_att)
                val_loss += cel(pred, targ).item()
                val_accuracy += accuracy(pred, targ).item()

        # Totals over the validation set -> per-sample averages.
        val_loss /= val_points
        val_accuracy /= val_points
        print(f"Validation accuracy: {np.round(val_accuracy, 3)}")

        print("")
        val_losses.append(val_loss)
        train_losses.append(train_loss.item())
        val_accuracies.append(val_accuracy)
        train_accuracies.append(train_accuracy.item())

    print("ENDING TRAINING ...")
    torch.save(network.state_dict(), MODEL_SAVE_PATH)
    # Drop the network so its memory can be reclaimed.
    del network
    gc.collect()

    metric_names = ['val_loss_', 'train_loss_', 'val_acc_', 'train_acc_']
    metric_lists = [val_losses, train_losses, val_accuracies, train_accuracies]

    # One file per metric and epoch, e.g. val_loss_0.pt, val_loss_1.pt, ...
    for metric_name, metric_list in zip(metric_names, metric_lists):
        for j, metric_value in enumerate(metric_list):
            torch.save(metric_value, metric_name + str(j) + '.pt')

In [None]:
# Run the training; the weights and per-epoch metrics are written to disk.
train_roberta()

Для выделения слов, образующих и определяющих эмоциональную окраску предложений, мы будем использовать модель DistilBERT, предобученную на датасете SQuAD. Для лаконичности, будем использовать обертку над библиотекой transformers от HuggingFace, которая называется simpletransformers.

Для начала, переведем наши данные в подходящий для тренировки вопросно-ответной системы формат вида: 
```
train_data = [
    {
        'context': "This tweet sentiment extraction challenge is great",
        'qas': [
            {
                'id': "00001",
                'question': "positive",
                'answers': [
                    {
                        'text': "is great",
                        'answer_start': 42
                    }
                ]
            }
        ]
    }
    ]
```

In [None]:
# Load the competition data.
train_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
test_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')
sub_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv')

# The conversion helpers are given plain arrays of rows rather than data frames.
train = np.array(train_df)
test = np.array(test_df)

# Convert the training rows with do_qa_train (defined elsewhere in this notebook)
# and store the result as JSON.
qa_train = do_qa_train(train)

with open('data/train.json', 'w') as outfile:
    json.dump(qa_train, outfile)
    
# Same for the test rows, using do_qa_test.
qa_test = do_qa_test(test)

with open('data/test.json', 'w') as outfile:
    json.dump(qa_test, outfile)

In [None]:
from simpletransformers.question_answering import QuestionAnsweringModel

# Local copy of the pretrained checkpoint, loaded from the Kaggle input directory.
MODEL_PATH = '/kaggle/input/transformers-pretrained-distilbert/distilbert-base-uncased-distilled-squad/'

# Create the QuestionAnsweringModel.
# Note: this rebinds the name `model` to the question-answering wrapper.
model = QuestionAnsweringModel('distilbert', 
                               MODEL_PATH, 
                               args={'reprocess_input_data': True,
                                     'overwrite_output_dir': True,
                                     'learning_rate': 5e-5,
                                     'num_train_epochs': 3,
                                     'max_seq_length': 192,
                                     'doc_stride': 64,
                                     'fp16': False,
                                    },
                              use_cuda=True)

# Fine-tune on the JSON file written in the previous cell.
model.train_model('data/train.json')

In [None]:
# Predict answers for the test questions and put them into a data frame.
predictions = model.predict(qa_test)
predictions_df = pd.DataFrame.from_dict(predictions)