In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from grammar_ru.corpus import CorpusReader, CorpusBuilder
from diplom.utils.corpus_utils import CorpusFramework
from diplom.utils.dialog_markuper import DialogMarkupFeaturizer
#from diplom.utils.speech_action_maker import SpeechActionFeaturizer
import matplotlib.pyplot as plt
from torch.nn.functional import cosine_similarity
from collections import defaultdict
import torch
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)

In [2]:
# Show the properties of the first CUDA device. The query is skipped (and the
# cell displays nothing) when no GPU is visible, so the notebook still runs
# top to bottom on a CPU-only machine.
torch.cuda.get_device_properties(0) if torch.cuda.is_available() else None

In [3]:
# Use the first GPU when one is available; otherwise fall back to the CPU so
# that every later `.to(device)` call keeps working.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Open the zipped corpus and collect the distinct values of the `author`
# column of its table of contents.
path_corpus = Path("../data/corpora/diplom.wow.zip")
corpus = CorpusReader(path_corpus)
corpus_framework = CorpusFramework(corpus)
authors = corpus.get_toc().author.unique()

In [4]:
# Load the corpus; the columns used below are `sample_id`, `action`, `speech`.
text_corpus = pd.read_csv('../text_corpus.csv')

# Normalise the action names first, so that the "said" filter and the label
# lookup both see the same (stripped) values.
text_corpus['action'] = text_corpus['action'].str.strip()

# Drop the rows whose action is "said".
text_corpus = text_corpus.loc[text_corpus['action'] != 'said'].copy()

# Build the label vocabulary from the rows that are actually kept. This
# guarantees that the ids are unique and contiguous (0 .. NUM_LABELS - 1),
# which is what the classifier head and CrossEntropyLoss require.
labels = text_corpus['action'].unique().tolist()

id2label = {idx: label for idx, label in enumerate(labels)}

label2id = {label: idx for idx, label in id2label.items()}

text_corpus["labels"] = text_corpus['action'].map(label2id)
text_corpus = text_corpus.drop(['sample_id', 'action'], axis=1).rename({'speech': 'text'}, axis=1)
NUM_LABELS = len(labels)

labels


In [5]:
from sklearn.utils.class_weight import compute_class_weight

# "balanced" class weights computed from the integer label column, one weight
# per distinct label value, stored as a float32 tensor on the training device.
y = text_corpus['labels'].values
balanced_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights = torch.as_tensor(balanced_weights, dtype=torch.float32, device=device)

In [6]:
# Display the shape of the class-weight tensor (one entry per distinct label).
class_weights.shape

In [7]:
from transformers import AutoTokenizer

# Hyper-parameters used by the dataset, the loaders and the optimizer below.
MAX_LEN = 512  # every sample is padded / truncated to this many tokens
TRAIN_BATCH_SIZE = 4  # 4 works fine
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-06  # previously tried: 1e-05

# The tokenizer must come from the same checkpoint as the encoder loaded in
# DistillBERTClass ("distilbert-base-uncased"); the cased tokenizer uses a
# different vocabulary, so its token ids would not match the model's embeddings.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [8]:
class MyDataset(Dataset):
    """Map-style dataset that tokenizes the rows of a DataFrame on access.

    Args:
        dataframe: frame with a ``text`` column (anything ``str()`` accepts)
            and a ``labels`` column holding integer class ids.
        tokenizer: callable tokenizer returning a mapping with ``input_ids``
            and ``attention_mask`` (e.g. a Hugging Face tokenizer).
        max_len: every sample is padded / truncated to this many tokens.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the ``index``-th row as ``{'ids', 'mask', 'targets'}`` long tensors."""
        # Positional access: works no matter what index the frame carries.
        text = str(self.data['text'].iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            truncation=True,
        )
        label = int(self.data['labels'].iloc[index])

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(label, dtype=torch.long)
        }

    def __len__(self):
        """Number of rows in the wrapped frame at construction time."""
        return self.len

In [9]:
# Random 80/20 split with a fixed seed: the sampled rows form the training
# frame, every remaining row goes to the test frame. Both frames get a fresh
# 0..n-1 index.
train_size = 0.8
sampled_rows = text_corpus.sample(frac=train_size, random_state=200)
test_dataset = text_corpus.drop(index=sampled_rows.index).reset_index(drop=True)
train_dataset = sampled_rows.reset_index(drop=True)

for caption, frame in (
    ("FULL Dataset: {}", text_corpus),
    ("TRAIN Dataset: {}", train_dataset),
    ("TEST Dataset: {}", test_dataset),
):
    print(caption.format(frame.shape))

# Wrap both frames so that rows are tokenized lazily on access.
training_set, testing_set = (
    MyDataset(split, tokenizer, MAX_LEN) for split in (train_dataset, test_dataset)
)

In [10]:
# Number of distinct label ids that actually occur in the training frame.
training_set.data.labels.nunique()

In [11]:
# Training batches are reshuffled on every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation keeps a fixed order: the epoch metrics do not depend on it, and
# the running numbers printed during validation become reproducible.
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [12]:
from transformers import AutoModel

class DistillBERTClass(torch.nn.Module):
    """DistilBERT encoder followed by a small classification head.

    Args:
        num_labels: size of the output layer. ``None`` (default) falls back to
            the module-level ``NUM_LABELS``, as the original class did.
        pretrained_name: checkpoint passed to ``DistilBertModel.from_pretrained``.
        dropout: dropout probability applied before the output layer.
    """

    def __init__(self, num_labels=None, pretrained_name="distilbert-base-uncased", dropout=0.3):
        super().__init__()
        if num_labels is None:
            num_labels = NUM_LABELS
        self.l1 = DistilBertModel.from_pretrained(pretrained_name)
        # Read the encoder width from its config instead of hard-coding 768.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the un-normalised class scores (logits) for a batch."""
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        # Keep only the hidden state of the first token position of every
        # sequence (the [CLS] position for BERT-style tokenizers).
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.relu(pooler)
        pooler = self.dropout(pooler)
        return self.classifier(pooler)

In [13]:
# Build the classifier and move its parameters to the selected device.
# `Module.to` returns the module itself, so the cell also displays its layout.
model = DistillBERTClass()
model.to(device)

In [14]:
from transformers import get_linear_schedule_with_warmup

# Adam over every parameter of the model (encoder and classification head).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Cross-entropy with per-class weights to compensate for class imbalance.
loss_function = torch.nn.CrossEntropyLoss(weight=class_weights)
# Learning-rate scheduling is currently disabled:
# scheduler = get_linear_schedule_with_warmup(optimizer, 3, 10)

In [15]:
# Helper for the accuracy bookkeeping of the training / validation loops.

def calcuate_accu(big_idx, targets):
    """Count the positions where the predicted ids equal the target ids.

    Returns the count as a plain Python number (via ``.item()``), not as a
    ratio; the caller divides by the number of examples.
    """
    matches = big_idx == targets
    return matches.sum().item()

def apk(actual, predicted, k=5):
    """Average precision at ``k`` for a single query.

    Args:
        actual: collection of relevant items.
        predicted: ranked sequence of predicted items, best first.
        k: only the first ``k`` predictions are scored.

    Returns:
        The sum of precision values at every rank where a new relevant item
        appears, divided by ``min(len(actual), k)``; ``0.0`` when ``actual``
        is empty.
    """
    top = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_total = 0.0
    for rank, item in enumerate(top, start=1):
        # Skip irrelevant items and repeats of an item that was already ranked.
        if item not in actual or item in top[:rank - 1]:
            continue
        hits += 1.0
        precision_total += hits / rank

    if len(actual) == 0:
        return 0.0

    return precision_total / min(len(actual), k)

def mapk(actual, predicted, k=5):
    """Mean of ``apk`` over paired ``actual`` / ``predicted`` entries.

    ``actual`` and ``predicted`` are iterated in parallel; each pair is scored
    with ``apk(..., k)`` and the scores are averaged with ``np.mean``.
    """
    per_query = [apk(truth, ranking, k) for truth, ranking in zip(actual, predicted)]
    return np.mean(per_query)



In [16]:
def train(model, epoch, per_step=1000):
    """Train ``model`` for one epoch and print running / final metrics.

    Relies on the module-level ``training_loader``, ``device``,
    ``loss_function`` and ``optimizer``.

    Args:
        model: module called as ``model(ids, mask)`` that returns one row of
            class scores per sample.
        epoch: epoch number, used only in the printed summary.
        per_step: print the running metrics every ``per_step`` batches.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    apk_sum = 0
    model.train()
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()

        # Metrics are computed on a detached copy so they never touch autograd.
        logits = outputs.detach()
        _, big_idx = torch.max(logits, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        # MAP@5: every sample has exactly one relevant class (its target), and
        # the ranking is that sample's five highest-scoring classes.
        top_k = min(5, logits.size(1))
        ranked = logits.topk(top_k, dim=1).indices.tolist()
        for truth, ranking in zip(targets.tolist(), ranked):
            apk_sum += apk([truth], ranking, k=5)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if nb_tr_steps % per_step == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Loss over {nb_tr_steps} steps: {loss_step}")
            print(f"Accuracy over {nb_tr_steps} steps: {accu_step}")
            print(f"MAP@5over {nb_tr_steps} steps: {apk_sum / nb_tr_examples}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct * 100) / nb_tr_examples}')
    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")
    print(f"MAP@5over Epoch: {apk_sum / nb_tr_examples}")

In [17]:
def valid(model, testing_loader, per_step=100):
    """Evaluate ``model`` on ``testing_loader`` without updating its weights.

    Relies on the module-level ``device`` and ``loss_function``.

    Args:
        model: module called as ``model(ids, mask)`` that returns one row of
            class scores per sample.
        testing_loader: iterable of batches with ``ids``, ``mask``, ``targets``.
        per_step: print the running metrics every ``per_step`` batches.

    Returns:
        ``(epoch_accu, epoch_map)``: accuracy in percent and mean average
        precision at 5 over all evaluated samples.
    """
    tr_loss = 0
    apk_sum = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.eval()
    with torch.no_grad():
        for data in tqdm(testing_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)

            # No squeeze(): it would collapse a final batch of size 1 into a
            # 1-D tensor and break both the loss and the max over dim=1.
            outputs = model(ids, mask)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            _, big_idx = torch.max(outputs, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            # MAP@5: every sample has exactly one relevant class (its target),
            # ranked against that sample's five highest-scoring classes.
            top_k = min(5, outputs.size(1))
            ranked = outputs.topk(top_k, dim=1).indices.tolist()
            for truth, ranking in zip(targets.tolist(), ranked):
                apk_sum += apk([truth], ranking, k=5)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            if nb_tr_steps % per_step == 0:
                loss_step = tr_loss / nb_tr_steps
                accu_step = (n_correct * 100) / nb_tr_examples
                print(f"Validation Loss per {nb_tr_steps} steps: {loss_step}")
                print(f"Validation Accuracy per {nb_tr_steps} steps: {accu_step}")
                print(f"MAP@5over {nb_tr_steps} steps: {apk_sum / nb_tr_examples}")

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    epoch_map = apk_sum / nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    print(f"MAP@5over Epoch: {epoch_map}")
    return epoch_accu, epoch_map

In [18]:
# Overrides the EPOCHS value set in the hyper-parameter cell above.
EPOCHS = 5

In [19]:
# Each epoch is one full training pass followed by an evaluation on the
# held-out split. No learning-rate scheduler is stepped.
for epoch in range(EPOCHS):
    train(model=model, epoch=epoch, per_step=1300)
    valid(model=model, testing_loader=testing_loader, per_step=700)

In [54]:
# Final evaluation. The second value is the MAP@5 score; it gets its own name
# so that the builtin `map` is not shadowed for the rest of the session.
acc, map_at_5 = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

In [62]:
# NOTE(review): `out` is not defined in any cell visible in this notebook, so
# this cell depends on leftover kernel state and fails on Restart & Run All.
# Define `out` explicitly or delete the cell.
out[4]

In [52]:
# Display the number of classes the classifier head is built for.
NUM_LABELS

In [44]:
# NOTE(review): `answers` is not defined in any cell visible in this notebook
# (leftover kernel state). The `ans-1` shift presumably compensates for a gap
# in the label ids; confirm it is still needed before relying on this cell.
[id2label[ans-1] for ans in answers]

In [22]:
output_model_file = './models/first_distilbert.bin'
# Kept for backward compatibility. `save_vocabulary` takes a directory, so the
# vocabulary is written into the parent directory of this path instead.
output_vocab_file = './models/first_vocab_distilbert.bin'

# Make sure the target directory exists before anything is written to it.
output_dir = Path(output_model_file).parent
output_dir.mkdir(parents=True, exist_ok=True)

# Save only the weights: a state_dict does not pickle the notebook-defined
# class, so the file stays loadable after the class code changes.
model_to_save = model
torch.save(model_to_save.state_dict(), output_model_file)
tokenizer.save_vocabulary(str(output_dir))

In [23]:
from transformers import pipeline, DistilBertForSequenceClassification

# The file written by the save cell holds the custom DistillBERTClass, not a
# Hugging Face checkpoint directory, so `from_pretrained` / `pipeline` cannot
# consume it. Restore it with torch instead; both a pickled module and a plain
# state_dict are accepted.
checkpoint = torch.load(output_model_file, map_location=device)
if isinstance(checkpoint, torch.nn.Module):
    loaded_model = checkpoint
else:
    loaded_model = DistillBERTClass()
    loaded_model.load_state_dict(checkpoint)
loaded_model.to(device)
loaded_model.eval()


def predict_label(text):
    """Classify one text with the restored model.

    Returns a one-element list ``[{'label': ..., 'score': ...}]`` (the same
    shape a text-classification pipeline returns for a single string).
    ``label`` is looked up in ``id2label`` and falls back to the numeric id as
    a string when the id is unknown; ``score`` is the softmax probability of
    the predicted class.
    """
    # Same whitespace normalisation as MyDataset applies during training.
    cleaned = " ".join(str(text).split())
    encoded = tokenizer(cleaned, max_length=MAX_LEN, truncation=True, return_tensors='pt')
    with torch.no_grad():
        logits = loaded_model(
            encoded['input_ids'].to(device),
            encoded['attention_mask'].to(device),
        )
    probabilities = torch.softmax(logits, dim=1)[0]
    best = int(probabilities.argmax())
    return [{'label': id2label.get(best, str(best)), 'score': float(probabilities[best])}]