In [1]:
import torch

# Fail fast when no CUDA device is visible. The original constructed the
# ValueError without raising it, so this check silently did nothing.
# To run on CPU on purpose, skip this cell: the device selection further down
# falls back to 'cpu' when CUDA is unavailable.
if not torch.cuda.is_available():
    raise ValueError('Switch to GPU!')

In [None]:
!pip install -q git+https://github.com/nikitakapitan/nlphub.git
!pip install -q datasets transformers
!pip install -q umap-learn
!pip install -q ipython-autotime

In [2]:
import json # pprint dict
import os   # check/load files
import warnings

import datasets
import numpy as np
import sklearn
import sklearn.dummy
import sklearn.linear_model
import transformers # Hugging Face transformers
from huggingface_hub import notebook_login

from nlphub import errors
from nlphub import hidden_state
from nlphub import metrics
from nlphub import vizual
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

# autoreload: pick up edits to imported modules without restarting the kernel.
%load_ext autoreload
# autotime (from ipython-autotime): prints a "time: ..." line after each cell.
%load_ext autotime
# Mode 2: reload all modules before executing each cell.
%autoreload 2

time: 2.13 ms (started: 2023-01-24 18:51:28 +01:00)


In [None]:
# SECURITY: a Hugging Face access token was previously pasted in a comment here.
# It has been removed. Any token that was ever saved in this notebook must be
# treated as leaked and revoked at https://huggingface.co/settings/tokens.
# notebook_login() asks for the token interactively, so nothing needs to be
# hardcoded in the notebook.
print("Login to Hugging Face Hub:")
notebook_login()

In [None]:
# --- Run configuration ------------------------------------------------------
VIZUAL = True                    # draw plots through nlphub.vizual
HIDDEN_STATE_AS_FEATURES = True  # run the optional hidden-state baseline cell
ERROR_ANALYSIS = True            # run the optional error-analysis cell
CHECKPOINT = 'distilbert-base-uncased'


# --- Step 1: data -----------------------------------------------------------
dataset = datasets.load_dataset('emotion')
print('Step 1. Load DATA :', dataset['train'].builder_name)

if VIZUAL:
    vizual.output_distribution(dataset=dataset)


# --- Step 2: tokenizer ------------------------------------------------------
tokenizer = transformers.DistilBertTokenizer.from_pretrained(CHECKPOINT)


def tokenize(batch):
    """Tokenize the 'text' column of a batch.

    padding=True pads every example to the longest one in the batch.
    truncation=True caps inputs at the tokenizer's maximum length so an
    unusually long text cannot exceed what the model accepts.
    """
    return tokenizer(batch['text'], padding=True, truncation=True)


print('Step 2. Loaded TOKENIZER :', type(tokenizer))


# batch_size=None hands each split to `tokenize` as a single batch, so all
# examples of a split are padded to the same length.
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)
print('OK. DATA is encoded by TOKENIZER')

# Prefer the GPU when available; to force CPU use torch.device('cpu') instead.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('OK. Loaded DEVICE :', device.type)

In [None]:
# ~ 4 min GPU
# Optional baseline: use the encoder's hidden states as fixed features and fit
# simple scikit-learn classifiers on top of them.
print('Optional 1 : HIDDEN_STATE as Features analysis...')
if HIDDEN_STATE_AS_FEATURES:

    if os.path.exists('emos_hidden'):
        # Cache hit: reuse previously extracted hidden states; no model needed.
        # NOTE(review): nothing in this cell writes 'emos_hidden' — presumably
        # hidden_state.get_hidden_state saves it; confirm.
        dataset_hidden = datasets.load_from_disk('emos_hidden')
    else:
        # Only load the encoder (and move it to the device) when the hidden
        # states actually have to be computed.
        model = transformers.DistilBertModel.from_pretrained(CHECKPOINT)
        model.to(device)
        print('... optional 1. Loaded MODEL for feature extraction :', type(model))

        print(f'... optinal 1. Getting Hidden_state for {len(dataset_encoded["train"])} examples ~ 3 min')
        dataset_hidden = hidden_state.get_hidden_state(data_encoded=dataset_encoded, model=model, tokenizer=tokenizer, device=device)

    labels = dataset_hidden["train"].features["label"].names
    X_train, X_valid, y_train, y_valid = hidden_state.prepare_data(data_hidden=dataset_hidden)

    # UMAP2D projection
    if VIZUAL:
        vizual.plot_umap(X_train=X_train, y_train=y_train, labels=labels)

    # Dummy [Most Frequent] Classification: the score any real model must beat.
    dummy_clf = sklearn.dummy.DummyClassifier(strategy='most_frequent')
    dummy_clf.fit(X_train, y_train)

    print('... optional 1. Dummy [Most Frequent] Classifier score:', dummy_clf.score(X_valid, y_valid))

    print('... optional 1. Making LogisticRegression Classification (GPU: ~2 min / CPU: ~20 min)')
    # sklearn.linear_model is imported explicitly in the imports cell;
    # `import sklearn` alone does not guarantee the submodule is loaded.
    lr_clf = sklearn.linear_model.LogisticRegression(max_iter=3000)
    lr_clf.fit(X_train, y_train)

    print('... optional 1. LogReg trained on last hidden state score:', lr_clf.score(X_valid, y_valid))

    if VIZUAL:
        vizual.plot_confusion_matrix(y_preds=lr_clf.predict(X_valid), y_true=y_valid, labels=labels)








In [None]:
print('*★*:;;;;;:*★*:;;;;;:*★* FINE-TUNNING BERT *★*:;;;;;:*★*:;;;;;:*★*')

# Read the number of classes from the dataset's label feature instead of
# hard-coding it, so the head always matches the data being trained on.
num_labels = dataset_encoded['train'].features['label'].num_classes

# AutoModelForSequenceClassification puts an untrained classification head on
# top of the pretrained encoder; the head is learned during fine-tuning.
model = transformers.AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=num_labels)
model = model.to(device)
print('Step 3. Loaded MODEL for classification:', type(model))

batch_size = 64
# Number of whole batches in the training split, i.e. log about once per epoch.
logging_steps = len(dataset_encoded['train']) // batch_size
# Used as the output directory; with push_to_hub=True the Trainer also
# publishes the run to the Hub.
model_name = f'{CHECKPOINT}-finetuned-{dataset["train"].builder_name}'

training_args = transformers.TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy='epoch', # evaluation at the end of each epoch
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=True,
    log_level="error",
    )

trainer = transformers.Trainer(
    model = model,
    args = training_args,
    compute_metrics = metrics.compute_metrics,
    train_dataset = dataset_encoded['train'],
    eval_dataset = dataset_encoded['validation'],
    tokenizer=tokenizer,
)

In [None]:
# Run the fine-tuning configured in the previous cell (~ 4 min GPU).
# The trailing semicolon keeps the notebook from echoing the call's return value.
trainer.train();

In [None]:
# trainer.predict returns a PredictionOutput with predictions, label_ids and metrics.
preds_output = trainer.predict(dataset_encoded['validation'])
print('... Result 3 : validation metrics', json.dumps(preds_output.metrics, indent = 4))

# greedy predictions: pick the highest-scoring class for every example
y_preds = np.argmax(preds_output.predictions, axis=1)

# Take ground truth and class names from this prediction run and the dataset.
# The original used `y_valid` and `labels`, which exist only if the optional
# hidden-state cell ran (NameError when HIDDEN_STATE_AS_FEATURES is False).
y_true = preds_output.label_ids
label_names = dataset_encoded['validation'].features['label'].names

if VIZUAL:
    vizual.plot_confusion_matrix(y_preds=y_preds, y_true=y_true, labels=label_names)

In [None]:
print('*★*:;;;;;:*★*:;;;;;:*★* ERROR ANALYSIS *★*:;;;;;:*★*:;;;;;:*★*')

# Recall how the sequence-classification model chooses its loss:
#   if model.config.problem_type == "regression":
#       loss_fct = MSELoss()
#   elif model.config.problem_type == "single_label_classification":
#       loss_fct = CrossEntropyLoss()
#   elif model.config.problem_type == "multi_label_classification":
#       loss_fct = BCEWithLogitsLoss()

# `df` stays None when the analysis is switched off, so the cell still runs.
df = (
    errors.error_analysis(dataset_encoded, model=model, device=device, tokenizer=tokenizer)
    if ERROR_ANALYSIS
    else None
)
df

In [None]:
# Upload the fine-tuned model through the Trainer with a fixed commit message.
# NOTE(review): presumably relies on the Hub login from the earlier cell — confirm.
print('SAVING MODEL TO HUB')
trainer.push_to_hub(commit_message='Training completed')

In [None]:
print('Load model and Make predictions')

custom_tweet = "I saw a movie today and it was really good."

# Hub repository of the fine-tuned model.
# NOTE(review): the account name is hardcoded; change it when pushing under another user.
model_id = "nikitakapitan/distilbert-base-uncased-finetuned-emotion"

classifier = transformers.pipeline("text-classification", model=model_id)
# return_all_scores=True asks for a score for every class, not only the top one.
preds = classifier(custom_tweet, return_all_scores=True)

# Read the class names from the dataset. The original used `labels`, which is
# only defined when the optional hidden-state cell has been run.
label_names = dataset['train'].features['label'].names

if VIZUAL:
    vizual.plt_bar(preds, label_names)

