In [1]:
from datasets import load_dataset

training_data = load_dataset("financial_phrasebank", "sentences_allagree")
valitdation_data = load_dataset("financial_phrasebank", "sentences_75agree")

In [40]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-gpt")
tokenizer.pad_token = '~' 

def preprocess_function(examples):
    return tokenizer(examples["sentence"], padding=True, truncation=True, return_tensors="tf")

train_tokens = training_data.map(preprocess_function, batched=True)

val_tokens = valitdation_data.map(preprocess_function, batched=True)
val_tokens

Map:   0%|          | 0/2264 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3453
    })
})

In [41]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [42]:
from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
num_epochs = 4
batches_per_epoch = len(train_tokens["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

batches_per_epoch

141

In [43]:
id2label = {0: "negative", 2: "positive", 1: "neutral"}
label2id = {"negative": 0, "positive": 2,"neutral": 1}

In [58]:
from transformers import TFAutoModelForSequenceClassification, TFOpenAIGPTForSequenceClassification

model = TFOpenAIGPTForSequenceClassification.from_pretrained(
    "openai-gpt", num_labels=3, id2label=id2label, label2id=label2id, problem_type="multi_label_classification",
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFOpenAIGPTForSequenceClassification: ['h.1.attn.bias', 'h.4.attn.bias', 'h.6.attn.bias', 'h.2.attn.bias', 'h.8.attn.bias', 'h.11.attn.bias', 'h.7.attn.bias', 'h.10.attn.bias', 'h.5.attn.bias', 'h.0.attn.bias', 'h.3.attn.bias', 'h.9.attn.bias']
- This IS expected if you are initializing TFOpenAIGPTForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFOpenAIGPTForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFOpenAIGPTForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['score.weight']
You sho

In [62]:
tf_train_set = model.prepare_tf_dataset(

    train_tokens["train"],

    shuffle=True,

    batch_size=1,

    collate_fn=data_collator,

)

tf_validation_set= model.prepare_tf_dataset(

    valitdation_data.map(preprocess_function, batched=True)["train"],

    shuffle=True,

    batch_size=1,

    collate_fn=data_collator,

)


In [63]:
import tensorflow as tf

model.compile(optimizer=optimizer)  # No loss argument!

In [10]:
import evaluate

accuracy = evaluate.load("accuracy")

import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [50]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

callbacks = [metric_callback]

## Cannot handle batch sizse bigger then one if no padding token is defiened 

In [None]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, callbacks=callbacks,epochs=3)

Epoch 1/3

In [22]:
sample =  "The company still expects its turnover in 2010 to slightly increase from the level of 2009 , adding that `` market predictability is still too poor for trustworthy forecasts on the market development of the contract manufacturing business during the current year '' ."
#@positive 


res = tokenizer(sample, truncation=True, return_tensors="tf", padding=True)

result = model(**res).logits

predicted_class_id = int(tf.math.argmax(result, axis=-1)[0])

model.config.id2label[predicted_class_id]

'negative'

In [27]:
model.save_pretrained("chatGPT", from_tf=True)
tokenizer.save_pretrained("chatGPT", from_tf=True)





INFO:tensorflow:Assets written to: all_agree.tf/assets


INFO:tensorflow:Assets written to: all_agree.tf/assets


In [28]:
#hello = tf.saved_model.load("all_agree.tf")