In [None]:
import csv
import random

import transformers
import pandas as pd
import datasets
from transformers import AutoTokenizer, create_optimizer
from torch.utils.data import Dataset
import torch
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoModel
import numpy as np
from sklearn import metrics

In [None]:
# Checkpoint to load; an alternative checkpoint is kept here for reference:
# model_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model_checkpoint = "prajjwal1/bert-small"

# Number of target classes and the mini-batch size used when batching the data.
num_labels, batch_size = 2, 16

# Class index <-> class name lookups (passed to the model loader further down).
id2label = dict(enumerate(("negative", "positive")))
label2id = {name: index for index, name in id2label.items()}

In [None]:
def load_labelled_tweets(path, label):
    """Read a text file of tweets into a DataFrame and tag every row with `label`.

    Args:
        path: Location of the text file to read.
        label: Value stored in the "label" column for every row.

    Returns:
        A DataFrame with the columns "tweet" and "label".
    """
    frame = pd.read_csv(
        path,
        sep="\t",
        lineterminator="\n",
        encoding="utf8",
        names=["tweet"],
        # Treat double quotes as ordinary characters. With the default quoting
        # a '"' inside a line opens a quoted field, which can merge several
        # lines into a single row.
        quoting=csv.QUOTE_NONE,
    )
    frame["label"] = label
    return frame


# One seed for every random draw in this cell so the subsample and the
# train/validation split are identical across kernel restarts.
random_seed = 200

tweets_neg = load_labelled_tweets("../twitter-datasets/train_neg.txt", "negative")
tweets_pos = load_labelled_tweets("../twitter-datasets/train_pos.txt", "positive")
tweets = pd.concat([tweets_neg, tweets_pos])

# Work on a 5% subsample; reset the index so that the index-based split below
# operates on unique row labels.
tweets = tweets.sample(frac=0.05, random_state=random_seed).reset_index(drop=True)

# 80/20 split: draw the training rows, then keep whatever is left for evaluation.
train_size = 0.8
train_dataset = tweets.sample(frac=train_size, random_state=random_seed)
test_dataset = tweets.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

train_dataset.head()

In [None]:
def _tweet_features():
    """Return a new schema: string "tweet" plus a negative/positive "label"."""
    return datasets.Features(
        {
            "tweet": datasets.Value("string"),
            "label": datasets.ClassLabel(num_classes=2, names=["negative", "positive"]),
        }
    )


# Split name -> pandas frame it is built from.
_split_frames = {"train": train_dataset, "validation": test_dataset}

dataset = datasets.DatasetDict(
    {
        split_name: datasets.Dataset.from_pandas(frame, features=_tweet_features())
        for split_name, frame in _split_frames.items()
    }
)

dataset["train"][0]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess(examples):
    """Tokenize the "tweet" entry of `examples` with truncation enabled."""
    tweet_texts = examples["tweet"]
    return tokenizer(tweet_texts, truncation=True)


# Record the column names before mapping so the tokenizer's additions can be listed.
pre_tokenizer_columns = set(dataset["train"].features)
encoded_dataset = dataset.map(preprocess, batched=True)
tokenizer_columns = list(
    set(encoded_dataset["train"].features).difference(pre_tokenizer_columns)
)
print("Columns added by tokenizer:", tokenizer_columns)
encoded_dataset["train"].features["label"]

In [None]:
# Load the checkpoint as a TensorFlow/Keras model with a sequence-classification
# head. The cells below call prepare_tf_dataset / compile / fit / predict and
# read `.logits`, none of which the plain PyTorch `AutoModel` provides.
# NOTE(review): from_pt=True converts PyTorch weights on load; this assumes the
# checkpoint does not publish native TensorFlow weights - drop the flag if it does.
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    from_pt=True,
)

In [None]:

# Keyword arguments shared by both splits.
_tf_dataset_options = {"batch_size": batch_size, "tokenizer": tokenizer}

# The training split is shuffled; the validation split is not.
tf_train_dataset = model.prepare_tf_dataset(
    encoded_dataset["train"], shuffle=True, **_tf_dataset_options
)
tf_validation_dataset = model.prepare_tf_dataset(
    encoded_dataset["validation"], shuffle=False, **_tf_dataset_options
)

In [None]:
num_epochs = 1

# Whole batches per pass over the training split (any remainder is not counted).
batches_per_epoch, _ = divmod(len(encoded_dataset["train"]), batch_size)
total_train_steps = int(batches_per_epoch * num_epochs)

# Optimizer plus its learning-rate schedule, sized to the full training run
# with no warm-up steps.
optimizer, schedule = create_optimizer(
    num_train_steps=total_train_steps,
    num_warmup_steps=0,
    init_lr=2e-5,
)

# No explicit loss is passed to compile().
model.compile(optimizer=optimizer)

In [None]:
from transformers.keras_callbacks import KerasMetricCallback

# NOTE: KerasMetricCallback is imported but no callback is attached to fit()
# below; validation accuracy is computed after training instead.
# TODO: wire up a metric callback if per-epoch metrics are wanted.

_fit_options = {
    "validation_data": tf_validation_dataset,
    "epochs": num_epochs,
}
model.fit(tf_train_dataset, **_fit_options)

In [None]:
# Run the model over the validation batches.
output = model.predict(tf_validation_dataset)

# Index of the largest logit in each row is taken as the predicted class.
classifications = np.argmax(output.logits, axis=1)

# Compare the predictions with the labels stored in the validation split.
accuracy = metrics.accuracy_score(
    y_true=dataset["validation"]["label"],
    y_pred=classifications,
)
print(f"Accuracy Score = {accuracy}")