In [None]:
# Uncomment to install the required packages into the kernel's environment:
# %pip install "transformers[torch]" datasets evaluate

In [None]:
import os
import pandas as pd

# Load every CSV / Parquet file found (recursively) under RUNWAY_DATA_PATH and
# combine them into a single DataFrame named `df`.
# NOTE(review): RUNWAY_DATA_PATH is not defined in this notebook; presumably it
# is injected by the Runway environment - confirm.
dfs = []
for dirname, _, filenames in os.walk(RUNWAY_DATA_PATH):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        if filename.endswith(".csv"):
            d = pd.read_csv(file_path)
        elif filename.endswith(".parquet"):
            d = pd.read_parquet(file_path)
        else:
            # Any other file type aborts the load instead of being skipped.
            raise ValueError("Not valid file type")
        dfs.append(d)

if not dfs:
    # pd.concat([]) would fail with a generic "No objects to concatenate";
    # report the searched path instead.
    raise ValueError(f"No .csv or .parquet files found under {RUNWAY_DATA_PATH!r}")

# ignore_index=True gives the combined frame a fresh 0..n-1 index rather than
# repeating each file's own row labels.
df = pd.concat(dfs, ignore_index=True)

In [None]:
from datasets import Dataset

# Build the training dataset from a small random subset of `df` (at most 100 rows).
# min(...) keeps this cell working when `df` holds fewer than 100 rows, where
# df.sample(100) would raise.
sample_df = df.sample(min(100, len(df)))

# preserve_index=False keeps the shuffled pandas index out of the dataset, so
# it does not show up as an extra column.
ds = Dataset.from_pandas(sample_df, preserve_index=False)
ds.set_format("pt")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# label mapping for binary classification; label2id is derived from id2label
# so the two mappings cannot drift apart
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {label: idx for idx, label in id2label.items()}

# tokenizer
# NOTE(review): MODEL_ARCH_NAME is not defined in this notebook; presumably it
# is injected by the Runway environment - confirm.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ARCH_NAME)
if tokenizer.pad_token_id is None:
    # The checkpoint defines no padding token: reuse the end-of-sequence token
    # so batches can be padded. A tokenizer that already has a pad token keeps it.
    tokenizer.pad_token_id = tokenizer.eos_token_id

# model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ARCH_NAME, num_labels=len(id2label), id2label=id2label, label2id=label2id
)
# keep the model's padding id in sync with whatever the tokenizer pads with
model.config.pad_token_id = tokenizer.pad_token_id

# cuda setting if available
device = "cuda" if torch.cuda.is_available() else "cpu"
# assigning the result avoids echoing the whole model repr as the cell output
model = model.to(device)

In [None]:
def _tokenize(example):
    """Run the tokenizer on the "text" entry of `example` with truncation enabled."""
    return tokenizer(example["text"], truncation=True)


# No padding is requested here; only truncation is applied at tokenization time.
ds_proc = ds.map(_tokenize)

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding


# Padding is delegated to the collator (padding="longest"); the tokenization
# step only truncates.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

# Single-epoch fine-tuning run; outputs are written under "tmp".
training_args = TrainingArguments(
    output_dir="tmp",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    learning_rate=2e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_proc,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# `history.metrics` is logged to Runway further down.
history = trainer.train()

In [None]:
import pandas as pd


class HuggingModel:
    """Adapter exposing a callable pipeline through a DataFrame-based `predict`."""

    def __init__(self, pipeline):
        """Store the callable that will be invoked with a list of texts."""
        self.pipeline = pipeline

    def predict(self, X):
        """Run the pipeline on the "text" column of `X`.

        Args:
            X: table-like object whose "text" column supports `.to_list()`.

        Returns:
            A DataFrame built from the pipeline's output via
            `pd.DataFrame.from_dict`.
        """
        texts = X["text"].to_list()
        predictions = self.pipeline(texts)
        return pd.DataFrame.from_dict(predictions)

In [None]:
from transformers import pipeline


# Serve from CPU. The model has just been through trainer.train(), so switch it
# to evaluation mode explicitly: this disables training-only behaviour (such as
# dropout, if the architecture has any) before it is used for inference.
model = model.to("cpu").eval()
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Wrap the pipeline so it can be called as predict(DataFrame) -> DataFrame.
hug_model = HuggingModel(pipe)

In [None]:
import runway

# Example input for the logged model: one random row of `df` without its label.
input_sample = df.sample(1).drop(columns=["label"])

runway.start_run()
try:
    runway.log_metrics(history.metrics)
    runway.log_model(
        model_name="my-text-model",
        model=hug_model,
        input_samples={"predict": input_sample},
    )
finally:
    # Always close the run, even if logging fails part-way through.
    runway.stop_run()