In [None]:
# !pip install transformers[torch] datasets evaluate

In [None]:
import os
import pandas as pd

# Gather every CSV/Parquet file found under RUNWAY_DATA_PATH into one DataFrame.
# Directories and files are visited in sorted order so the row order of `df`
# (and therefore any later `df.head(...)` subset) does not depend on the
# arbitrary listing order os.walk gets from the filesystem.
dfs = []
for dirname, subdirs, filenames in os.walk(RUNWAY_DATA_PATH):
    subdirs.sort()  # in-place sort steers the order in which os.walk descends
    for filename in sorted(filenames):
        path = os.path.join(dirname, filename)
        if filename.endswith(".csv"):
            d = pd.read_csv(path)
        elif filename.endswith(".parquet"):
            d = pd.read_parquet(path)
        else:
            # Any other file type in the data directory is treated as an error.
            raise ValueError("Not valid file type")
        dfs.append(d)

if not dfs:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise ValueError(f"No .csv or .parquet files found under {RUNWAY_DATA_PATH!r}")

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each file's own row labels.
df = pd.concat(dfs, ignore_index=True)

In [None]:
from datasets import Dataset

# Build a Dataset from the first 100 rows of `df` and have it return
# PyTorch-formatted ("pt") values when indexed.
ds = Dataset.from_pandas(df.head(100)).with_format("pt")

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding


# Load the tokenizer that belongs to MODEL_ARCH_NAME, then build a padding
# collator around that same tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ARCH_NAME)
data_collator = DataCollatorWithPadding(tokenizer)


def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook's tokenizer.

    The tokenizer is called with ``truncation=True`` and its output is
    returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenize the dataset in batches. `batched=True` is the flag that makes
# `map` hand preprocess_function several rows at a time; the previous
# `batch_size=True` did not enable batching (batch_size is a row count and
# only takes effect together with batched=True), so rows were processed
# one by one.
tokenized_ds = ds.map(preprocess_function, batched=True)

In [None]:
import torch
from transformers import AutoModelForSequenceClassification

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Two class labels; the reverse mapping is derived from id2label so the two
# dictionaries cannot drift apart.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: idx for idx, name in id2label.items()}

# Sequence-classification model with one output per label, moved to `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ARCH_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

In [None]:
from transformers import TrainingArguments, Trainer


# Settings for one training epoch with a per-device batch size of 4;
# "tmp" is passed as the output directory.
training_args = TrainingArguments(
    output_dir="tmp",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Hand the model, tokenized data, tokenizer and padding collator to the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds,
)

# Run training on `tokenized_ds`.
trainer.train()

In [None]:
import pandas as pd


class HuggingModel:
    """Adapter that exposes a text pipeline through a DataFrame-based ``predict``."""

    def __init__(self, pipeline):
        """Keep a reference to ``pipeline``, a callable applied to a list of texts."""
        self.pipeline = pipeline

    def predict(self, X):
        """Run the pipeline on ``X["text"]`` and return its output as a DataFrame.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``"text"`` column.

        Returns
        -------
        pandas.DataFrame
            Built from whatever the pipeline returns for the list of texts
            (presumably one record per text -- confirm against the pipeline used).
        """
        texts = X["text"].to_list()
        records = self.pipeline(texts)
        return pd.DataFrame.from_dict(records)

In [None]:
from transformers import pipeline


# Serve from the CPU and switch to inference mode explicitly: `.eval()`
# disables dropout so repeated predictions on the same text agree,
# whatever mode the preceding training run left the model in.
model = model.to("cpu").eval()
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Wrap the pipeline so it can be called through HuggingModel.predict.
hug_model = HuggingModel(pipe)

In [None]:
# Draw one random row from `df` and remove its "label" column.
input_sample = df.sample(n=1).drop(columns="label")

In [None]:
import runway

# Log the wrapped model under the name "my-model", passing one input sample
# keyed by its `predict` method.
runway.log_model(
    model_name="my-model",
    model=hug_model,
    input_samples={"predict": input_sample},
)
