In [17]:
from openml import datasets

# OpenML ID of the dataset to download
DATASET_ID = 31
dataset = datasets.get_dataset(DATASET_ID)

# get_data() returns a 4-tuple; only its first element (the DataFrame) is used.
# No target is split off here, so the label column stays inside each record.
X = dataset.get_data()[0]

# One dict per row: this is the document format consumed downstream
data = X.to_dict("records")

# Name of the column holding the classification label
target_field = dataset.default_target_attribute

print(f"Number of instances: {len(data)}")
print(f'Target field for classification: "{target_field}"')
print(f"Example instance:\n{data[0]}")

Number of instances: 1000
Target field for classification: "class"
Example instance:
{'checking_status': '<0', 'duration': 6, 'credit_history': 'critical/other existing credit', 'purpose': 'radio/tv', 'credit_amount': 1169.0, 'savings_status': 'no known savings', 'employment': '>=7', 'installment_commitment': 4, 'personal_status': 'male single', 'other_parties': 'none', 'residence_since': 4, 'property_magnitude': 'real estate', 'age': 67, 'other_payment_plans': 'none', 'housing': 'own', 'existing_credits': 2, 'job': 'skilled', 'num_dependents': 1, 'own_telephone': 'yes', 'foreign_worker': 'yes', 'class': 'good'}


In [None]:
from sklearn.model_selection import train_test_split

from origami.preprocessing import build_prediction_pipelines, docs_to_df
from origami.utils import set_seed
from origami.utils.config import TopLevelConfig

# One seed shared by the global RNG seeding and the train/test split
SEED = 123
set_seed(SEED)

# load data into "docs" column in dataframe and split into train/test
df = docs_to_df(data)
# Passing random_state makes the split reproducible on its own, instead of
# relying on set_seed() having seeded the global RNG that scikit-learn
# falls back to when random_state is None.
train_docs_df, test_docs_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=SEED)

config = TopLevelConfig()

# pipeline config
config.pipeline.upscale = 100
config.pipeline.sequence_order = "SHUFFLED"

# create train and test pipelines
pipelines = build_prediction_pipelines(pipeline_config=config.pipeline, target_field=target_field, verbose=True)

# fit the train pipeline on the training docs, then transform the held-out test docs
train_df = pipelines["train"].fit_transform(train_docs_df)
test_df = pipelines["test"].transform(test_docs_df)

# get stateful objects from the fitted train pipeline (steps are looked up by name)
schema = pipelines["train"]["schema"].schema
encoder = pipelines["train"]["encoder"].encoder
block_size = pipelines["train"]["padding"].length

# print data stats
print(f"len train: {len(train_df)}, len test: {len(test_df)}")
print(f"vocab size {encoder.vocab_size}")
print(f"block size {block_size}")


train pipeline: Pipeline(steps=[('binning',
                 KBinsDiscretizerPipe(strategy='kmeans', threshold=100)),
                ('target', TargetFieldPipe(target_field='class')),
                ('schema', SchemaParserPipe()),
                ('tokenizer', DocTokenizerPipe()),
                ('padding', PadTruncTokensPipe()),
                ('encoder', TokenEncoderPipe(max_tokens=0))],
         verbose=True)
test pipeline: Pipeline(steps=[('binning',
                 KBinsDiscretizerPipe(strategy='kmeans', threshold=100)),
                ('target', TargetFieldPipe(target_field='class')),
                ('tokenizer', DocTokenizerPipe()),
                ('padding', PadTruncTokensPipe()),
                ('encoder', TokenEncoderPipe(max_tokens=0))],
         verbose=True)
[Pipeline] ........... (step 1 of 6) Processing binning, total=   0.2s
[Pipeline] ............ (step 2 of 6) Processing target, total=   0.0s
[Pipeline] ............ (step 3 of 6) Processing schema, total=   0

In [None]:
from origami.model import ORIGAMI
from origami.model.vpda import ObjectVPDA
from origami.preprocessing import DFDataset
from origami.utils import count_parameters

# Wrap the processed dataframes so the model can consume them as datasets
train_dataset, test_dataset = DFDataset(train_df), DFDataset(test_df)

# Model hyperparameters, plus the sizes taken from the fitted pipeline
model_settings = {
    "n_layer": 4,
    "n_head": 4,
    "n_embd": 64,
    "vocab_size": encoder.vocab_size,
    "block_size": block_size,
}
for option, value in model_settings.items():
    setattr(config.model, option, value)

# Build the PDA from the encoder and schema, then hand it to the model
vpda = ObjectVPDA(encoder, schema)
model = ORIGAMI(config.model, config.train, vpda=vpda)

# Report the model size in millions of parameters
n_params = count_parameters(model)
print(f"Number of parameters: {n_params / 1e6:.2f}M")

Number of parameters: 0.86M


In [20]:
from origami.inference import Predictor
from origami.utils import make_progress_callback

# Predictor for the target field; it is also passed to the progress callback
predictor = Predictor(model, encoder, target_field)

# Everything the progress callback is given besides the train config
callback_kwargs = {
    "train_dataset": train_dataset,
    "test_dataset": test_dataset,
    "predictor": predictor,
}
progress_callback = make_progress_callback(config.train, **callback_kwargs)

# Register the callback on the "on_batch_end" event
model.set_callback("on_batch_end", progress_callback)

# Train for a fixed number of batches
NUM_BATCHES = 10000
model.train_model(train_dataset, batches=NUM_BATCHES)

|  step: 0  |  epoch: 0  |  batch_num: 0  |  batch_dt: 0.00  |  batch_loss: 2.7793  |  lr: 1.01e-06  |  train_acc: 0.0000  |  test_loss: 2.7882  |  test_acc: 0.0000  |
|  step: 1  |  epoch: 12  |  batch_num: 100  |  batch_dt: 75.38  |  batch_loss: 0.9688  |  lr: 1.01e-04  |
|  step: 2  |  epoch: 25  |  batch_num: 200  |  batch_dt: 72.75  |  batch_loss: 0.6550  |  lr: 2.01e-04  |
|  step: 3  |  epoch: 37  |  batch_num: 300  |  batch_dt: 72.20  |  batch_loss: 0.6319  |  lr: 3.01e-04  |
|  step: 4  |  epoch: 50  |  batch_num: 400  |  batch_dt: 73.25  |  batch_loss: 0.5899  |  lr: 4.01e-04  |
|  step: 5  |  epoch: 62  |  batch_num: 500  |  batch_dt: 73.57  |  batch_loss: 0.5710  |  lr: 5.01e-04  |
|  step: 6  |  epoch: 75  |  batch_num: 600  |  batch_dt: 73.87  |  batch_loss: 0.5103  |  lr: 6.01e-04  |
|  step: 7  |  epoch: 87  |  batch_num: 700  |  batch_dt: 74.45  |  batch_loss: 0.4682  |  lr: 7.01e-04  |
|  step: 8  |  epoch: 100  |  batch_num: 800  |  batch_dt: 73.61  |  batch_loss: 0.

In [15]:
# Accuracy on the held-out test set
acc = predictor.accuracy(test_dataset, show_progress=True)
print(f"Test accuracy: {acc:.4f}")

# The individual predictions are available through `predict()`;
# show the first few next to the labels stored in the dataset's "target" column
predictions = predictor.predict(test_dataset)
true_labels = test_dataset.df["target"].to_list()
print("Model predictions (first 10): ", predictions[:10])
print("Correct labels (first 10): ", true_labels[:10])

Predicting:   0%|          | 0/2 [00:00<?, ?it/s]

Test accuracy: 0.9948
Model predictions (first 10):  ['positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'negative']
Correct labels (first 10):  ['positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'negative']
