In [1]:
import glob
import os
import re
import sqlite3
import time
from typing import Any, Dict, List, Optional

import openai
from openai.error import Timeout as OpenAITimeout
import pandas as pd
from pandas.io.sql import DatabaseError  # type: ignore
from requests.exceptions import Timeout as RequestsTimeout
import sqlglot
from sqlglot import expressions, Expression
import tiktoken


# Directory (relative to the notebook) that holds both the questions file and the databases.
SET = "bird-data-train"
# Questions file; loaded with pd.read_json further down.
JSON_FILE = "./" + SET + "/train.json"
# Parent folder globbed by read_in_all_sqlite_dbs(); each sub-directory is treated as one database.
DB_DIR = "./" + SET + "/train_databases"

In [2]:
# Shared tiktoken encoder; the token-counting helpers below read this module-level name.
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Approximate the token count of a list of chat messages.

    Args:
        messages: iterable of dict-like messages; every value of every
            message is encoded with the module-level ``encoding``.
        tokens_per_message: fixed overhead added once per message.
        tokens_per_name: extra overhead added when a message has a "name" key.

    Returns:
        The per-message overheads plus the encoded length of every value,
        plus a constant 3 added once per call (taken from the cookbook recipe
        referenced above).
    """
    total = 0
    for msg in messages:
        total += tokens_per_message
        for field, text in msg.items():
            name_overhead = tokens_per_name if field == "name" else 0
            total += len(encoding.encode(text)) + name_overhead
    return total + 3

In [3]:
def compute_tokens(row):
    """Return the estimated token count of ``row['user_prompt']`` sent as a single user message."""
    prompt_message = {"role": "user", "content": row['user_prompt']}
    return num_tokens_from_messages([prompt_message])

def _format_sql_value(value: Any) -> str:
    """Render one cell of a sampled row as it appears inside an INSERT statement."""
    if isinstance(value, str):
        # Flatten newlines, swap single quotes for double quotes so the value
        # can be wrapped in single quotes, and cap the length at 100 characters.
        text = value.replace('\n', ' ').replace("'", '"')[:100]
        return f"'{text}'"
    if value is None:
        return "NULL"
    return str(value)


def _dump_schema_and_samples(db_path: str, num_rows: int) -> List[str]:
    """Return, per table, its condensed CREATE statement followed by up to ``num_rows`` INSERT statements."""
    conn = sqlite3.connect(db_path, timeout=30)
    try:
        cursor = conn.cursor()
        try:
            # Query the sqlite_master table to get the CREATE TABLE statements
            cursor.execute(
                "SELECT name, sql FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"
            )
            tables = cursor.fetchall()

            statements: List[str] = []
            for table_name, create_statement in tables:
                # "INTEGER" -> "INT"
                create_statement = create_statement.replace("INTEGER", "INT")
                # Remove `--` comments (everything from `--` to the end of the line)
                create_statement = re.sub(r'--.*$', '', create_statement, flags=re.MULTILINE)
                # Condense all whitespace (including newlines and blank lines) to single spaces
                create_statement = " ".join(create_statement.split())
                statements.append(create_statement + ";")

                # Table names containing a space or dash are wrapped in double quotes
                if ' ' in table_name or '-' in table_name:
                    formatted_table_name = f'"{table_name}"'
                else:
                    formatted_table_name = table_name

                # Fetch sample data and turn each row into an INSERT statement
                cursor.execute(f"SELECT * FROM `{table_name}` LIMIT ?", (num_rows,))
                for row in cursor.fetchall():
                    values_str = ",".join(_format_sql_value(value) for value in row)
                    statements.append(
                        f"INSERT INTO {formatted_table_name} VALUES ({values_str});"
                    )
            return statements
        finally:
            # Release the cursor/connection even when a query above raises.
            cursor.close()
    finally:
        conn.close()


def get_create_table_and_data(db_path: str, num_rows: int = 5, max_tokens: int = 3550) -> List[str]:
    """Dump a SQLite database as CREATE/INSERT statements that fit a token budget.

    For every user table the CREATE statement (comments stripped, whitespace
    condensed, ``INTEGER`` shortened to ``INT``) is emitted, followed by up to
    ``num_rows`` sample rows as INSERT statements. The statement list is then
    cut to the longest leading run whose newline-joined text stays strictly
    below ``max_tokens`` according to ``num_tokens_from_messages``.

    Args:
        db_path: path to the SQLite file.
        num_rows: maximum number of sample rows per table.
        max_tokens: token budget; the default is the limit required so OpenAI
            doesn't complain after the statements are reformatted.

    Returns:
        A list of SQL statement strings (possibly truncated, possibly empty).

    Raises:
        ValueError: only when ``num_rows`` is negative, because the loop below
            never runs in that case.

    Note:
        A retry with fewer rows happens only when not even the first statement
        fits the budget; as soon as at least one statement fits, the truncated
        list is returned. With ``num_rows == 0`` the (possibly empty) list is
        returned as is.
    """
    while num_rows >= 0:
        output_statements = _dump_schema_and_samples(db_path, num_rows)

        # Greedily keep statements while the accumulated text is under budget.
        final_statements: List[str] = []
        for statement in output_statements:
            test_statements = final_statements + [statement]
            msgs = [{"role": "user", "content": "\n".join(test_statements)}]
            if num_tokens_from_messages(msgs) < max_tokens:
                final_statements = test_statements
            else:
                break

        if num_rows == 0 or final_statements:
            return final_statements
        num_rows -= 1
    raise ValueError("Even with 0 rows, token count is too high!")


def clean_creates(sql_text: str) -> str:
    """Parse one SQL statement with sqlglot and return it without column constraints or primary keys.

    Every ``ColumnConstraint`` and ``PrimaryKey`` node in the parsed tree is
    removed; foreign keys are deliberately kept. The author's rationale: these
    fields are needed so rarely in data-science work that they are excluded.
    """
    dropped_node_types = (
        expressions.ColumnConstraint,
        # expressions.ForeignKey,
        expressions.PrimaryKey,
    )

    def replace_(node: Expression) -> Optional[Expression]:
        # Returning None from the transform callback drops the node.
        return None if isinstance(node, dropped_node_types) else node

    parsed = sqlglot.parse_one(sql_text)
    return str(parsed.transform(replace_))


def hard_replace__clean_creates(sql_text: str):
    """Patch a statement with literal replacements, then run it through ``clean_creates``.

    Backticks become double quotes (per the original author, the two are always
    equivalent in bird, and sqlglot cannot yet handle the backticks). A handful
    of exact substrings are deleted before parsing. If anything fails, the
    original ``sql_text`` is printed and the exception is re-raised.
    """
    # Applied in this order; matching is exact and case-sensitive.
    substitutions = (
        ("`", '"'),
        ("WITHOUT ROWID", ""),
        ("on update cascade", ""),
        ("ON UPDATE CASCADE", ""),
        ("on delete cascade", ""),
        ("ON DELETE CASCADE", ""),
        ("references staff", ""),
    )
    try:
        patched = sql_text
        for old, new in substitutions:
            patched = patched.replace(old, new)
        return clean_creates(patched)  # .sql()
    except Exception:
        print(sql_text)
        raise


def read_in_all_sqlite_dbs():
    """Read in all the sqlite databases from the bird data.

    Every sub-directory ``<name>`` of ``DB_DIR`` is expected to contain a file
    ``<name>.sqlite``. Plain files directly under ``DB_DIR`` are skipped.

    Returns:
        A list of ``(dbname, statement)`` tuples, one per statement returned by
        ``get_create_table_and_data``, each passed through
        ``hard_replace__clean_creates``. Directories are visited in sorted order.

    Raises:
        FileNotFoundError: if a database directory has no matching ``.sqlite``
            file. Without this check ``sqlite3.connect`` would silently create
            an empty database at that path.
    """
    statements = []
    for d in sorted(glob.glob(DB_DIR + "/*")):
        if os.path.isfile(d):
            continue
        # basename instead of splitting on "/" so platform path separators are handled
        dbname = os.path.basename(d)
        sqlite_db_path = os.path.join(d, dbname + ".sqlite")
        if not os.path.exists(sqlite_db_path):
            raise FileNotFoundError(f"DB {sqlite_db_path} does not exist!")
        for ddl in get_create_table_and_data(sqlite_db_path):
            statements.append((dbname, hard_replace__clean_creates(ddl)))

    return statements


def make_x(tables, db_id, ideal_sql):
    """Bundle one training example: returns ``(tables, db_id, ideal_sql)`` unchanged, as a tuple."""
    example = (tables, db_id, ideal_sql)
    return example

In [4]:
# Collect (db name, statement) tuples for every database directory under DB_DIR.
ddl_statements = read_in_all_sqlite_dbs()

# One row per statement, then collapse to one row per database with all of
# its statements joined by newlines.
df_ddl = pd.DataFrame(ddl_statements)
df_ddl.columns = ["db_id", "ddl"]
df_ddl = df_ddl.groupby("db_id")["ddl"].agg(lambda x: "\n".join(x)).reset_index(name="ddl")

def format_ddl(ddl_str):
    """Compress a database's CREATE/INSERT statements into a compact prompt format.

    The input is split on "CREATE TABLE" (case-insensitive). Each section
    becomes ``<table> (<columns>);`` and, when sample rows exist, a single
    ``INSERT INTO <table> VALUES`` header followed by one ``(<values>)`` line
    per row.

    Caveats kept from the original behaviour:
      * every "(" and ")" is removed from the column list and from the row
        values, so parentheses inside types, foreign keys or data are lost;
      * sections containing the text "VARBINARY" get no sample rows.

    Args:
        ddl_str: newline-joined statements for one database.

    Returns:
        The formatted sections joined by newlines ("" for blank input).
    """
    formatted_ddls = []

    # Split the ddl_str by "CREATE TABLE"
    for section in re.split(r'(?i)CREATE TABLE', ddl_str):
        if not section.strip():
            continue

        # Extract the (optionally double-quoted) table name at the start of the section
        table_name_match = re.search(r'^\s*("?[\w\s-]+"?|[\w\s-]+)', section)
        table_name = table_name_match.group(1).strip() if table_name_match else "Unknown Table"

        # Everything before the first "INSERT INTO" is the column definition
        header, *insert_chunks = section.split("INSERT INTO")

        # Drop the leading table name and all parentheses, then condense whitespace
        columns = re.sub(r'^\s*' + re.escape(table_name), '', header).strip().replace("(", "").replace(")", "")
        columns = " ".join(columns.split())

        # Strip the leading `<table> VALUES` from each INSERT. The name may or
        # may not be double-quoted in the INSERT regardless of how it appears
        # in the CREATE, so accept both forms.
        bare_name = table_name.strip("\"")
        insert_prefix = re.compile(r'^\s*"?' + re.escape(bare_name) + r'"?\s+VALUES')
        rows = []
        for chunk in insert_chunks:
            row = insert_prefix.sub('', chunk).strip()
            # Remove parentheses from the INSERT statements
            rows.append(row.replace("(", "").replace(")", ""))

        formatted_ddl = table_name + " (" + columns + ");"
        # Only emit the INSERT block when there is at least one row to show;
        # otherwise the output would contain an empty "();" row.
        if rows and 'VARBINARY' not in section:
            formatted_ddl += "\nINSERT INTO " + table_name + " VALUES\n(" + ")\n(".join(rows) + ");"
        formatted_ddls.append(formatted_ddl)

    # Join all the formatted sections with newline characters
    return "\n".join(formatted_ddls)

# Apply the formatting function to the 'ddl' column
df_ddl['ddl'] = df_ddl['ddl'].apply(format_ddl)

df_question = pd.read_json(JSON_FILE)
joined_df = pd.merge(df_ddl, df_question, on=["db_id"])
joined_df.head()

Unnamed: 0,db_id,ddl,question,evidence,SQL
0,address,"CBSA (CBSA INT, CBSA_name TEXT, CBSA_type TEXT...",What is the total number of households in Arec...,"""ARECIBO"" is the county; total number of house...",SELECT SUM(T1.households) FROM zip_data AS T1 ...
1,address,"CBSA (CBSA INT, CBSA_name TEXT, CBSA_type TEXT...",Which residential area in Arecibo county has t...,"""ARECIBO"" is the county; highest average house...",SELECT T1.zip_code FROM zip_data AS T1 INNER J...
2,address,"CBSA (CBSA INT, CBSA_name TEXT, CBSA_type TEXT...",Please list the numbers of males in all the re...,"""ARECIBO"" is the county; number of males refer...",SELECT SUM(T1.male_population) FROM zip_data A...
3,address,"CBSA (CBSA INT, CBSA_name TEXT, CBSA_type TEXT...","Among all the residential areas in Delaware, h...","""Delaware"" is a county; implement daylight sav...",SELECT COUNT(T1.zip_code) FROM zip_data AS T1 ...
4,address,"CBSA (CBSA INT, CBSA_name TEXT, CBSA_type TEXT...",Among all the residential areas in Arecibo cou...,"""ARECIBO"" is the county; highest white populat...",SELECT T1.zip_code FROM zip_data AS T1 INNER J...


In [5]:
# Eyeball the formatted schema text of the first joined row.
print(joined_df.iloc[0]['ddl'])

CBSA (CBSA INT, CBSA_name TEXT, CBSA_type TEXT);
INSERT INTO CBSA VALUES
(10300, 'Adrian, MI', 'Micro')
(10380, 'Aguadilla-Isabela, PR', 'Metro')
(10420, 'Akron, OH', 'Metro')
(10500, 'Albany, GA', 'Metro')
(10580, 'Albany-Schenectady-Troy, NY', 'Metro');
state (abbreviation TEXT, name TEXT);
INSERT INTO state VALUES
('AA', 'Armed Forces Americas')
('AE', 'Armed Forces Europe')
('AK', 'Alaska')
('AL', 'Alabama')
('AP', 'Armed Forces Pacific');
congress (cognress_rep_id TEXT, first_name TEXT, last_name TEXT, CID TEXT, party TEXT, state TEXT, abbreviation TEXT, House TEXT, District INT, land_area FLOAT, FOREIGN KEY abbreviation REFERENCES state abbreviation);
INSERT INTO congress VALUES
('AK', 'Young', 'Don', 'N00008091', 'Republican', 'Alaska', 'AK', 'House of Repsentatives', NULL, 571951.26)
('AK-S1', 'Begich', 'Mark', 'N00009585', 'Democrat', 'Alaska', 'AK', 'Senate', NULL, 570641.0)
('AK-S2', 'Murkowski', 'Lisa', 'N00033101', 'Republican', 'Alaska', 'AK', 'Senate', NULL, 570641.0)
('

In [6]:
# Build one training example per joined row: the prompt is the formatted schema,
# then the question, then the evidence text as a NOTE; the target is the gold SQL.
df = pd.DataFrame(
    joined_df.apply(
        lambda x: make_x(
            x["ddl"]
            + "\n## The user has asked:\n"
            + x["question"]
            + "\nNOTE: "
            + x['evidence'],
            x["db_id"],
            x['SQL']
        ),
        axis=1,
    ).tolist()
)
# Column order matches the tuple returned by make_x.
df.columns = [
    "user_prompt",
    "db_id",
    "ideal_assistant_response"
]

# Drop every row whose gold SQL equals the example query below
# (presumably held out because it is used as an example elsewhere — TODO confirm).
example_gold_sql= "SELECT T1.Rating, COUNT(T2.Sentiment_Polarity) FROM playstore AS T1 INNER JOIN user_reviews AS T2 ON T1.App = T2.App WHERE T1.App = 'Dragon Ball Legends' AND CAST(Sentiment_Polarity AS INTEGER) < -0.5"
df = df[df['ideal_assistant_response'] != example_gold_sql]

# Remove rows that are identical in all three columns.
df = df.drop_duplicates()

# Estimated token count of the user prompt only (the SQL answer is not counted here).
df['tokens'] = df.apply(compute_tokens, axis=1)

df.head()

Unnamed: 0,user_prompt,db_id,ideal_assistant_response,tokens
0,"CBSA (CBSA INT, CBSA_name TEXT, CBSA_type TEXT...",address,SELECT SUM(T1.households) FROM zip_data AS T1 ...,2089
1,"CBSA (CBSA INT, CBSA_name TEXT, CBSA_type TEXT...",address,SELECT T1.zip_code FROM zip_data AS T1 INNER J...,2097
2,"CBSA (CBSA INT, CBSA_name TEXT, CBSA_type TEXT...",address,SELECT SUM(T1.male_population) FROM zip_data A...,2092
3,"CBSA (CBSA INT, CBSA_name TEXT, CBSA_type TEXT...",address,SELECT COUNT(T1.zip_code) FROM zip_data AS T1 ...,2091
4,"CBSA (CBSA INT, CBSA_name TEXT, CBSA_type TEXT...",address,SELECT T1.zip_code FROM zip_data AS T1 INNER J...,2100


In [7]:
# Verify we dropped the example gold sql: this filter should display an empty frame
df[df['ideal_assistant_response'] == example_gold_sql]

Unnamed: 0,user_prompt,db_id,ideal_assistant_response,tokens


In [8]:
# Show the five examples with the largest prompt token counts.
df.sort_values(by='tokens', ascending=False).head()

Unnamed: 0,user_prompt,db_id,ideal_assistant_response,tokens
2618,"AwardsMisc (name TEXT, ID TEXT, award TEXT, ye...",hockey,SELECT CAST(SUM(T2.height) AS REAL) / COUNT(*)...,3867
2603,"AwardsMisc (name TEXT, ID TEXT, award TEXT, ye...",hockey,SELECT SUM(CASE WHEN T1.year = 2006 THEN CAST(...,3861
2634,"AwardsMisc (name TEXT, ID TEXT, award TEXT, ye...",hockey,"SELECT T2.nameGiven, T2.lastName, T2.birthYear...",3855
2623,"AwardsMisc (name TEXT, ID TEXT, award TEXT, ye...",hockey,"SELECT DISTINCT T3.firstNHL - T1.year, T3.name...",3851
2563,"AwardsMisc (name TEXT, ID TEXT, award TEXT, ye...",hockey,SELECT DISTINCT T1.pos FROM Master AS T1 INNER...,3840


In [9]:
# Spot-check the full prompt text of one example, selected by index label 5066.
print(df.loc[5066]['user_prompt'])

awards_players (playerID TEXT, award TEXT, year INT, lgID TEXT, note TEXT, pos TEXT, FOREIGN KEY playerID REFERENCES players playerID);
INSERT INTO awards_players VALUES
('abdulka01', 'All-Defensive Second Team', 1969, 'NBA', NULL, NULL)
('abdulka01', 'All-NBA Second Team', 1969, 'NBA', NULL, 'C')
('abdulka01', 'Rookie of the Year', 1969, 'NBA', NULL, NULL)
('abdulka01', 'All-Defensive Second Team', 1970, 'NBA', NULL, NULL)
('abdulka01', 'All-NBA First Team', 1970, 'NBA', NULL, 'C');
coaches (coachID TEXT, year INT, tmID TEXT, lgID TEXT, stint INT, won INT, lost INT, post_wins INT, post_losses INT, FOREIGN KEY tmID, year REFERENCES teams tmID, year);
INSERT INTO coaches VALUES
('adelmri01', 1988, 'POR', 'NBA', 2, 14, 21, 0, 3)
('adelmri01', 1989, 'POR', 'NBA', 1, 59, 23, 12, 9)
('adelmri01', 1990, 'POR', 'NBA', 1, 63, 19, 9, 7)
('adelmri01', 1991, 'POR', 'NBA', 1, 57, 25, 13, 8)
('adelmri01', 1992, 'POR', 'NBA', 1, 51, 31, 1, 3);
draft (id INT, draftYear INT, draftRound INT, draftSelec

In [10]:
# Rough cost of one pass over the prompts: total prompt tokens at 0.008 per 1K tokens
# (the same rate as PRICE_PER_1K_TOKENS in the pricing cell further down).
df['tokens'].sum() / 1000.0 * .008

132.351176

In [11]:
import json

file_name = "fine_tuning_bird_qna_take_two_training_data.jsonl"

# Write one JSON object per line: a user turn holding the prompt followed by
# an assistant turn holding the gold SQL.
with open(file_name, 'w') as outfile:
    for row in df.itertuples():
        record = {
            "messages": [
                {"role": "user", "content": row.user_prompt},
                {"role": "assistant", "content": row.ideal_assistant_response},
            ]
        }
        outfile.write(json.dumps(record) + '\n')

In [12]:
# Read the JSONL file back so the checks below run on exactly what will be uploaded.
with open(file_name, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 9424
First example:
{'role': 'user', 'content': 'CBSA (CBSA INT, CBSA_name TEXT, CBSA_type TEXT);\nINSERT INTO CBSA VALUES\n(10300, \'Adrian, MI\', \'Micro\')\n(10380, \'Aguadilla-Isabela, PR\', \'Metro\')\n(10420, \'Akron, OH\', \'Metro\')\n(10500, \'Albany, GA\', \'Metro\')\n(10580, \'Albany-Schenectady-Troy, NY\', \'Metro\');\nstate (abbreviation TEXT, name TEXT);\nINSERT INTO state VALUES\n(\'AA\', \'Armed Forces Americas\')\n(\'AE\', \'Armed Forces Europe\')\n(\'AK\', \'Alaska\')\n(\'AL\', \'Alabama\')\n(\'AP\', \'Armed Forces Pacific\');\ncongress (cognress_rep_id TEXT, first_name TEXT, last_name TEXT, CID TEXT, party TEXT, state TEXT, abbreviation TEXT, House TEXT, District INT, land_area FLOAT, FOREIGN KEY abbreviation REFERENCES state abbreviation);\nINSERT INTO congress VALUES\n(\'AK\', \'Young\', \'Don\', \'N00008091\', \'Republican\', \'Alaska\', \'AK\', \'House of Repsentatives\', NULL, 571951.26)\n(\'AK-S1\', \'Begich\', \'Mark\', \'N00009585\', \'Democrat\'

In [13]:
from collections import defaultdict

# Format error checks
# Counts problems per category over the whole dataset; nothing is raised or fixed here.
format_errors = defaultdict(int)

for ex in dataset:
    # Each example must be a JSON object...
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # ...with a non-empty "messages" list.
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        # Every message needs both keys.
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        # Only these four keys are accepted.
        if any(k not in ("role", "content", "name", "function_call") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        # Content must be a string, and either it or a function_call must be non-empty.
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    # An example without any assistant message has nothing to learn from.
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [14]:
def num_assistant_tokens_from_messages(messages):
    """Return the total encoded length of the ``content`` of every assistant message."""
    return sum(
        len(encoding.encode(msg["content"]))
        for msg in messages
        if msg["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print summary statistics of ``values`` under a heading that includes ``name``.

    Prints min/max, mean/median, and the 10th/90th percentiles. The last line
    was previously labelled "p5 / p95" although the quantiles computed are 0.1
    and 0.9; the label now matches the numbers.

    Args:
        values: non-empty sequence of numbers.
        name: label shown in the heading.
    """
    # Local import so the helper does not depend on a later cell having imported numpy.
    import numpy as np

    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p10 / p90: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [15]:
import numpy as np

# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []  # number of messages per example
convo_lens = []  # estimated total tokens per example (all messages)
assistant_message_lens = []  # tokens in assistant messages only

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
# Count examples whose estimated length exceeds 4096 tokens.
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 9424
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 2, 2
mean / median: 2.0, 2.0
p5 / p95: 2.0, 2.0

#### Distribution of num_total_tokens_per_example:
min / max: 256, 4015
mean / median: 1810.1644736842106, 1686.0
p5 / p95: 609.3000000000001, 3326.7000000000007

#### Distribution of num_assistant_tokens_per_example:
min / max: 4, 278
mean / median: 50.65757640067912, 49.0
p5 / p95: 18.0, 80.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [22]:
# Pricing and default n_epochs estimate
# Examples longer than this are billed as if they were this long (see the min() below).
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

# Start from TARGET_EPOCHS and adjust so that examples * epochs stays roughly
# between MIN_TARGET_EXAMPLES and MAX_TARGET_EXAMPLES, within the epoch bounds.
n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

# convo_lens comes from the statistics cell above; each example is capped at MAX_TOKENS_PER_EXAMPLE.
n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

# Price used for the estimate, per 1,000 tokens.
PRICE_PER_1K_TOKENS = 0.008

print(f"This will cost ~${n_epochs * n_billing_tokens_in_dataset / 1000 * PRICE_PER_1K_TOKENS:.2f} to train")

Dataset has ~17058990 tokens that will be charged for during training
By default, you'll train for 2 epochs on this dataset
By default, you'll be charged for ~34117980 tokens
This will cost ~$272.94 to train


In [19]:
# Upload the training JSONL for fine-tuning. The file is opened in a `with`
# block so the handle is closed once the upload call returns (it was
# previously opened inline and never closed).
with open(file_name, "rb") as training_file:
    upload_response = openai.File.create(
      file=training_file,
      purpose='fine-tune'
    )
upload_response

<File file id=file-yt2tzTmOhKmbL4clWJ9ZmWAR at 0x14d770710> JSON: {
  "object": "file",
  "id": "file-yt2tzTmOhKmbL4clWJ9ZmWAR",
  "purpose": "fine-tune",
  "filename": "file",
  "bytes": 47915791,
  "created_at": 1698802653,
  "status": "processed",
  "status_details": null
}

In [20]:
# Keep the uploaded file's id; the fine-tuning job below is created from it.
file_id = upload_response.id
file_id

'file-yt2tzTmOhKmbL4clWJ9ZmWAR'

In [23]:
# Start a fine-tuning job on the uploaded file; the suffix is passed to label the job.
fine_tune_response = openai.FineTuningJob.create(training_file=file_id, model="gpt-3.5-turbo", suffix="bird_qna_take_two")
fine_tune_response

<FineTuningJob fine_tuning.job id=ftjob-EPDJqgOoHYfRBV1s727wPwIr at 0x1274d5d90> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-EPDJqgOoHYfRBV1s727wPwIr",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1698802777,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-ePmgB4qVo14GgUKdUQci6IGz",
  "result_files": [],
  "status": "validating_files",
  "validation_file": null,
  "training_file": "file-yt2tzTmOhKmbL4clWJ9ZmWAR",
  "hyperparameters": {
    "n_epochs": "auto"
  },
  "trained_tokens": null,
  "error": null
}

In [24]:
# Retrieve the state of a fine-tune
# NOTE(review): the job id is hard-coded (it matches the id in the recorded output of the
# create call above); after creating a new job this still polls the old one — update the id by hand.
openai.FineTuningJob.retrieve("ftjob-EPDJqgOoHYfRBV1s727wPwIr")

<FineTuningJob fine_tuning.job id=ftjob-EPDJqgOoHYfRBV1s727wPwIr at 0x1274d7d10> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-EPDJqgOoHYfRBV1s727wPwIr",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1698802777,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-ePmgB4qVo14GgUKdUQci6IGz",
  "result_files": [],
  "status": "validating_files",
  "validation_file": null,
  "training_file": "file-yt2tzTmOhKmbL4clWJ9ZmWAR",
  "hyperparameters": {
    "n_epochs": "auto"
  },
  "trained_tokens": null,
  "error": null
}