Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate JSON from dict #881

Draft
wants to merge 10 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 30 additions & 3 deletions docs/reference/json.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,11 @@ print(result)

Outlines provides [custom Pydantic types](types.md) so you do not have to write regular expressions for common types, such as phone numbers or zip codes.

## Using a JSON Schema
## Using a String of a JSON Schema

Instead of a Pydantic model you can pass a string that represents a [JSON Schema](https://json-schema.org/) specification to `generate.json`:

```python
from pydantic import BaseModel

from outlines import models
from outlines import generate

Expand All @@ -82,6 +80,35 @@ print(result)
# User(name="John", last_name="Doe", id=11)
```

## Using a Dictionary of a JSON Schema

You can also pass in a dictionary that represents a [JSON Schema](https://json-schema.org/) specification to `generate.json`:

```python
from outlines import models
from outlines import generate

model = models.transformers("mistralai/Mistral-7B-v0.1")

schema_dict = {
"title": "User",
"type": "object",
"properties": {
"name": {"type": "string"},
"last_name": {"type": "string"},
"id": {"type": "integer"}
}
}

generator = generate.json(model, schema_dict)
result = generator(
"Create a user profile with the fields name, last_name and id"
)
print(result)
# {'name': 'John', 'last_name': 'Doe', 'id': 11}
```


## From a function's signature

Outlines can infer the structure of the output from the signature of a function. The result is a dictionary, and can be passed directly to the function using the usual dictionary expansion syntax `**`:
Expand Down
18 changes: 12 additions & 6 deletions outlines/generate/json.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import json as pyjson
from functools import singledispatch
from typing import Callable, Optional, Union
from typing import Callable, Dict, Optional, Union

from pydantic import BaseModel

from outlines.fsm.json_schema import build_regex_from_schema, get_schema_from_signature
from outlines.generate.api import SequenceGenerator
from outlines.generate.api import SequenceGenerator, SequenceGeneratorAdapter
from outlines.models import OpenAI
from outlines.samplers import Sampler, multinomial

Expand All @@ -15,10 +15,10 @@
@singledispatch
def json(
model,
schema_object: Union[str, object, Callable],
schema_object: Union[str, object, Callable, Dict],
sampler: Sampler = multinomial(),
whitespace_pattern: Optional[str] = None,
) -> SequenceGenerator:
) -> Union[SequenceGenerator, SequenceGeneratorAdapter]:
"""
Generate structured JSON data with a `Transformer` model based on a specified JSON Schema.

Expand All @@ -39,7 +39,7 @@ def json(

Returns
-------
A `SequenceGenerator` instance that generates text constrained by the schema_object and
A `SequenceGenerator` or `SequenceGeneratorAdapter` instance that generates text constrained by the schema_object and
transforms the result if BaseModel is used.

"""
Expand All @@ -53,6 +53,11 @@ def json(
regex_str = build_regex_from_schema(schema, whitespace_pattern)
generator = regex(model, regex_str, sampler)
generator.format_sequence = lambda x: pyjson.loads(x)
elif isinstance(schema_object, Dict):
schema = pyjson.dumps(schema_object)
regex_str = build_regex_from_schema(schema, whitespace_pattern)
generator = regex(model, regex_str, sampler)
generator.format_sequence = lambda x: pyjson.loads(x)
elif isinstance(schema_object, str):
schema = schema_object
regex_str = build_regex_from_schema(schema, whitespace_pattern)
Expand All @@ -61,7 +66,8 @@ def json(
else:
raise ValueError(
f"Cannot parse schema {schema_object}. The schema must be either "
+ "a Pydantic object, a function or a string that contains the JSON "
+ "a Pydantic object, a function, a dictionary that contains the "
+ "JSON Schema specification, or a string that contains the JSON "
+ "Schema specification"
)

Expand Down
103 changes: 103 additions & 0 deletions tests/generate/test_integration_llamacpp.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import re
from typing import Dict, Optional, Union

import pytest
from pydantic import BaseModel, constr
Expand Down Expand Up @@ -243,6 +244,108 @@ def test_llamacpp_json_schema(model):
assert isinstance(result["bar"], str)


def test_llamacpp_json_dict(model):
    """Generating JSON from a dict JSON Schema yields a dict whose fields
    conform to that schema (llama.cpp backend).
    """
    prompt = "<|im_start|>user\nOutput some JSON<|im_end|>\n<|im_start|>assistant\n"

    schema_dict = {
        "properties": {
            "user_id": {
                "anyOf": [{"type": "integer"}, {"type": "null"}],
                "title": "User Id",
            },
            "name": {
                "additionalProperties": {"type": "integer"},
                "title": "Name",
                "type": "object",
            },
            "password": {
                "anyOf": [{"type": "string"}, {"type": "integer"}],
                "title": "Password",
            },
        },
        "required": ["user_id", "name", "password"],
        "title": "UserPydantic",
        "type": "object",
    }

    result = generate.json(model, schema_dict, whitespace_pattern="")(
        prompt, max_tokens=100, temperature=0, seed=10
    )
    assert isinstance(result, dict)
    # Assertions must match the schema above: the previous `foo`/`bar` checks
    # belonged to a different schema and would raise KeyError.
    assert result["user_id"] is None or isinstance(result["user_id"], int)
    assert isinstance(result["name"], dict)
    assert isinstance(result["password"], (str, int))


def test_json_equivalence(model):
    """Test that all methods of generating from json create the same fsm."""

    # Four equivalent schema specifications: callable, Pydantic model,
    # dict, and raw JSON string.
    def user_callable(
        user_id: Optional[int], name: Dict[str, int], password: Union[str, int]
    ):
        pass

    class UserPydantic(BaseModel):
        user_id: Optional[int]
        name: Dict[str, int]
        password: Union[str, int]

    user_dict = {
        "properties": {
            "user_id": {
                "anyOf": [{"type": "integer"}, {"type": "null"}],
                "title": "User Id",
            },
            "name": {
                "additionalProperties": {"type": "integer"},
                "title": "Name",
                "type": "object",
            },
            "password": {
                "anyOf": [{"type": "string"}, {"type": "integer"}],
                "title": "Password",
            },
        },
        "required": ["user_id", "name", "password"],
        "title": "User",
        "type": "object",
    }

    user_str = '{"properties": {"user_id": {"anyOf": [{"type": "integer"}, {"type": "null"}], "title": "User Id"}, "name": {"additionalProperties": {"type": "integer"}, "title": "Name", "type": "object"}, "password": {"anyOf": [{"type": "string"}, {"type": "integer"}], "title": "Password"}}, "required": ["user_id", "name", "password"], "title": "User", "type": "object"}'

    # Build one generator per schema representation.
    generators = [
        generate.json(model, schema)
        for schema in (user_callable, UserPydantic, user_dict, user_str)
    ]

    # Every FSM attribute must agree across all four construction paths.
    for attr in (
        "states_to_token_maps",
        "empty_token_ids",
        "eos_token_id",
        "final_states",
    ):
        values = [getattr(gen.logits_processor.fsm, attr) for gen in generators]
        assert all(value == values[0] for value in values)


def test_llamacpp_cfg(model):
prompt = "<|im_start|>user\nOutput a short and valid JSON object with two keys.<|im_end|>\n><|im_start|>assistant\n"
result = generate.cfg(model, grammars.arithmetic)(prompt, seed=11)
Expand Down
109 changes: 108 additions & 1 deletion tests/generate/test_integration_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re
from enum import Enum
from importlib import reload
from typing import List, Union
from typing import Dict, List, Optional, Union

import pytest
import torch
Expand Down Expand Up @@ -442,6 +442,41 @@ class Spam(BaseModel):
assert isinstance(result[1][1], BaseModel)


def test_transformers_json_dict():
    """Generating JSON from a dict JSON Schema yields a dict whose fields
    conform to that schema (transformers backend).
    """
    model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"
    model = models.transformers(model_name, device="cpu")
    prompt = "Output some JSON "

    schema_dict = {
        "properties": {
            "user_id": {
                "anyOf": [{"type": "integer"}, {"type": "null"}],
                "title": "User Id",
            },
            "name": {
                "additionalProperties": {"type": "integer"},
                "title": "Name",
                "type": "object",
            },
            "password": {
                "anyOf": [{"type": "string"}, {"type": "integer"}],
                "title": "Password",
            },
        },
        "required": ["user_id", "name", "password"],
        "title": "UserPydantic",
        "type": "object",
    }

    rng = torch.Generator()
    rng.manual_seed(0)  # deterministic sampling for reproducible assertions

    result = generate.json(model, schema_dict)(prompt, max_tokens=500, rng=rng)
    assert isinstance(result, dict)
    # Assertions must match the schema above: the previous `foo`/`bar` checks
    # belonged to a different schema and would raise KeyError.
    assert result["user_id"] is None or isinstance(result["user_id"], int)
    assert isinstance(result["name"], dict)
    assert isinstance(result["password"], (str, int))


def test_transformers_json_str_enum():
model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"
model = models.transformers(model_name, device="cpu")
Expand Down Expand Up @@ -544,6 +579,78 @@ def function(foo: int, bar: List[int]):
assert isinstance(function(**sequence), int)


def test_json_equivalence():
    """Test that all methods of generating from json create the same fsm."""

    # Four equivalent schema specifications: callable, Pydantic model,
    # dict, and raw JSON string.
    def user_callable(
        user_id: Optional[int], name: Dict[str, int], password: Union[str, int]
    ):
        pass

    class UserPydantic(BaseModel):
        user_id: Optional[int]
        name: Dict[str, int]
        password: Union[str, int]

    user_dict = {
        "properties": {
            "user_id": {
                "anyOf": [{"type": "integer"}, {"type": "null"}],
                "title": "User Id",
            },
            "name": {
                "additionalProperties": {"type": "integer"},
                "title": "Name",
                "type": "object",
            },
            "password": {
                "anyOf": [{"type": "string"}, {"type": "integer"}],
                "title": "Password",
            },
        },
        "required": ["user_id", "name", "password"],
        "title": "User",
        "type": "object",
    }

    user_str = '{"properties": {"user_id": {"anyOf": [{"type": "integer"}, {"type": "null"}], "title": "User Id"}, "name": {"additionalProperties": {"type": "integer"}, "title": "Name", "type": "object"}, "password": {"anyOf": [{"type": "string"}, {"type": "integer"}], "title": "Password"}}, "required": ["user_id", "name", "password"], "title": "User", "type": "object"}'

    # Build one generator per schema representation.
    model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"
    model = models.transformers(model_name, device="cpu")
    generators = [
        generate.json(model, schema)
        for schema in (user_callable, UserPydantic, user_dict, user_str)
    ]

    # Every FSM attribute must agree across all four construction paths.
    for attr in (
        "states_to_token_maps",
        "empty_token_ids",
        "eos_token_id",
        "final_states",
    ):
        values = [getattr(gen.fsm, attr) for gen in generators]
        assert all(value == values[0] for value in values)


def test_transformers_logits_vocab_size():
model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"
model = models.transformers(model_name, device="cpu")
Expand Down
Loading
Loading