In [98]:
import json
import types
from typing import Optional

import pandas as pd
from pandas_to_pydantic import (
    dataframe_to_pydantic,
    expand_annotation,
    get_base_fields,
    get_list_fields,
    get_root_list,
)
from pydantic import BaseModel, RootModel
from pydantic._internal._model_construction import ModelMetaclass

from pandas_to_pydantic_test.config import BOOK_DATA_PATH, DATA_PATH, LIBRARY_DATA_PATH
from pandas_to_pydantic_test.libraryTypes import Library

In [99]:
libraryData = pd.read_csv(LIBRARY_DATA_PATH)
bookData = pd.read_csv(BOOK_DATA_PATH)

In [100]:
libraryData.to_dict("records")[0]

{'LibraryID': 1,
 'LibraryName': 'City Central Library',
 'Location': 'Cityville',
 'EstablishedYear': 1950,
 'BookCollectionSize': 50000,
 'AuthorID': 1,
 'AuthorName': 'J.K. Rowling',
 'AuthorBirthdate': '1965-07-31',
 'BookID': 1,
 'Title': "Harry Potter and the Philosopher's Stone",
 'Genre': 'Fantasy',
 'PublishedYear': 1997,
 'AvailableCopies': 5}

In [170]:
class Author(BaseModel):
    AuthorID: int
    AuthorName: str
    AuthorBirthdate: str


class Book(BaseModel):
    BookID: int
    Title: str
    Genre: str
    PublishedYear: int
    AvailableCopies: int
    Author: Author

In [131]:
class ModelColumns(BaseModel):
    name: str
    id_column: str | None
    base_columns: list[str]
    list_columns: list["ModelColumns"]
    child_columns: list["ModelColumns"]

In [102]:
annotations = Library.__annotations__.copy()

In [103]:
annotations

{'LibraryID': int,
 'LibraryName': str,
 'Location': str,
 'EstablishedYear': int,
 'BookCollectionSize': int,
 'AuthorList': list[pandas_to_pydantic_test.libraryTypes.Author]}

In [184]:
def get_model_columns(
    model: ModelMetaclass,
    id_column_map: dict[str, str | None] = {},
    name: str | None = None,
) -> ModelColumns:
    if not model.__base__ == BaseModel:
        error_message = f"{model} is not a BaseModel"
        raise TypeError(error_message)

    if name is None:
        name = model.__name__
    id_column = id_column_map.get(name)
    annotations = model.__annotations__

    base_columns = []
    list_columns = []
    child_columns = []

    for field_name, field_type in annotations.items():
        if isinstance(field_type, types.GenericAlias):
            if field_type.__origin__ == list:
                list_columns.append(
                    get_model_columns(field_type.__args__[0], id_column_map, field_name)
                )
        elif isinstance(field_type, ModelMetaclass):
            if field_type.__base__ == BaseModel:
                child_columns.append(get_model_columns(field_type, id_column_map))
        else:
            base_columns.append(field_name)

    return ModelColumns(
        name=name,
        id_column=id_column,
        base_columns=base_columns,
        list_columns=list_columns,
        child_columns=child_columns,
    )

In [187]:
class GrandchildModel(BaseModel):
    grand_child_string: str
    grand_child_integer: int


class ChildModel(BaseModel):
    child_string: str
    child_integer: int
    child_list_grand_child: list[GrandchildModel]


class ParentModel(BaseModel):
    parent_string: str
    parent_integer: int
    parent_float: float
    parent_list_child: list[ChildModel]

In [189]:
get_model_columns(ParentModel).model_dump()

{'name': 'ParentModel',
 'id_column': None,
 'base_columns': ['parent_string', 'parent_integer', 'parent_float'],
 'list_columns': [{'name': 'parent_list_child',
   'id_column': None,
   'base_columns': ['child_string', 'child_integer'],
   'list_columns': [{'name': 'child_list_grand_child',
     'id_column': None,
     'base_columns': ['grand_child_string', 'grand_child_integer'],
     'list_columns': [],
     'child_columns': []}],
   'child_columns': []}],
 'child_columns': []}

In [186]:
get_model_columns(
    Library, {"Library": "LibraryID", "AuthorList": "AuthorID", "Book": "BookID"}
).model_dump()

{'name': 'Library',
 'id_column': 'LibraryID',
 'base_columns': ['LibraryID',
  'LibraryName',
  'Location',
  'EstablishedYear',
  'BookCollectionSize'],
 'list_columns': [{'name': 'AuthorList',
   'id_column': 'AuthorID',
   'base_columns': ['AuthorID', 'AuthorName', 'AuthorBirthdate'],
   'list_columns': [{'name': 'BookList',
     'id_column': None,
     'base_columns': ['BookID',
      'Title',
      'Genre',
      'PublishedYear',
      'AvailableCopies'],
     'list_columns': [],
     'child_columns': []}],
   'child_columns': []}],
 'child_columns': []}

In [167]:
def serialize_dataframe(data: pd.DataFrame, model_columns: ModelColumns) -> list[dict]:
    new_list = []

    if not model_columns.id_column:
        return data[model_columns.base_columns].to_dict(orient="records")

    for value in data[model_columns.id_column].unique():
        base_dict = {}

        slice_data = data[data[model_columns.id_column] == value]

        base_dict = {**slice_data[model_columns.base_columns].iloc[0].to_dict()}

        for list_model in model_columns.list_columns:
            base_dict[list_model.name] = serialize_dataframe(slice_data, list_model)

        for child_model in model_columns.child_columns:
            # zero index to work around returning a list
            base_dict[child_model.name] = serialize_dataframe(slice_data, child_model)[
                0
            ]

        new_list.append(base_dict)

    return new_list

In [169]:
serialize_dataframe(
    libraryData,
    get_model_columns(Library, {"Library": "LibraryID", "Author": "AuthorID"}),
)[0]

{'LibraryID': 1,
 'LibraryName': 'City Central Library',
 'Location': 'Cityville',
 'EstablishedYear': 1950,
 'BookCollectionSize': 50000,
 'Author': [{'AuthorID': 1,
   'AuthorName': 'J.K. Rowling',
   'AuthorBirthdate': '1965-07-31',
   'Book': [{'BookID': 1,
     'Title': "Harry Potter and the Philosopher's Stone",
     'Genre': 'Fantasy',
     'PublishedYear': 1997,
     'AvailableCopies': 5},
    {'BookID': 2,
     'Title': 'Harry Potter and the Chamber of Secrets',
     'Genre': 'Fantasy',
     'PublishedYear': 1998,
     'AvailableCopies': 3}]},
  {'AuthorID': 5,
   'AuthorName': 'Mark Twain',
   'AuthorBirthdate': '1835-11-30',
   'Book': [{'BookID': 10,
     'Title': 'The Adventures of Tom Sawyer',
     'Genre': 'Adventure',
     'PublishedYear': 1876,
     'AvailableCopies': 2}]}]}

In [171]:
serialize_dataframe(libraryData, get_model_columns(Book, {"Book": "BookID"}))

[{'BookID': 1,
  'Author': {'AuthorID': 1,
   'AuthorName': 'J.K. Rowling',
   'AuthorBirthdate': '1965-07-31'}},
 {'BookID': 2,
  'Author': {'AuthorID': 1,
   'AuthorName': 'J.K. Rowling',
   'AuthorBirthdate': '1965-07-31'}},
 {'BookID': 10,
  'Author': {'AuthorID': 5,
   'AuthorName': 'Mark Twain',
   'AuthorBirthdate': '1835-11-30'}},
 {'BookID': 3,
  'Author': {'AuthorID': 2,
   'AuthorName': 'George Orwell',
   'AuthorBirthdate': '1903-06-25'}},
 {'BookID': 11,
  'Author': {'AuthorID': 6,
   'AuthorName': 'J.R.R. Tolkien',
   'AuthorBirthdate': '1892-01-03'}},
 {'BookID': 5,
  'Author': {'AuthorID': 3,
   'AuthorName': 'Jane Austen',
   'AuthorBirthdate': '1775-12-16'}},
 {'BookID': 7,
  'Author': {'AuthorID': 4,
   'AuthorName': 'Agatha Christie',
   'AuthorBirthdate': '1890-09-15'}},
 {'BookID': 9,
  'Author': {'AuthorID': 5,
   'AuthorName': 'Mark Twain',
   'AuthorBirthdate': '1835-11-30'}},
 {'BookID': 4,
  'Author': {'AuthorID': 2,
   'AuthorName': 'George Orwell',
   'Auth

In [37]:
def expand_annotation(model: ModelMetaclass) -> dict:
    if not model.__base__ == BaseModel:
        error_message = f"{model} is not a BaseModel"
        raise TypeError(error_message)

    annotations = model.__annotations__.copy()

    for field_name, field_type in annotations.items():
        if isinstance(field_type, types.GenericAlias):
            # Only expanding lists
            if field_type.__origin__ == list:
                # Using lists to indicate list structure
                annotations[field_name] = [expand_annotation(field_type.__args__[0])]
        elif isinstance(field_type, ModelMetaclass):
            if field_type.__base__ == BaseModel:
                annotations[field_name] = expand_annotation(field_type)

    return annotations


def split_fields(annotation: dict) -> list[str]:
    base_fields = []
    list_fields = []
    dict_fields = []

    for annotation_name, annotation_type in annotation.items():
        if isinstance(annotation_type, list):
            list_fields.append(annotation_name)
        elif isinstance(annotation_type, dict):
            dict_fields.append(annotation_name)
        else:
            base_fields.append(annotation_name)

    return {
        "base": base_fields,
        "list": list_fields,
        "dict": dict_fields,
    }

In [38]:
model_columns = expand_annotation(Book)
fields = split_fields(expand_annotation(Book))

In [39]:
model_columns

{'BookID': int,
 'Title': str,
 'Genre': str,
 'PublishedYear': int,
 'AvailableCopies': int,
 'Author': {'AuthorID': int, 'AuthorName': str, 'AuthorBirthdate': str}}

In [47]:
fields

{'base': ['BookID', 'Title', 'Genre', 'PublishedYear', 'AvailableCopies'],
 'list': [],
 'dict': ['Author']}

In [41]:
model_columns[fields["dict"][0]]

{'AuthorID': int, 'AuthorName': str, 'AuthorBirthdate': str}

In [42]:
new_list = []
data = libraryData
id_field = fields["base"][0]

In [45]:
slice_data = data[data[id_field] == 1]

In [46]:
slice_data

Unnamed: 0,LibraryID,LibraryName,Location,EstablishedYear,BookCollectionSize,AuthorID,AuthorName,AuthorBirthdate,BookID,Title,Genre,PublishedYear,AvailableCopies
0,1,City Central Library,Cityville,1950,50000,1,J.K. Rowling,1965-07-31,1,Harry Potter and the Philosopher's Stone,Fantasy,1997,5


In [48]:
base_dict = slice_data[fields["base"]].iloc[0].to_dict()

In [49]:
base_dict

{'BookID': 1,
 'Title': "Harry Potter and the Philosopher's Stone",
 'Genre': 'Fantasy',
 'PublishedYear': 1997,
 'AvailableCopies': 5}

In [50]:
model_columns[fields["dict"][0]]

{'AuthorID': int, 'AuthorName': str, 'AuthorBirthdate': str}

In [None]:
# Assumes first field is id

for value in data[id_field].unique():
    slice_data = data[data[id_field] == value]

    base_dict = slice_data[fields["base"]].iloc[0].to_dict()

    if fields["list"]:
        # Only one list field is currently supported
        base_dict[fields["list"][0]] = serialize_dataframe(
            slice_data, model_columns[fields["list"][0]][0]
        )

In [10]:
libraryListRoot = dataframe_to_pydantic(libraryData, Library)

In [11]:
libraryListRoot.root

[Library(LibraryID=1, LibraryName='City Central Library', Location='Cityville', EstablishedYear=1950, BookCollectionSize=50000, AuthorList=[Author(AuthorID=1, AuthorName='J.K. Rowling', AuthorBirthdate='1965-07-31', BookList=[Book(BookID=1, Title="Harry Potter and the Philosopher's Stone", Genre='Fantasy', PublishedYear=1997, AvailableCopies=5), Book(BookID=2, Title='Harry Potter and the Chamber of Secrets', Genre='Fantasy', PublishedYear=1998, AvailableCopies=3)]), Author(AuthorID=5, AuthorName='Mark Twain', AuthorBirthdate='1835-11-30', BookList=[Book(BookID=10, Title='The Adventures of Tom Sawyer', Genre='Adventure', PublishedYear=1876, AvailableCopies=2)])]),
 Library(LibraryID=2, LibraryName='Greenwood Public Library', Location='Greenwood', EstablishedYear=1975, BookCollectionSize=35000, AuthorList=[Author(AuthorID=2, AuthorName='George Orwell', AuthorBirthdate='1903-06-25', BookList=[Book(BookID=3, Title='1984', Genre='Dystopian Fiction', PublishedYear=1949, AvailableCopies=7)]),

In [13]:
with open(DATA_PATH + "test.json", "w") as outfile:
    outfile.write(libraryListRoot.model_dump_json())

In [14]:
libraryJsonRoot = RootModel[list[Library]]

with open(DATA_PATH + "test.json") as f:
    jsonData = json.load(f)