diff --git a/README.md b/README.md index 70b7c44..5c4f5c5 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@

- +

# ORiGAMi - Object Representation through Generative Autoregressive Modelling @@ -14,13 +14,11 @@ Please note: This tool is not officially supported or endorsed by MongoDB, Inc. ## Overview -ORiGAMi is a transformer-based Machine Learning model to directly process semi-structured data such as MongoDB documents or JSON files and make predictions from this data. +ORiGAMi is a transformer-based Machine Learning model for supervised classification from semi-structured data such as MongoDB documents or JSON files. -Typically, when working with semi-structured data in a Machine Learning context, the data needs to be flattened -into a tabular form first. This flattening can be lossy, especially in the presence of arrays and nested objects, and often requires domain expertise to extract meaningful higher-order features from the raw data. This feature extraction step is manual, slow and expensive and doesn't scale well. - -ORiGAMi is a transformer model and follows the trend of many other deep learning models by operating directly on the raw data and discovering meaningful features itself. Preprocessing is fully automated (apart from some hyper-parameters that can improve the model performance). +Typically, when working with semi-structured data in a Machine Learning context, the data needs to be flattened into a tabular format first. This flattening can be lossy, especially in the presence of arrays and nested objects, and often requires domain expertise to extract meaningful higher-order features from the raw data. This feature extraction step is manual, slow and expensive and doesn't scale well. +ORiGAMi circumvents this by directly operating on JSON data. Once a model is trained, it can be used to make predictions on any field in the dataset. 
## Installation diff --git a/assets/origami_logo.jpg b/assets/origami_logo.jpg index 41cf892..63725bb 100644 Binary files a/assets/origami_logo.jpg and b/assets/origami_logo.jpg differ diff --git a/setup.cfg b/setup.cfg index 5f99911..ed4ce50 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = origami -version = 0.1.0 +version = 0.1.3 [options] packages = find: diff --git a/setup.py b/setup.py index 4ee765b..aa79a76 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,67 @@ -from distutils.core import setup +import re + +from setuptools import find_packages, setup + +# Read README for long description +with open("README.md", "r", encoding="utf-8") as fh: + long_description = fh.read() + + # Remove both image and arxiv link sections + long_description = re.sub( + r'<p align="center">(?:\s*<img[^>]*>|\s*\|[^|]*\|)\s*</p>\s*\n?', "", long_description, flags=re.MULTILINE + ) + + # Remove the Disclaimer section (from ## Disclaimer to the next ##) + long_description = re.sub(r"## Disclaimer.*?(?=## \w+)", "", long_description, flags=re.DOTALL) setup( name="origami-ml", - version="0.1.0", - packages=["origami"], - install_requires=[ - "click", + author="Thomas Rueckstiess", + author_email="thomas.rueckstiess@mongodb.com", + description="An ML classifier model to make predictions from semi-structured data.", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/mongodb-labs/origami", + packages=find_packages(), + classifiers=[ + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Scientific/Engineering :: Artificial Intelligence", ], + python_requires=">=3.10", entry_points={ "console_scripts": [ "origami = origami.cli:main", ], }, + install_requires=[ + "click>=8.1.7", + "click-option-group>=0.5.6", + "guildai>=0.9.0", + "lightgbm>=4.5.0", + "matplotlib>=3.9.2", + "mdbrtools>=0.1.1", + "numpy>=1.26.4", + "omegaconf>=2.3.0", + "openml>=0.15.1", + "pandas>=2.2.3", + "pymongo>=4.8.0", + "python-dotenv>=1.0.1", + "scikit_learn>=1.5.2", + "torch>=2.4.1", + "tqdm>=4.66.4", + "xgboost>=2.1.3", + ], + extras_require={ + "dev": [ + "jupyter>=1.1.1", + "jupyter_contrib_nbextensions>=0.7.0", + "pytest>=8.3.3", + "ruff>=0.9.3", + ], + }, )