diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 1eb2a167..bda967c8 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -58,6 +58,7 @@ jobs: run: | python lux/data/upload_car_data.py python lux/data/upload_aug_test_data.py + python lux/data/upload_airbnb_nyc_data.py - name: Lint check with black run: | black --target-version py37 --line-length 105 --check . diff --git a/examples/GeneralDatabase_Executor_Example.py.ipynb b/examples/GeneralDatabase_Executor_Example.py.ipynb new file mode 100644 index 00000000..5fa57ab5 --- /dev/null +++ b/examples/GeneralDatabase_Executor_Example.py.ipynb @@ -0,0 +1,138 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "expected-facility", + "metadata": {}, + "source": [ + "This notebook is an example of how to use the General Database Executor in Lux. This execution backend lets users customize the SQL queries that Lux issues against their database system. Here we show how to switch from a PostgreSQL query template to a MySQL one." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "helpful-liberty", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "97a93a0b783743fab041362d66d72125", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Button(description='Toggle Table/Lux', layout=Layout(bottom='6px', top='6px', width='200px'), style=ButtonStyl…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e216a8adf9584b6e8a3cc5374ae73209", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import sys\n", + "sys.path.insert(1, 'C:\\\\Users\\\\thyne\\\\Documents\\\\GitHub\\\\lux')\n", + "\n", + "import lux\n", + "import psycopg2\n", + "import pandas as pd\n", + "from lux import LuxSQLTable\n", + "\n", + "connection = psycopg2.connect(\"host=localhost user=postgres password=lux dbname=postgres\")\n", + "lux.config.set_SQL_connection(connection)\n", + "lux.config.read_query_template(\"postgres_query_template.txt\")\n", + "lux.config.quoted_queries = True\n", + "\n", + "sql_tbl = LuxSQLTable(table_name='car')\n", + "sql_tbl.intent = [\"Cylinders\"]\n", + "sql_tbl" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "searching-nancy", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a2c12d8447494178aa6c38fc0a4c59f6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Button(description='Toggle Table/Lux', layout=Layout(bottom='6px', top='6px', width='200px'), style=ButtonStyl…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "26b23f594155417e9fb7ff2b4695477c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import sqlalchemy\n", + "import lux\n", + "from sqlalchemy.ext.declarative import declarative_base\n", + "\n", + "engine = sqlalchemy.create_engine('mysql+mysqlconnector://luxuser:lux@localhost:3306/sys',echo=False)\n", + "lux.config.set_SQL_connection(engine)\n", + "lux.config.read_query_template(\"mysql_query_template.txt\")\n", + "lux.config.quoted_queries = False\n", +
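"# the template file holds one 'name:SQL' entry per line (see examples/mysql_query_template.txt);\n", + "# quoted_queries is False here because the MySQL templates leave identifiers unquoted\n", +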
"sql_df = lux.LuxSQLTable(table_name='car')\n", + "\n", + "sql_df.intent = ['Cylinders']\n", + "sql_df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/Lux_Code_Tracing.ipynb b/examples/Lux_Code_Tracing.ipynb new file mode 100644 index 00000000..c8b0fc23 --- /dev/null +++ b/examples/Lux_Code_Tracing.ipynb @@ -0,0 +1,733 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "experienced-selling", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(1, 'C:\\\\Users\\\\thyne\\\\Documents\\\\GitHub\\\\lux')\n", + "\n", + "import lux\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "neutral-subscriber", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can 
convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be 
temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a 
temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", 
+ "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these 
attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's 
automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use 
for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n", + "C:\\Users\\thyne\\Documents\\GitHub\\lux\\lux\\executor\\PandasExecutor.py:372: UserWarning:\n", + "Lux detects that the attribute 'Year' may be temporal.\n", + "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\n", + "For example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n", + "\n", + "Here is a starter template that you can use for converting the temporal fields:\n", + "\tdf['Year'] = pd.to_datetime(df['Year'], format='')\n", + "\n", + "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n", + "If Year is not a temporal attribute, please use override Lux's automatically detected type:\n", + "\tdf.set_data_type({'Year':'quantitative'})\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ead29ee7d5f44de6b3fc405f349f0273", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5c7b1a8091d74855bedc3ac79e92f22b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "url = \"https://github.com/lux-org/lux-datasets/blob/master/data/car.csv?raw=true\"\n", + "my_df = pd.read_csv(url)\n", + "my_df.intent = ['Weight', 'Origin']\n", + "my_df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "voluntary-emphasis", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "from lux.utils import utils\n", + "from lux.executor.PandasExecutor import PandasExecutor\n", + "import pandas\n", + "import math\n", + "ldf = 'insert your LuxDataFrame variable here'\n", + "vis = 'insert the name of your Vis object here'\n", + "vis._vis_data = ldf\n", + "SAMPLE_FLAG = lux.config.sampling\n", + "SAMPLE_START = lux.config.sampling_start\n", + "SAMPLE_CAP = lux.config.sampling_cap\n", + "SAMPLE_FRAC = 0.75\n", + "ldf._sampled = ldf\n", + "assert (vis.data is not None), \"execute_filter assumes input vis.data is populated (if not, populate with LuxDataFrame values)\"\n", + "filters = utils.get_filter_specs(vis._inferred_intent)\n", + "import numpy as np\n", + "x_attr = vis.get_attr_by_channel(\"x\")[0]\n", + "y_attr = vis.get_attr_by_channel(\"y\")[0]\n", + "has_color = False\n", + "groupby_attr = \"\"\n", + "measure_attr = \"\"\n", + "attr_unique_vals = []\n", + "groupby_attr = y_attr\n", + "measure_attr = x_attr\n", + "agg_func = x_attr.aggregation\n", + "attr_unique_vals = vis.data.unique_values.get(groupby_attr.attribute)\n", + "color_cardinality = 1\n", + " index_name = vis.data.index.name\n", + " index_name = \"index\"\n", + " vis._vis_data = vis.data.reset_index()\n", + " vis._vis_data = 
(vis.data.groupby(groupby_attr.attribute, dropna=False, history=False).count().reset_index().rename(columns={index_name: \"Record\"}))\n", + " vis._vis_data = vis.data[[groupby_attr.attribute, \"Record\"]]\n", + "result_vals = list(vis.data[groupby_attr.attribute])\n", + "vis._vis_data = vis._vis_data.dropna(subset=[measure_attr.attribute])\n", + " vis._vis_data = vis._vis_data.sort_values(by=groupby_attr.attribute, ascending=True)\n", + "vis._vis_data = vis._vis_data.reset_index()\n", + "vis._vis_data = vis._vis_data.drop(columns=\"index\")\n", + "\n", + "vis\n" + ] + } + ], + "source": [ + "my_vis = my_df.recommendation['Generalize'][0]\n", + "print(my_vis.to_code(language = \"python\"))" + ] + }, + { + "cell_type": "markdown", + "id": "protecting-transcript", + "metadata": {}, + "source": [ + "Once Lux has given us the code used to generate a particular Vis, we can copy and paste the code into a new Jupyter notebook cell. Before running the cell, be sure to populate the `ldf` and `vis` variables with the names of your original LuxDataFrame/LuxSQLTable and Vis objects." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "surprising-dutch", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dbcf9b127b23497992ea6200e673a5f0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "LuxWidget(current_vis={'config': {'view': {'continuousWidth': 400, 'continuousHeight': 300}, 'axis': {'labelCo…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from lux.utils import utils\n", + "from lux.executor.PandasExecutor import PandasExecutor\n", + "import pandas\n", + "import math\n", + "ldf = my_df\n", + "vis = my_vis\n", + "vis._vis_data = ldf\n", + "SAMPLE_FLAG = lux.config.sampling\n", + "SAMPLE_START = lux.config.sampling_start\n", + "SAMPLE_CAP = lux.config.sampling_cap\n", + "SAMPLE_FRAC = 0.75\n", + "ldf._sampled = ldf\n", + "assert (vis.data is not None), \"execute_filter assumes input vis.data is populated (if not, populate with LuxDataFrame values)\"\n", + "filters = utils.get_filter_specs(vis._inferred_intent)\n", + "import numpy as np\n", + "x_attr = vis.get_attr_by_channel(\"x\")[0]\n", + "y_attr = vis.get_attr_by_channel(\"y\")[0]\n", + "has_color = False\n", + "groupby_attr = \"\"\n", + "measure_attr = \"\"\n", + "attr_unique_vals = []\n", + "groupby_attr = y_attr\n", + "measure_attr = x_attr\n", + "agg_func = x_attr.aggregation\n", + "attr_unique_vals = vis.data.unique_values.get(groupby_attr.attribute)\n", + "color_cardinality = 1\n", + "index_name = vis.data.index.name\n", + "index_name = \"index\"\n", + "vis._vis_data = vis.data.reset_index()\n", + "vis._vis_data = (vis.data.groupby(groupby_attr.attribute, dropna=False, history=False).count().reset_index().rename(columns={index_name: \"Record\"}))\n", + "vis._vis_data = vis.data[[groupby_attr.attribute, \"Record\"]]\n", + "result_vals = list(vis.data[groupby_attr.attribute])\n", + "vis._vis_data = vis._vis_data.dropna(subset=[measure_attr.attribute])\n", + "vis._vis_data = vis._vis_data.sort_values(by=groupby_attr.attribute, ascending=True)\n", + "vis._vis_data = vis._vis_data.reset_index()\n", + "vis._vis_data = vis._vis_data.drop(columns=\"index\")\n", + "\n", + "vis" + ] + }, + { + "cell_type": "markdown", + "id": "vulnerable-trade", + "metadata": {}, + "source": [ + "The code tracing also works when using the SQLExecutor. 
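As with the Pandas example above, the printed trace is a flat sequence of executor statements, so any stray indentation carried over from the executor's control flow should be cleaned up (as in the pasted cells) before the code will run. 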
You can also access the specific SQL query used by the executor by specifying `language = 'SQL'` in the `to_code()` function." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "unsigned-balance", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a3cf748eae1649be8f6a43a5f6365699", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Button(description='Toggle Table/Lux', layout=Layout(bottom='6px', top='6px', width='200px'), style=ButtonStyl…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dbbcf0f06b0445b5860f037d8a093ee4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import psycopg2\n", + "from lux import LuxSQLTable\n", + "\n", + "connection = psycopg2.connect(\"host=localhost user=postgres password=lux dbname=postgres\")\n", + "lux.config.set_SQL_connection(connection)\n", + "lux.config.set_executor_type(\"SQL\")\n", + "\n", + "sql_tbl = LuxSQLTable(table_name='car')\n", + "sql_tbl.intent = [\"Cylinders\"]\n", + "sql_tbl" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "identified-replica", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'SQLExecutor'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lux.config.executor.name" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "typical-exemption", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT \"Year\", \"Cylinders\", COUNT(\"Year\") FROM car WHERE \"Year\" IS NOT NULL AND \"Cylinders\" IS NOT NULL GROUP BY \"Year\", \"Cylinders\"\n" + ] + } + ], + "source": [ + "my_vis = sql_tbl.recommendation['Enhance'][0]\n", + "print(my_vis.to_code(language = \"SQL\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "likely-choice", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "from lux.utils import utils\n", + "from lux.executor.SQLExecutor import SQLExecutor\n", + "import pandas\n", + "import math\n", + "tbl = 'insert your LuxSQLTable variable here'\n", + "view = 'insert the name of your Vis object here'\n", + "x_attr = view.get_attr_by_channel(\"x\")[0]\n", + "y_attr = view.get_attr_by_channel(\"y\")[0]\n", + "has_color = False\n", + "groupby_attr = \"\"\n", + "measure_attr = \"\"\n", + "groupby_attr = x_attr\n", + "measure_attr = y_attr\n", + "agg_func = y_attr.aggregation\n", + "attr_unique_vals = tbl.unique_values[groupby_attr.attribute]\n", + "color_attr = view.get_attr_by_channel(\"color\")[0]\n", + "color_attr_vals = tbl.unique_values[color_attr.attribute]\n", + "color_cardinality = len(color_attr_vals)\n", + "has_color = True\n", + "    where_clause, filterVars = SQLExecutor.execute_filter(view)\n", + "filters = utils.get_filter_specs(view._inferred_intent)\n", + "    length_query = pandas.read_sql(\"SELECT COUNT(*) as length FROM {} {}\".format(tbl.table_name, where_clause),lux.config.SQLconnection,)\n", + "    count_query = 'SELECT \"{}\", \"{}\", COUNT(\"{}\") FROM {} {} GROUP BY \"{}\", \"{}\"'.format(groupby_attr.attribute,color_attr.attribute,groupby_attr.attribute,tbl.table_name,where_clause,groupby_attr.attribute,color_attr.attribute,)\n", + "    view._vis_data = 
pandas.read_sql(count_query, lux.config.SQLconnection)\n", + " view._vis_data = view._vis_data.rename(columns={\"count\": \"Record\"})\n", + " view._vis_data = utils.pandas_to_lux(view._vis_data)\n", + " view._query = count_query\n", + "result_vals = list(view._vis_data[groupby_attr.attribute])\n", + " res_color_combi_vals = []\n", + " result_color_vals = list(view._vis_data[color_attr.attribute])\n", + " for i in range(0, len(result_vals)):\n", + " res_color_combi_vals.append([result_vals[i], result_color_vals[i]])\n", + " N_unique_vals = len(attr_unique_vals)\n", + " columns = view._vis_data.columns\n", + " df = pandas.DataFrame({columns[0]: attr_unique_vals * color_cardinality,columns[1]: pandas.Series(color_attr_vals).repeat(N_unique_vals),})\n", + " view._vis_data = view._vis_data.merge(df,on=[columns[0], columns[1]],how=\"right\",suffixes=[\"\", \"_right\"],)\n", + " for col in columns[2:]:\n", + " view._vis_data[col] = view._vis_data[col].fillna(0) # Triggers __setitem__\n", + " assert len(list(view._vis_data[groupby_attr.attribute])) == N_unique_vals * len(color_attr_vals), f\"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`.\"\n", + " view._vis_data = view._vis_data.iloc[:, :3] # Keep only the three relevant columns not the *_right columns resulting from merge\n", + "view._vis_data = view._vis_data.sort_values(by=groupby_attr.attribute, ascending=True)\n", + "view._vis_data = view._vis_data.reset_index()\n", + "view._vis_data = view._vis_data.drop(columns=\"index\")\n", + "\n", + "view\n" + ] + } + ], + "source": [ + "print(my_vis.to_code(language = \"python\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "running-laser", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a901329aa3dc46d4a642b7c6838c2879", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "LuxWidget(current_vis={'config': {'view': {'continuousWidth': 400, 'continuousHeight': 300}, 'axis': {'labelCo…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from lux.utils import utils\n", + "from lux.executor.SQLExecutor import SQLExecutor\n", + "import pandas\n", + "import math\n", + "tbl = sql_tbl\n", + "view = my_vis\n", + "x_attr = view.get_attr_by_channel(\"x\")[0]\n", + "y_attr = view.get_attr_by_channel(\"y\")[0]\n", + "has_color = False\n", + "groupby_attr = \"\"\n", + "measure_attr = \"\"\n", + "groupby_attr = x_attr\n", + "measure_attr = y_attr\n", + "agg_func = y_attr.aggregation\n", + "attr_unique_vals = tbl.unique_values[groupby_attr.attribute]\n", + "color_attr = view.get_attr_by_channel(\"color\")[0]\n", + "color_attr_vals = tbl.unique_values[color_attr.attribute]\n", + "color_cardinality = len(color_attr_vals)\n", + "has_color = True\n", + "where_clause, filterVars = SQLExecutor.execute_filter(view)\n", + "filters = utils.get_filter_specs(view._inferred_intent)\n", + "length_query = pandas.read_sql(\"SELECT COUNT(*) as length FROM {} {}\".format(tbl.table_name, where_clause),lux.config.SQLconnection,)\n", + "count_query = 'SELECT \"{}\", \"{}\", COUNT(\"{}\") FROM {} {} GROUP BY \"{}\", \"{}\"'.format(groupby_attr.attribute,color_attr.attribute,groupby_attr.attribute,tbl.table_name,where_clause,groupby_attr.attribute,color_attr.attribute,)\n", + "view._vis_data = pandas.read_sql(count_query, lux.config.SQLconnection)\n", + "view._vis_data = view._vis_data.rename(columns={\"count\": \"Record\"})\n", + 
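"# convert the raw pandas result back into a LuxDataFrame so Lux can render it\n", +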
"view._vis_data = utils.pandas_to_lux(view._vis_data)\n", + "view._query = count_query\n", + "result_vals = list(view._vis_data[groupby_attr.attribute])\n", + "res_color_combi_vals = []\n", + "result_color_vals = list(view._vis_data[color_attr.attribute])\n", + "for i in range(0, len(result_vals)):\n", + " res_color_combi_vals.append([result_vals[i], result_color_vals[i]])\n", + "N_unique_vals = len(attr_unique_vals)\n", + "columns = view._vis_data.columns\n", + "df = pandas.DataFrame({columns[0]: attr_unique_vals * color_cardinality,columns[1]: pandas.Series(color_attr_vals).repeat(N_unique_vals),})\n", + "view._vis_data = view._vis_data.merge(df,on=[columns[0], columns[1]],how=\"right\",suffixes=[\"\", \"_right\"],)\n", + "for col in columns[2:]:\n", + " view._vis_data[col] = view._vis_data[col].fillna(0) # Triggers __setitem__\n", + "assert len(list(view._vis_data[groupby_attr.attribute])) == N_unique_vals * len(color_attr_vals), f\"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`.\"\n", + "view._vis_data = view._vis_data.iloc[:, :3] # Keep only the three relevant columns not the *_right columns resulting from merge\n", + "view._vis_data = view._vis_data.sort_values(by=groupby_attr.attribute, ascending=True)\n", + "view._vis_data = view._vis_data.reset_index()\n", + "view._vis_data = view._vis_data.drop(columns=\"index\")\n", + "\n", + "view" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/mysql_query_template.txt b/examples/mysql_query_template.txt new file mode 100644 index 00000000..b70be61e --- /dev/null +++ b/examples/mysql_query_template.txt @@ -0,0 +1,19 @@ +preview_query:SELECT * from {table_name} LIMIT {num_rows} +length_query:SELECT COUNT(*) as length FROM {table_name} {where_clause} +sample_query:SELECT * FROM {table_name} {where_clause} LIMIT {num_rows} +scatter_query:SELECT {columns} FROM {table_name} {where_clause} +colored_barchart_counts:SELECT {groupby_attr}, {color_attr}, COUNT({groupby_attr}) as count FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +colored_barchart_average:SELECT {groupby_attr}, {color_attr}, AVG({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +colored_barchart_sum:SELECT {groupby_attr}, {color_attr}, SUM({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +colored_barchart_max:SELECT {groupby_attr}, {color_attr}, MAX({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +barchart_counts:SELECT {groupby_attr}, COUNT({groupby_attr}) as count FROM {table_name} {where_clause} GROUP BY {groupby_attr} +barchart_average:SELECT {groupby_attr}, AVG({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr} +barchart_sum:SELECT {groupby_attr}, SUM({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr} +barchart_max:SELECT {groupby_attr}, MAX({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr} 
+histogram_counts:SELECT width_bucket, count(width_bucket) as count from (SELECT ({bucket_cases}) as width_bucket from {table_name} {where_clause}) as buckets GROUP BY width_bucket order by width_bucket +heatmap_counts:SELECT width_bucket1, width_bucket2, count(*) as count FROM (SELECT ({bucket_cases1}) as width_bucket1, ({bucket_cases2}) as width_bucket2 FROM {table_name} {where_clause}) as labeled_data GROUP BY width_bucket1, width_bucket2 +table_attributes_query:SELECT COLUMN_NAME as column_name FROM INFORMATION_SCHEMA.COLUMNS where TABLE_NAME = '{table_name}' +min_max_query:SELECT MIN({attribute}) as min, MAX({attribute}) as max FROM {table_name} +cardinality_query:SELECT COUNT(Distinct({attribute})) as count FROM {table_name} WHERE {attribute} IS NOT NULL +unique_query:SELECT Distinct({attribute}) FROM {table_name} WHERE {attribute} IS NOT NULL +datatype_query:SELECT DATA_TYPE as data_type FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{table_name}' AND COLUMN_NAME = '{attribute}' \ No newline at end of file diff --git a/examples/postgres_query_template.txt b/examples/postgres_query_template.txt new file mode 100644 index 00000000..f004a169 --- /dev/null +++ b/examples/postgres_query_template.txt @@ -0,0 +1,19 @@ +preview_query:SELECT * from {table_name} LIMIT {num_rows} +length_query:SELECT COUNT(1) as length FROM {table_name} {where_clause} +sample_query:SELECT * FROM {table_name} {where_clause} ORDER BY random() LIMIT {num_rows} +scatter_query:SELECT {columns} FROM {table_name} {where_clause} +colored_barchart_counts:SELECT "{groupby_attr}", "{color_attr}", COUNT("{groupby_attr}") FROM {table_name} {where_clause} GROUP BY "{groupby_attr}", "{color_attr}" +colored_barchart_average:SELECT "{groupby_attr}", "{color_attr}", AVG("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}", "{color_attr}" +colored_barchart_sum:SELECT "{groupby_attr}", "{color_attr}", SUM("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}", "{color_attr}" +colored_barchart_max:SELECT "{groupby_attr}", "{color_attr}", MAX("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}", "{color_attr}" +barchart_counts:SELECT "{groupby_attr}", COUNT("{groupby_attr}") FROM {table_name} {where_clause} GROUP BY "{groupby_attr}" +barchart_average:SELECT "{groupby_attr}", AVG("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}" +barchart_sum:SELECT "{groupby_attr}", SUM("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}" +barchart_max:SELECT "{groupby_attr}", MAX("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}" +histogram_counts:SELECT width_bucket, COUNT(width_bucket) FROM (SELECT width_bucket(CAST ("{bin_attribute}" AS FLOAT), '{upper_edges}') FROM {table_name} {where_clause}) as Buckets GROUP BY width_bucket ORDER BY width_bucket +heatmap_counts:SELECT width_bucket1, width_bucket2, count(*) FROM (SELECT width_bucket(CAST ("{x_attribute}" AS FLOAT), '{x_upper_edges_string}') as width_bucket1, width_bucket(CAST ("{y_attribute}" AS FLOAT), '{y_upper_edges_string}') as width_bucket2 FROM {table_name} {where_clause}) as foo GROUP BY width_bucket1, width_bucket2 +table_attributes_query:SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS where TABLE_NAME = '{table_name}' +min_max_query:SELECT MIN("{attribute}") as min, MAX("{attribute}") as max FROM {table_name} 
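+# attributes are double-quoted so that mixed-case PostgreSQL identifiers resolve correctly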
+cardinality_query:SELECT Count(Distinct("{attribute}")) FROM {table_name} WHERE "{attribute}" IS NOT NULL +unique_query:SELECT Distinct("{attribute}") FROM {table_name} WHERE "{attribute}" IS NOT NULL +datatype_query:SELECT DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{table_name}' AND COLUMN_NAME = '{attribute}' \ No newline at end of file diff --git a/examples/query_template.txt b/examples/query_template.txt new file mode 100644 index 00000000..c04c746a --- /dev/null +++ b/examples/query_template.txt @@ -0,0 +1,23 @@ +############################################################################## +######################### Example Query Template ######################### +# For details on the query functions, see: https://readthedocs... ## +############################################################################## +preview_query: +length_query: +sample_query: +scatter_query: +colored_barchart_counts:SELECT {groupby_attr}, {color_attr}, COUNT({groupby_attr}) as count FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +colored_barchart_average:SELECT {groupby_attr}, {color_attr}, AVG({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +colored_barchart_sum:SELECT {groupby_attr}, {color_attr}, SUM({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +colored_barchart_max:SELECT {groupby_attr}, {color_attr}, MAX({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +barchart_counts:SELECT {groupby_attr}, COUNT({groupby_attr}) as count FROM {table_name} {where_clause} GROUP BY {groupby_attr} +barchart_average:SELECT {groupby_attr}, AVG({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr} +barchart_sum:SELECT {groupby_attr}, SUM({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr} +barchart_max:SELECT {groupby_attr}, MAX({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr} +histogram_counts:SELECT width_bucket, count(width_bucket) as count from (SELECT ({bucket_cases}) as width_bucket from {table_name} {where_clause}) as buckets GROUP BY width_bucket order by width_bucket +heatmap_counts:SELECT width_bucket1, width_bucket2, count(*) as count FROM (SELECT ({bucket_cases1}) as width_bucket1, ({bucket_cases2}) as width_bucket2 FROM {table_name} {where_clause}) as labeled_data GROUP BY width_bucket1, width_bucket2 +table_attributes_query:SELECT COLUMN_NAME as column_name FROM INFORMATION_SCHEMA.COLUMNS where TABLE_NAME = '{table_name}' +min_max_query:SELECT MIN({attribute}) as min, MAX({attribute}) as max FROM {table_name} +cardinality_query:SELECT COUNT(Distinct({attribute})) as count FROM {table_name} WHERE {attribute} IS NOT NULL +unique_query:SELECT Distinct({attribute}) FROM {table_name} WHERE {attribute} IS NOT NULL +datatype_query:SELECT DATA_TYPE as data_type FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{table_name}' AND COLUMN_NAME = '{attribute}' \ No newline at end of file diff --git a/lux/__init__.py b/lux/__init__.py index 7d865410..aef0cc87 100644 --- a/lux/__init__.py +++ b/lux/__init__.py @@ -16,6 +16,8 @@ from lux.vis.Clause import Clause from lux.core.frame import LuxDataFrame from lux.core.sqltable import LuxSQLTable +from lux.core.joinedsqltable import JoinedSQLTable +from lux.utils.tracing_utils import LuxTracer from ._version import __version__, version_info from lux._config import config from lux._config.config import warning_format
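For context, here is a minimal sketch of how the new read_query_template hook (added to config.py below) can be driven; the connection string and file path are placeholders rather than part of the patch:

```python
import lux
import psycopg2

# hypothetical local database; substitute your own credentials
connection = psycopg2.connect("host=localhost user=postgres password=lux dbname=postgres")
# setting a connection selects the SQL executor and loads the built-in PostgreSQL template
lux.config.set_SQL_connection(connection)

# swap in another dialect from a template file on disk...
lux.config.read_query_template("examples/mysql_query_template.txt")

# ...or pass the template text directly as a newline-separated string
template_text = "\n".join(
    [
        "length_query:SELECT COUNT(*) as length FROM {table_name} {where_clause}",
        "preview_query:SELECT * from {table_name} LIMIT {num_rows}",
    ]
)
lux.config.read_query_template(template_text)
```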
diff --git a/lux/_config/config.py b/lux/_config/config.py index 3ec8ffcb..4998a881 100644 --- a/lux/_config/config.py +++ b/lux/_config/config.py @@ -6,6 +6,9 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Union import lux import warnings +from lux.utils.tracing_utils import LuxTracer +import os +from lux._config.template import postgres_template, mysql_template RegisteredOption = namedtuple("RegisteredOption", "name action display_condition args") @@ -32,6 +35,10 @@ def __init__(self): self._pandas_fallback = True self._interestingness_fallback = True self.heatmap_bin_size = 40 + self.tracer_relevant_lines = [] + self.tracer = LuxTracer() + self.query_templates = {} + self.handle_quotes = True ##################################### #### Optimization Configurations #### ##################################### @@ -39,6 +46,7 @@ def __init__(self): self._sampling_cap = 1000000 self._sampling_flag = True self._heatmap_flag = True + self._heatmap_start = 5000 self.lazy_maintain = True self.early_pruning = True self.early_pruning_sample_cap = 30000 @@ -377,11 +385,28 @@ def set_SQL_connection(self, connection): self.set_executor_type("SQL") self.SQLconnection = connection + def read_query_template(self, query_template): + from lux.executor.SQLExecutor import SQLExecutor + + query_dict = {} + # accept either raw template text or a path to a template file + if type(query_template) is str and "\n" in query_template: + lines = query_template.split("\n") + else: + with open(query_template) as f: + lines = f.readlines() + for line in lines: + line = line.strip() + # skip blank lines and '#' comment banners + if line == "" or line.startswith("#"): + continue + # split on the first ':' only so colons inside the SQL are preserved + (key, val) = line.split(":", 1) + query_dict[key] = val.strip() + self.query_templates = query_dict + self.executor = SQLExecutor() + def set_executor_type(self, exe): if exe == "SQL": from lux.executor.SQLExecutor import SQLExecutor self.executor = SQLExecutor() + self.read_query_template(postgres_template) elif exe == "Pandas": from lux.executor.PandasExecutor import PandasExecutor diff --git a/lux/_config/mysql_query_template.txt b/lux/_config/mysql_query_template.txt new file mode 100644 index 00000000..b70be61e --- /dev/null +++ b/lux/_config/mysql_query_template.txt @@ -0,0 +1,19 @@ +preview_query:SELECT * from {table_name} LIMIT {num_rows} +length_query:SELECT COUNT(*) as length FROM {table_name} {where_clause} +sample_query:SELECT * FROM {table_name} {where_clause} LIMIT {num_rows} +scatter_query:SELECT {columns} FROM {table_name} {where_clause} +colored_barchart_counts:SELECT {groupby_attr}, {color_attr}, COUNT({groupby_attr}) as count FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +colored_barchart_average:SELECT {groupby_attr}, {color_attr}, AVG({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +colored_barchart_sum:SELECT {groupby_attr}, {color_attr}, SUM({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +colored_barchart_max:SELECT {groupby_attr}, {color_attr}, MAX({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +barchart_counts:SELECT {groupby_attr}, COUNT({groupby_attr}) as count FROM {table_name} {where_clause} GROUP BY {groupby_attr} +barchart_average:SELECT {groupby_attr}, AVG({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr} +barchart_sum:SELECT {groupby_attr}, SUM({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr} +barchart_max:SELECT {groupby_attr}, 
MAX({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr} +histogram_counts:SELECT width_bucket, count(width_bucket) as count from (SELECT ({bucket_cases}) as width_bucket from {table_name} {where_clause}) as buckets GROUP BY width_bucket order by width_bucket +heatmap_counts:SELECT width_bucket1, width_bucket2, count(*) as count FROM (SELECT ({bucket_cases1}) as width_bucket1, ({bucket_cases2}) as width_bucket2 FROM {table_name} {where_clause}) as labeled_data GROUP BY width_bucket1, width_bucket2 +table_attributes_query:SELECT COLUMN_NAME as column_name FROM INFORMATION_SCHEMA.COLUMNS where TABLE_NAME = '{table_name}' +min_max_query:SELECT MIN({attribute}) as min, MAX({attribute}) as max FROM {table_name} +cardinality_query:SELECT COUNT(Distinct({attribute})) as count FROM {table_name} WHERE {attribute} IS NOT NULL +unique_query:SELECT Distinct({attribute}) FROM {table_name} WHERE {attribute} IS NOT NULL +datatype_query:SELECT DATA_TYPE as data_type FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{table_name}' AND COLUMN_NAME = '{attribute}' \ No newline at end of file diff --git a/lux/_config/postgres_query_template.txt b/lux/_config/postgres_query_template.txt new file mode 100644 index 00000000..f004a169 --- /dev/null +++ b/lux/_config/postgres_query_template.txt @@ -0,0 +1,19 @@ +preview_query:SELECT * from {table_name} LIMIT {num_rows} +length_query:SELECT COUNT(1) as length FROM {table_name} {where_clause} +sample_query:SELECT * FROM {table_name} {where_clause} ORDER BY random() LIMIT {num_rows} +scatter_query:SELECT {columns} FROM {table_name} {where_clause} +colored_barchart_counts:SELECT "{groupby_attr}", "{color_attr}", COUNT("{groupby_attr}") FROM {table_name} {where_clause} GROUP BY "{groupby_attr}", "{color_attr}" +colored_barchart_average:SELECT "{groupby_attr}", "{color_attr}", AVG("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}", "{color_attr}" +colored_barchart_sum:SELECT "{groupby_attr}", "{color_attr}", SUM("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}", "{color_attr}" +colored_barchart_max:SELECT "{groupby_attr}", "{color_attr}", MAX("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}", "{color_attr}" +barchart_counts:SELECT "{groupby_attr}", COUNT("{groupby_attr}") FROM {table_name} {where_clause} GROUP BY "{groupby_attr}" +barchart_average:SELECT "{groupby_attr}", AVG("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}" +barchart_sum:SELECT "{groupby_attr}", SUM("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}" +barchart_max:SELECT "{groupby_attr}", MAX("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}" +histogram_counts:SELECT width_bucket, COUNT(width_bucket) FROM (SELECT width_bucket(CAST ("{bin_attribute}" AS FLOAT), '{upper_edges}') FROM {table_name} {where_clause}) as Buckets GROUP BY width_bucket ORDER BY width_bucket +heatmap_counts:SELECT width_bucket1, width_bucket2, count(*) FROM (SELECT width_bucket(CAST ("{x_attribute}" AS FLOAT), '{x_upper_edges_string}') as width_bucket1, width_bucket(CAST ("{y_attribute}" AS FLOAT), '{y_upper_edges_string}') as width_bucket2 FROM {table_name} {where_clause}) as foo GROUP BY width_bucket1, width_bucket2 +table_attributes_query:SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS where TABLE_NAME = 
'{table_name}' +min_max_query:SELECT MIN("{attribute}") as min, MAX("{attribute}") as max FROM {table_name} +cardinality_query:SELECT Count(Distinct("{attribute}")) FROM {table_name} WHERE "{attribute}" IS NOT NULL +unique_query:SELECT Distinct("{attribute}") FROM {table_name} WHERE "{attribute}" IS NOT NULL +datatype_query:SELECT DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{table_name}' AND COLUMN_NAME = '{attribute}' \ No newline at end of file diff --git a/lux/_config/template.py b/lux/_config/template.py new file mode 100644 index 00000000..98068144 --- /dev/null +++ b/lux/_config/template.py @@ -0,0 +1,39 @@ +postgres_template = """preview_query:SELECT * from {table_name} LIMIT {num_rows} +length_query:SELECT COUNT(1) as length FROM {table_name} {where_clause} +sample_query:SELECT * FROM {table_name} {where_clause} ORDER BY random() LIMIT {num_rows} +scatter_query:SELECT {columns} FROM {table_name} {where_clause} +colored_barchart_counts:SELECT "{groupby_attr}", "{color_attr}", COUNT("{groupby_attr}") FROM {table_name} {where_clause} GROUP BY "{groupby_attr}", "{color_attr}" +colored_barchart_average:SELECT "{groupby_attr}", "{color_attr}", AVG("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}", "{color_attr}" +colored_barchart_sum:SELECT "{groupby_attr}", "{color_attr}", SUM("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}", "{color_attr}" +colored_barchart_max:SELECT "{groupby_attr}", "{color_attr}", MAX("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}", "{color_attr}" +barchart_counts:SELECT "{groupby_attr}", COUNT("{groupby_attr}") FROM {table_name} {where_clause} GROUP BY "{groupby_attr}" +barchart_average:SELECT "{groupby_attr}", AVG("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}" +barchart_sum:SELECT "{groupby_attr}", SUM("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}" +barchart_max:SELECT "{groupby_attr}", MAX("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}" +histogram_counts:SELECT width_bucket, COUNT(width_bucket) FROM (SELECT width_bucket(CAST ("{bin_attribute}" AS FLOAT), '{upper_edges}') FROM {table_name} {where_clause}) as Buckets GROUP BY width_bucket ORDER BY width_bucket +heatmap_counts:SELECT width_bucket1, width_bucket2, count(*) FROM (SELECT width_bucket(CAST ("{x_attribute}" AS FLOAT), '{x_upper_edges_string}') as width_bucket1, width_bucket(CAST ("{y_attribute}" AS FLOAT), '{y_upper_edges_string}') as width_bucket2 FROM {table_name} {where_clause}) as foo GROUP BY width_bucket1, width_bucket2 +table_attributes_query:SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS where TABLE_NAME = '{table_name}' +min_max_query:SELECT MIN("{attribute}") as min, MAX("{attribute}") as max FROM {table_name} +cardinality_query:SELECT Count(Distinct("{attribute}")) FROM {table_name} WHERE "{attribute}" IS NOT NULL +unique_query:SELECT Distinct("{attribute}") FROM {table_name} WHERE "{attribute}" IS NOT NULL +datatype_query:SELECT DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{table_name}' AND COLUMN_NAME = '{attribute}'""" + +mysql_template = """preview_query:SELECT * from {table_name} LIMIT {num_rows} +length_query:SELECT COUNT(*) as length FROM {table_name} {where_clause} +sample_query:SELECT * FROM {table_name} {where_clause} LIMIT {num_rows} +scatter_query:SELECT 
{columns} FROM {table_name} {where_clause} +colored_barchart_counts:SELECT {groupby_attr}, {color_attr}, COUNT({groupby_attr}) as count FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +colored_barchart_average:SELECT {groupby_attr}, {color_attr}, AVG({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +colored_barchart_sum:SELECT {groupby_attr}, {color_attr}, SUM({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +colored_barchart_max:SELECT {groupby_attr}, {color_attr}, MAX({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr}, {color_attr} +barchart_counts:SELECT {groupby_attr}, COUNT({groupby_attr}) as count FROM {table_name} {where_clause} GROUP BY {groupby_attr} +barchart_average:SELECT {groupby_attr}, AVG({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr} +barchart_sum:SELECT {groupby_attr}, SUM({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr} +barchart_max:SELECT {groupby_attr}, MAX({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr} +histogram_counts:SELECT width_bucket, count(width_bucket) as count from (SELECT ({bucket_cases}) as width_bucket from {table_name} {where_clause}) as buckets GROUP BY width_bucket order by width_bucket +heatmap_counts:SELECT width_bucket1, width_bucket2, count(*) as count FROM (SELECT ({bucket_cases1}) as width_bucket1, ({bucket_cases2}) as width_bucket2 FROM {table_name} {where_clause}) as labeled_data GROUP BY width_bucket1, width_bucket2 +table_attributes_query:SELECT COLUMN_NAME as column_name FROM INFORMATION_SCHEMA.COLUMNS where TABLE_NAME = '{table_name}' +min_max_query:SELECT MIN({attribute}) as min, MAX({attribute}) as max FROM {table_name} +cardinality_query:SELECT COUNT(Distinct({attribute})) as count FROM {table_name} WHERE {attribute} IS NOT NULL +unique_query:SELECT Distinct({attribute}) FROM {table_name} WHERE {attribute} IS NOT NULL +datatype_query:SELECT DATA_TYPE as data_type FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{table_name}' AND COLUMN_NAME = '{attribute}'""" diff --git a/lux/core/frame.py b/lux/core/frame.py index 2a1359ab..e8bf29aa 100644 --- a/lux/core/frame.py +++ b/lux/core/frame.py @@ -76,7 +76,7 @@ def __init__(self, *args, **kw): else: from lux.executor.SQLExecutor import SQLExecutor - lux.config.executor = SQLExecutor() + # lux.config.executor = SQLExecutor() self._sampled = None self._approx_sample = None @@ -127,15 +127,18 @@ def compute_metadata(self) -> None: self._infer_structure() self._metadata_fresh = True - def maintain_metadata(self) -> None: + def maintain_metadata(self): """ Maintain dataset metadata and statistics (Compute only if needed) """ - is_sql_tbl = lux.config.executor.name == "SQLExecutor" + is_sql_tbl = lux.config.executor.name != "PandasExecutor" + if lux.config.SQLconnection != "" and is_sql_tbl: from lux.executor.SQLExecutor import SQLExecutor - lux.config.executor = SQLExecutor() + # lux.config.executor = SQLExecutor() + + # Check that metadata has not yet been computed if lux.config.lazy_maintain: # Check that metadata has not yet been computed if not hasattr(self, "_metadata_fresh") or not self._metadata_fresh: @@ -195,7 +198,8 @@ def _infer_structure(self): # If the dataframe is very small and the index column is not a range index, then it is likely that this is an aggregated data 
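The template files above and the fallback strings in lux/_config/template.py share one format: one key per line, a colon, then the SQL with named str.format placeholders. A minimal standalone sketch of the round trip from template text to an executable query (the two-entry template and the table name below are hypothetical, not part of this diff):

# Hypothetical two-entry template in the same key:SQL format as the files above.
template = (
    "length_query:SELECT COUNT(1) as length FROM {table_name} {where_clause}\n"
    "preview_query:SELECT * from {table_name} LIMIT {num_rows}"
)

# Parse it the way lux.config.read_query_template does: split each line on the
# first colon and strip the SQL value.
query_templates = {}
for line in template.split("\n"):
    key, val = line.split(":", 1)
    query_templates[key] = val.strip()

# At execution time the executor fills in the named placeholders.
print(query_templates["length_query"].format(table_name="car", where_clause="").strip())
# SELECT COUNT(1) as length FROM car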
is_multi_index_flag = self.index.nlevels != 1 not_int_index_flag = not pd.api.types.is_integer_dtype(self.index) - is_sql_tbl = lux.config.executor.name == "SQLExecutor" + + is_sql_tbl = lux.config.executor.name != "PandasExecutor" small_df_flag = len(self) < 100 and is_sql_tbl if self.pre_aggregated == None: diff --git a/lux/core/joinedsqltable.py b/lux/core/joinedsqltable.py new file mode 100644 index 00000000..87bd8b94 --- /dev/null +++ b/lux/core/joinedsqltable.py @@ -0,0 +1,229 @@ +# Copyright 2019-2020 The Lux Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +from lux.core.series import LuxSeries +from lux.vis.Clause import Clause +from lux.vis.Vis import Vis +from lux.vis.VisList import VisList +from lux.history.history import History +from lux.utils.date_utils import is_datetime_series +from lux.utils.message import Message +from lux.utils.utils import check_import_lux_widget +from typing import Dict, Union, List, Callable + +# from lux.executor.Executor import * +import warnings +import traceback +import lux + + +class JoinedSQLTable(lux.LuxSQLTable): + """ + A subclass of lux.LuxSQLTable that houses other variables and functions for generating visual recommendations. Does not support normal pandas functionality. + """ + + # MUST register here for new properties!!
+ _metadata = [ + "_intent", + "_inferred_intent", + "_data_type", + "unique_values", + "cardinality", + "_rec_info", + "_min_max", + "_current_vis", + "_widget", + "_recommendation", + "_prev", + "_history", + "_saved_export", + "_sampled", + "_toggle_pandas_display", + "_message", + "_pandas_only", + "pre_aggregated", + "_type_override", + "joins", + "using_view", + ] + + def __init__(self, *args, joins=[], **kw): + super(JoinedSQLTable, self).__init__(*args, **kw) + from lux.executor.SQLExecutor import SQLExecutor + + lux.config.executor = SQLExecutor() + # self._metadata.joins = [] + tables = self.extract_tables(joins) + if len(tables) > 4: + warnings.warn( + f"\nPlease provide a maximum of 4 (Four) unique tables to ensure optimal performance.", + stacklevel=2, + ) + view_name = self.create_view(tables, joins) + self._length = 0 + if view_name != "": + self.set_SQL_table(view_name) + # self._metadata.using_view = True + warnings.formatwarning = lux.warning_format + + def len(self): + return self._length + + def extract_tables(self, joins): + tables = set() + for condition in joins: + lhs = condition[0 : condition.index("=")].strip() + rhs = condition[condition.index("=") + 1 :].strip() + table1 = lhs[0 : lhs.index(".")].strip() + table2 = rhs[0 : rhs.index(".")].strip() + tables.add(table1) + tables.add(table2) + return tables + + def create_view(self, tables, joins): + import psycopg2 + + dbc = lux.config.SQLconnection.cursor() + import time + + curr_time = str(int(time.time())) + viewname = "lux_view_" + curr_time + table_entry = "" + for idx, table in enumerate(tables, 1): + table_entry += table + if idx < len(tables): + table_entry += ", " + + condition_entry = "" + for idx, join in enumerate(joins, 1): + condition_entry += join + if idx < len(joins): + condition_entry += " AND " + try: + # # s = "CREATE VIEW {} AS SELECT * FROM cars_join cj JOIN cars_power_join cpj using (id)".format(viewname) + s = "CREATE VIEW {} AS SELECT * FROM {} where {}".format( + viewname, table_entry, condition_entry + ) + # lux.config.executor.create_view(self) + dbc.execute(s) + lux.config.SQLconnection.commit() + except Exception as error: + print("Exception : " + str(error)) + viewname = "" + dbc.close() + return viewname + + def _ipython_display_(self): + from IPython.display import HTML, Markdown, display + from IPython.display import clear_output + import ipywidgets as widgets + + try: + if self._pandas_only: + display(self.display_pandas()) + self._pandas_only = False + if not self.index.nlevels >= 2 or self.columns.nlevels >= 2: + self.maintain_metadata() + + if self._intent != [] and (not hasattr(self, "_compiled") or not self._compiled): + from lux.processor.Compiler import Compiler + + self.current_vis = Compiler.compile_intent(self, self._intent) + + if lux.config.default_display == "lux": + self._toggle_pandas_display = False + else: + self._toggle_pandas_display = True + + # df_to_display.maintain_recs() # compute the recommendations (TODO: This can be rendered in another thread in the background to populate self._widget) + self.maintain_recs() + + # Observers(callback_function, listen_to_this_variable) + self._widget.observe(self.remove_deleted_recs, names="deletedIndices") + self._widget.observe(self.set_intent_on_click, names="selectedIntentIndex") + + button = widgets.Button( + description="Toggle Table/Lux", + layout=widgets.Layout(width="200px", top="6px", bottom="6px"), + ) + self.output = widgets.Output() + self._sampled = lux.config.executor.execute_preview(self) + display(button, 
self.output) + + def on_button_clicked(b): + with self.output: + if b: + self._toggle_pandas_display = not self._toggle_pandas_display + clear_output() + + # create connection string to display + connect_str = self.table_name + connection_type = str(type(lux.config.SQLconnection)) + if "psycopg2.extensions.connection" in connection_type: + connection_dsn = lux.config.SQLconnection.get_dsn_parameters() + host_name = connection_dsn["host"] + host_port = connection_dsn["port"] + dbname = connection_dsn["dbname"] + connect_str = host_name + ":" + host_port + "/" + dbname + + elif "sqlalchemy.engine.base.Engine" in connection_type: + db_connection = str(lux.config.SQLconnection) + db_start = db_connection.index("@") + 1 + db_end = len(db_connection) - 1 + connect_str = db_connection[db_start:db_end] + + if self._toggle_pandas_display: + notification = "Here is a preview of the **{}** database table: **{}**".format( + self.table_name, connect_str + ) + display(Markdown(notification), self._sampled.display_pandas()) + else: + # b.layout.display = "none" + display(self._widget) + # b.layout.display = "inline-block" + + button.on_click(on_button_clicked) + on_button_clicked(None) + + except (KeyboardInterrupt, SystemExit): + raise + except Exception: + if lux.config.pandas_fallback: + warnings.warn( + "\nUnexpected error in rendering Lux widget and recommendations. " + "Falling back to Pandas display.\n" + "Please report the following issue on Github: https://github.com/lux-org/lux/issues \n", + stacklevel=2, + ) + warnings.warn(traceback.format_exc()) + display(self.display_pandas()) + else: + raise + + # Overridden Pandas Functions + def head(self, n: int = 5): + return + + def tail(self, n: int = 5): + return + + def info(self, *args, **kwargs): + return + + def describe(self, *args, **kwargs): + return + + def groupby(self, *args, **kwargs): + return diff --git a/lux/core/sqltable.py b/lux/core/sqltable.py index 5535dc14..2e54f8d0 100644 --- a/lux/core/sqltable.py +++ b/lux/core/sqltable.py @@ -61,9 +61,11 @@ class LuxSQLTable(lux.LuxDataFrame): def __init__(self, *args, table_name="", **kw): super(LuxSQLTable, self).__init__(*args, **kw) - from lux.executor.SQLExecutor import SQLExecutor - lux.config.executor = SQLExecutor() + if lux.config.executor.name != "GeneralDatabaseExecutor": + from lux.executor.SQLExecutor import SQLExecutor + + lux.config.executor = SQLExecutor() self._length = 0 self._setup_done = False @@ -97,6 +99,25 @@ def set_SQL_table(self, t_name): stacklevel=2, ) + def maintain_metadata(self): + # Check that metadata has not yet been computed + if not hasattr(self, "_metadata_fresh") or not self._metadata_fresh: + # only compute metadata information if the dataframe is non-empty + lux.config.executor.compute_dataset_metadata(self) + self._infer_structure() + self._metadata_fresh = True + + def expire_metadata(self): + """ + Expire all saved metadata to trigger a recomputation the next time the data is required. 
+ """ + # self._metadata_fresh = False + # self._data_type = None + # self.unique_values = None + # self.cardinality = None + # self._min_max = None + # self.pre_aggregated = None + def _ipython_display_(self): from IPython.display import HTML, Markdown, display from IPython.display import clear_output diff --git a/lux/data/upload_airbnb_nyc_data.py b/lux/data/upload_airbnb_nyc_data.py new file mode 100644 index 00000000..59d5846d --- /dev/null +++ b/lux/data/upload_airbnb_nyc_data.py @@ -0,0 +1,10 @@ +import pandas as pd +import psycopg2 +import csv + +from sqlalchemy import create_engine + + +data = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv") +engine = create_engine("postgresql://postgres:lux@localhost:5432") +data.to_sql(name="airbnb", con=engine, if_exists="replace", index=False) diff --git a/lux/executor/Executor.py b/lux/executor/Executor.py index e954d913..f967def1 100644 --- a/lux/executor/Executor.py +++ b/lux/executor/Executor.py @@ -71,6 +71,10 @@ def compute_stats(self): def compute_data_type(self): return NotImplemented + @staticmethod + def compute_dataset_metadata(ldf): + return NotImplemented + def mapping(self, rmap): group_map = {} if rmap == {}: diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index 9ff2df5b..30dfdf4c 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -22,6 +22,7 @@ from lux.utils.utils import check_import_lux_widget, check_if_id_like, is_numeric_nan_column import warnings import lux +from lux.utils.tracing_utils import LuxTracer class PandasExecutor(Executor): @@ -111,9 +112,11 @@ def execute(vislist: VisList, ldf: LuxDataFrame, approx=False): ------- None """ + PandasExecutor.execute_sampling(ldf) for vis in vislist: # The vis data starts off being original or sampled dataframe + vis._source = ldf vis._vis_data = ldf._sampled # Approximating vis for early pruning if approx: @@ -128,7 +131,7 @@ def execute(vislist: VisList, ldf: LuxDataFrame, approx=False): if clause.attribute != "Record": attributes.add(clause.attribute) # TODO: Add some type of cap size on Nrows ?
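A hedged usage sketch for the JoinedSQLTable class introduced above (the DSN and the table/column names are hypothetical; a live connection must be registered first, because create_view issues CREATE VIEW through it):

import lux
import psycopg2
from lux.core.joinedsqltable import JoinedSQLTable

# Register the database connection before constructing the table;
# create_view needs a live cursor to build the backing view.
connection = psycopg2.connect("host=localhost dbname=mydb user=myuser password=mypassword")
lux.config.set_SQL_connection(connection)

# Each join condition is a "table.column = table.column" string. The constructor
# extracts the unique table names (with a warning above four) and creates a
# temporary lux_view_<timestamp> view joining them, which then backs all
# recommendations for this table.
joined = JoinedSQLTable(joins=["orders.customer_id = customers.id"])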
- vis._vis_data = vis.data[list(attributes)] + vis._vis_data = vis._vis_data[list(attributes)] if vis.mark == "bar" or vis.mark == "line" or vis.mark == "geographical": PandasExecutor.execute_aggregate(vis, isFiltered=filter_executed) @@ -201,33 +204,17 @@ def execute_aggregate(vis: Vis, isFiltered=True): # if color is specified, need to group by groupby_attr and color_attr if has_color: - vis._vis_data = ( - vis.data.groupby( - [groupby_attr.attribute, color_attr.attribute], dropna=False, history=False - ) - .count() - .reset_index() - .rename(columns={index_name: "Record"}) - ) + vis._vis_data = (vis.data.groupby([groupby_attr.attribute, color_attr.attribute], dropna=False, history=False).count().reset_index().rename(columns={index_name: "Record"})) vis._vis_data = vis.data[[groupby_attr.attribute, color_attr.attribute, "Record"]] else: - vis._vis_data = ( - vis.data.groupby(groupby_attr.attribute, dropna=False, history=False) - .count() - .reset_index() - .rename(columns={index_name: "Record"}) - ) + vis._vis_data = (vis.data.groupby(groupby_attr.attribute, dropna=False, history=False).count().reset_index().rename(columns={index_name: "Record"})) vis._vis_data = vis.data[[groupby_attr.attribute, "Record"]] else: # if color is specified, need to group by groupby_attr and color_attr if has_color: - groupby_result = vis.data.groupby( - [groupby_attr.attribute, color_attr.attribute], dropna=False, history=False - ) + groupby_result = vis.data.groupby([groupby_attr.attribute, color_attr.attribute], dropna=False, history=False) else: - groupby_result = vis.data.groupby( - groupby_attr.attribute, dropna=False, history=False - ) + groupby_result = vis.data.groupby(groupby_attr.attribute, dropna=False, history=False) groupby_result = groupby_result.agg(agg_func) intermediate = groupby_result.reset_index() vis._vis_data = intermediate.__finalize__(vis.data) @@ -245,39 +232,22 @@ def execute_aggregate(vis: Vis, isFiltered=True): if len(result_vals) != N_unique_vals * color_cardinality: columns = vis.data.columns if has_color: - df = pd.DataFrame( - { - columns[0]: attr_unique_vals * color_cardinality, - columns[1]: pd.Series(color_attr_vals).repeat(N_unique_vals), - } - ) - vis._vis_data = vis.data.merge( - df, - on=[columns[0], columns[1]], - how="right", - suffixes=["", "_right"], - ) + df = pd.DataFrame({columns[0]: attr_unique_vals * color_cardinality,columns[1]: pd.Series(color_attr_vals).repeat(N_unique_vals),}) + vis._vis_data = vis.data.merge(df,on=[columns[0], columns[1]],how="right",suffixes=["", "_right"],) for col in columns[2:]: - vis.data[col] = vis.data[col].fillna(0) # Triggers __setitem__ - assert len(list(vis.data[groupby_attr.attribute])) == N_unique_vals * len( - color_attr_vals - ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`." - + # Triggers __setitem__ + vis.data[col] = vis.data[col].fillna(0) + assert len(list(vis.data[groupby_attr.attribute])) == N_unique_vals * len(color_attr_vals), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`." 
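The padding idiom above (the SQL executor reuses it further down) re-introduces empty groups after aggregation: a right-merge against the full list of category values brings back the missing rows, and fillna(0) zeroes their measures. A standalone toy illustration (the column and values are hypothetical):

import pandas as pd

# Aggregated result is missing the "Europe" group entirely.
agg = pd.DataFrame({"Origin": ["USA", "Japan"], "Record": [5, 3]})
all_vals = pd.DataFrame({"Origin": ["USA", "Japan", "Europe"]})

# Right-merge re-introduces the missing group with NaN, then fillna zeroes it.
padded = agg.merge(all_vals, on="Origin", how="right", suffixes=["", "_right"])
padded["Record"] = padded["Record"].fillna(0)
print(padded)  # Europe now appears with Record == 0.0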
# Keep only the three relevant columns not the *_right columns resulting from merge - vis._vis_data = vis.data.iloc[:, :3] + vis._vis_data = vis.data[[groupby_attr.attribute, color_attr.attribute, measure_attr.attribute]] else: df = pd.DataFrame({columns[0]: attr_unique_vals}) - - vis._vis_data = vis.data.merge( - df, on=columns[0], how="right", suffixes=["", "_right"] - ) + vis._vis_data = vis.data.merge(df, on=columns[0], how="right", suffixes=["", "_right"]) for col in columns[1:]: vis.data[col] = vis.data[col].fillna(0) - assert ( - len(list(vis.data[groupby_attr.attribute])) == N_unique_vals - ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`." + assert (len(list(vis.data[groupby_attr.attribute])) == N_unique_vals), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`." vis._vis_data = vis._vis_data.dropna(subset=[measure_attr.attribute]) try: @@ -311,7 +281,7 @@ def execute_binning(ldf: LuxDataFrame, vis: Vis): """ import numpy as np - bin_attribute = list(filter(lambda x: x.bin_size != 0, vis._inferred_intent))[0] + bin_attribute = [x for x in vis._inferred_intent if x.bin_size != 0][0] bin_attr = bin_attribute.attribute series = vis.data[bin_attr] @@ -344,17 +314,13 @@ def execute_filter(vis: Vis) -> bool: bool Boolean flag indicating if any filter was applied """ - assert ( - vis.data is not None - ), "execute_filter assumes input vis.data is populated (if not, populate with LuxDataFrame values)" + assert (vis.data is not None), "execute_filter assumes input vis.data is populated (if not, populate with LuxDataFrame values)" filters = utils.get_filter_specs(vis._inferred_intent) if filters: # TODO: Need to handle OR logic for filter in filters: - vis._vis_data = PandasExecutor.apply_filter( - vis.data, filter.attribute, filter.filter_op, filter.value - ) + vis._vis_data = PandasExecutor.apply_filter(vis.data, filter.attribute, filter.filter_op, filter.value) return True else: return False @@ -428,16 +394,10 @@ def execute_2D_binning(vis: Vis) -> None: if color_attr.data_type == "nominal": # Compute mode and count. Mode aggregates each cell by taking the majority vote for the category variable. 
In cases where there are ties across categories, pick the first item (.iat[0]) result = groups.agg( - [ - ("count", "count"), - (color_attr.attribute, lambda x: pd.Series.mode(x).iat[0]), - ] - ).reset_index() + [("count", "count"),(color_attr.attribute, lambda x: pd.Series.mode(x).iat[0]),]).reset_index() elif color_attr.data_type == "quantitative" or color_attr.data_type == "temporal": # Compute the average of all values in the bin - result = groups.agg( - [("count", "count"), (color_attr.attribute, "mean")] - ).reset_index() + result = groups.agg([("count", "count"), (color_attr.attribute, "mean")]).reset_index() result = result.dropna() else: groups = vis._vis_data.groupby(["xBin", "yBin"], history=False)[x_attr] @@ -595,13 -555,8 @@ def compute_stats(self, ldf: LuxDataFrame): ldf.unique_values[attribute_repr] = list(ldf[attribute].unique()) ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute_repr]) - if pd.api.types.is_float_dtype(ldf.dtypes[attribute]) or pd.api.types.is_integer_dtype( - ldf.dtypes[attribute] - ): - ldf._min_max[attribute_repr] = ( - ldf[attribute].min(), - ldf[attribute].max(), - ) + if pd.api.types.is_float_dtype(ldf.dtypes[attribute]) or pd.api.types.is_integer_dtype(ldf.dtypes[attribute]): + ldf._min_max[attribute_repr] = (ldf[attribute].min(),ldf[attribute].max(),) if not pd.api.types.is_integer_dtype(ldf.index): index_column_name = ldf.index.name diff --git a/lux/executor/SQLExecutor.py b/lux/executor/SQLExecutor.py index c8917c69..452460d1 100644 --- a/lux/executor/SQLExecutor.py +++ b/lux/executor/SQLExecutor.py @@ -26,9 +26,8 @@ def __repr__(self): @staticmethod def execute_preview(tbl: LuxSQLTable, preview_size=5): - output = pandas.read_sql( - "SELECT * from {} LIMIT {}".format(tbl.table_name, preview_size), lux.config.SQLconnection - ) + preview_query = lux.config.query_templates['preview_query'] + output = pandas.read_sql(preview_query.format(table_name = tbl.table_name, num_rows = preview_size), lux.config.SQLconnection) return output @staticmethod @@ -38,14 +37,10 @@ def execute_sampling(tbl: LuxSQLTable): SAMPLE_CAP = lux.config.sampling_cap SAMPLE_FRAC = 0.2 - length_query = pandas.read_sql( - "SELECT COUNT(*) as length FROM {}".format(tbl.table_name), - lux.config.SQLconnection, - ) + length_query = pandas.read_sql(lux.config.query_templates['length_query'].format(table_name = tbl.table_name, where_clause = ""),lux.config.SQLconnection,) limit = int(list(length_query["length"])[0]) * SAMPLE_FRAC - tbl._sampled = pandas.read_sql( - "SELECT * from {} LIMIT {}".format(tbl.table_name, str(limit)), lux.config.SQLconnection - ) + sample_query = lux.config.query_templates['sample_query'].format(table_name = tbl.table_name, where_clause = "", num_rows = str(int(limit))) + tbl._sampled = pandas.read_sql(sample_query, lux.config.SQLconnection) @staticmethod def execute(view_collection: VisList, tbl: LuxSQLTable, approx: bool = False): 2) Query necessary data, applying appropriate aggregation for the chart type 3) populates vis' data with a DataFrame with relevant results """ - for view in view_collection: # choose execution method depending on vis mark type # when mark is empty, deal with lazy execution by filling the data with a small sample of the dataframe + if view.mark == "": SQLExecutor.execute_sampling(tbl) view._vis_data = tbl._sampled if view.mark == "scatter": where_clause, filterVars = SQLExecutor.execute_filter(view) - length_query =
pandas.read_sql( - "SELECT COUNT(1) as length FROM {} {}".format(tbl.table_name, where_clause), - lux.config.SQLconnection, - ) + length_query = pandas.read_sql(lux.config.query_templates['length_query'].format(table_name = tbl.table_name, where_clause = where_clause),lux.config.SQLconnection,) view_data_length = list(length_query["length"])[0] - if len(view.get_attr_by_channel("color")) == 1 or view_data_length < 5000: + if view_data_length < lux.config._heatmap_start: # NOTE: might want to have a check somewhere to not use categorical variables with greater than some number of categories as a Color variable---------------- has_color = True SQLExecutor.execute_scatter(view, tbl) else: view._mark = "heatmap" SQLExecutor.execute_2D_binning(view, tbl) + elif view.mark == "heatmap": + SQLExecutor.execute_2D_binning(view, tbl) elif view.mark == "bar" or view.mark == "line": SQLExecutor.execute_aggregate(view, tbl) elif view.mark == "histogram": @@ -111,29 +105,32 @@ def execute_scatter(view: Vis, tbl: LuxSQLTable): attributes.add(clause.attribute) where_clause, filterVars = SQLExecutor.execute_filter(view) - length_query = pandas.read_sql( - "SELECT COUNT(1) as length FROM {} {}".format(tbl.table_name, where_clause), - lux.config.SQLconnection, - ) + length_query = pandas.read_sql(lux.config.query_templates['length_query'].format(table_name = tbl.table_name, where_clause = where_clause),lux.config.SQLconnection,) def add_quotes(var_name): return '"' + var_name + '"' required_variables = attributes | set(filterVars) - required_variables = map(add_quotes, required_variables) - required_variables = ",".join(required_variables) - row_count = list( - pandas.read_sql( - f"SELECT COUNT(*) FROM {tbl.table_name} {where_clause}", - lux.config.SQLconnection, - )["count"] - )[0] + if lux.config.handle_quotes: + required_variables = map(add_quotes, required_variables) + required_variables_str = ",".join(required_variables) + row_count = list(pandas.read_sql(lux.config.query_templates['length_query'].format(table_name = tbl.table_name, where_clause = where_clause),lux.config.SQLconnection,)["length"])[0] if row_count > lux.config.sampling_cap: - query = f"SELECT {required_variables} FROM {tbl.table_name} {where_clause} ORDER BY random() LIMIT 10000" + query = lux.config.query_templates['sample_query'].format(columns = required_variables_str, table_name = tbl.table_name, where_clause = where_clause, num_rows = 10000) + #query = f"SELECT {required_variables} FROM {tbl.table_name} {where_clause} ORDER BY random() LIMIT 10000" else: - query = "SELECT {} FROM {} {}".format(required_variables, tbl.table_name, where_clause) + query = lux.config.query_templates['scatter_query'].format(columns = required_variables_str, table_name = tbl.table_name, where_clause = where_clause) data = pandas.read_sql(query, lux.config.SQLconnection) + + if len(attributes | set(filterVars)) == 2: + assert(len(data.columns) == 2) + else: + assert(len(data.columns) == 3) view._vis_data = utils.pandas_to_lux(data) + view._query = query + # view._vis_data.length = list(length_query["length"])[0] + + tbl._message.add_unique(f"Large scatterplots detected: Lux is automatically binning scatterplots to heatmaps.",priority=98,) @staticmethod def execute_aggregate(view: Vis, tbl: LuxSQLTable, isFiltered=True): @@ -182,128 +179,62 @@ def execute_aggregate(view: Vis, tbl: LuxSQLTable, isFiltered=True): if measure_attr.attribute == "Record": where_clause, filterVars = SQLExecutor.execute_filter(view) - length_query = pandas.read_sql( - "SELECT
COUNT(*) as length FROM {} {}".format(tbl.table_name, where_clause), - lux.config.SQLconnection, - ) + length_query = pandas.read_sql(lux.config.query_templates['length_query'].format(table_name = tbl.table_name, where_clause = where_clause),lux.config.SQLconnection,) # generates query for colored barchart case if has_color: - count_query = 'SELECT "{}", "{}", COUNT("{}") FROM {} {} GROUP BY "{}", "{}"'.format( - groupby_attr.attribute, - color_attr.attribute, - groupby_attr.attribute, - tbl.table_name, - where_clause, - groupby_attr.attribute, - color_attr.attribute, - ) + count_query = lux.config.query_templates['colored_barchart_counts'].format(groupby_attr = groupby_attr.attribute, color_attr = color_attr.attribute, table_name = tbl.table_name, where_clause = where_clause,) view._vis_data = pandas.read_sql(count_query, lux.config.SQLconnection) + assert((len(view._vis_data.columns) == 3) & ("count" in view._vis_data.columns)) view._vis_data = view._vis_data.rename(columns={"count": "Record"}) view._vis_data = utils.pandas_to_lux(view._vis_data) # generates query for normal barchart case else: - count_query = 'SELECT "{}", COUNT("{}") FROM {} {} GROUP BY "{}"'.format( - groupby_attr.attribute, - groupby_attr.attribute, - tbl.table_name, - where_clause, - groupby_attr.attribute, - ) + count_query = lux.config.query_templates['barchart_counts'].format(groupby_attr = groupby_attr.attribute, table_name = tbl.table_name, where_clause = where_clause,) view._vis_data = pandas.read_sql(count_query, lux.config.SQLconnection) + assert((len(view._vis_data.columns) == 2) & ("count" in view._vis_data.columns)) view._vis_data = view._vis_data.rename(columns={"count": "Record"}) view._vis_data = utils.pandas_to_lux(view._vis_data) + view._query = count_query # view._vis_data.length = list(length_query["length"])[0] # aggregate barchart case, need aggregate data (mean, sum, max) for each group else: where_clause, filterVars = SQLExecutor.execute_filter(view) - - length_query = pandas.read_sql( - "SELECT COUNT(*) as length FROM {} {}".format(tbl.table_name, where_clause), - lux.config.SQLconnection, - ) + length_query = pandas.read_sql(lux.config.query_templates['length_query'].format(table_name = tbl.table_name, where_clause = where_clause),lux.config.SQLconnection,) # generates query for colored barchart case if has_color: if agg_func == "mean": - agg_query = ( - 'SELECT "{}", "{}", AVG("{}") as "{}" FROM {} {} GROUP BY "{}", "{}"'.format( - groupby_attr.attribute, - color_attr.attribute, - measure_attr.attribute, - measure_attr.attribute, - tbl.table_name, - where_clause, - groupby_attr.attribute, - color_attr.attribute, - ) - ) + agg_query = (lux.config.query_templates['colored_barchart_average'].format(groupby_attr = groupby_attr.attribute,color_attr = color_attr.attribute,measure_attr = measure_attr.attribute,table_name = tbl.table_name,where_clause = where_clause,)) view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) - + assert((len(view._vis_data.columns) == 3) & (measure_attr.attribute in view._vis_data.columns)) view._vis_data = utils.pandas_to_lux(view._vis_data) if agg_func == "sum": - agg_query = ( - 'SELECT "{}", "{}", SUM("{}") as "{}" FROM {} {} GROUP BY "{}", "{}"'.format( - groupby_attr.attribute, - color_attr.attribute, - measure_attr.attribute, - measure_attr.attribute, - tbl.table_name, - where_clause, - groupby_attr.attribute, - color_attr.attribute, - ) - ) + agg_query = (lux.config.query_templates['colored_barchart_sum'].format(groupby_attr = 
groupby_attr.attribute,color_attr = color_attr.attribute,measure_attr = measure_attr.attribute,table_name = tbl.table_name,where_clause = where_clause,)) view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) + assert((len(view._vis_data.columns) == 3) & (measure_attr.attribute in view._vis_data.columns)) view._vis_data = utils.pandas_to_lux(view._vis_data) if agg_func == "max": - agg_query = ( - 'SELECT "{}", "{}", MAX("{}") as "{}" FROM {} {} GROUP BY "{}", "{}"'.format( - groupby_attr.attribute, - color_attr.attribute, - measure_attr.attribute, - measure_attr.attribute, - tbl.table_name, - where_clause, - groupby_attr.attribute, - color_attr.attribute, - ) - ) + agg_query = (lux.config.query_templates['colored_barchart_max'].format(groupby_attr = groupby_attr.attribute,color_attr = color_attr.attribute,measure_attr = measure_attr.attribute,table_name = tbl.table_name,where_clause = where_clause,)) view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) + assert((len(view._vis_data.columns) == 3) & (measure_attr.attribute in view._vis_data.columns)) view._vis_data = utils.pandas_to_lux(view._vis_data) # generates query for normal barchart case else: if agg_func == "mean": - agg_query = 'SELECT "{}", AVG("{}") as "{}" FROM {} {} GROUP BY "{}"'.format( - groupby_attr.attribute, - measure_attr.attribute, - measure_attr.attribute, - tbl.table_name, - where_clause, - groupby_attr.attribute, - ) + agg_query = lux.config.query_templates['barchart_average'].format(groupby_attr = groupby_attr.attribute,measure_attr = measure_attr.attribute,table_name = tbl.table_name,where_clause = where_clause,) view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) + assert((len(view._vis_data.columns) == 2) & (measure_attr.attribute in view._vis_data.columns)) view._vis_data = utils.pandas_to_lux(view._vis_data) if agg_func == "sum": - agg_query = 'SELECT "{}", SUM("{}") as "{}" FROM {} {} GROUP BY "{}"'.format( - groupby_attr.attribute, - measure_attr.attribute, - measure_attr.attribute, - tbl.table_name, - where_clause, - groupby_attr.attribute, - ) + agg_query = lux.config.query_templates['barchart_sum'].format(groupby_attr = groupby_attr.attribute,measure_attr = measure_attr.attribute,table_name = tbl.table_name,where_clause = where_clause,) view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) + assert((len(view._vis_data.columns) == 2) & (measure_attr.attribute in view._vis_data.columns)) view._vis_data = utils.pandas_to_lux(view._vis_data) if agg_func == "max": - agg_query = 'SELECT "{}", MAX("{}") as "{}" FROM {} {} GROUP BY "{}"'.format( - groupby_attr.attribute, - measure_attr.attribute, - measure_attr.attribute, - tbl.table_name, - where_clause, - groupby_attr.attribute, - ) + agg_query = lux.config.query_templates['barchart_max'].format(groupby_attr = groupby_attr.attribute,measure_attr = measure_attr.attribute,table_name = tbl.table_name,where_clause = where_clause,) view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) + assert((len(view._vis_data.columns) == 2) & (measure_attr.attribute in view._vis_data.columns)) view._vis_data = utils.pandas_to_lux(view._vis_data) + view._query = agg_query result_vals = list(view._vis_data[groupby_attr.attribute]) # create existing group by attribute combinations if color is specified # this is needed to check what combinations of group_by_attr and color_attr values have a non-zero number of elements in them @@ -318,38 +249,22 @@ def execute_aggregate(view: Vis, tbl: LuxSQLTable, 
isFiltered=True): if len(result_vals) != N_unique_vals * color_cardinality: columns = view._vis_data.columns if has_color: - df = pandas.DataFrame( - { - columns[0]: attr_unique_vals * color_cardinality, - columns[1]: pandas.Series(color_attr_vals).repeat(N_unique_vals), - } - ) - view._vis_data = view._vis_data.merge( - df, - on=[columns[0], columns[1]], - how="right", - suffixes=["", "_right"], - ) + df = pandas.DataFrame({columns[0]: attr_unique_vals * color_cardinality,columns[1]: pandas.Series(color_attr_vals).repeat(N_unique_vals),}) + view._vis_data = view._vis_data.merge(df,on=[columns[0], columns[1]],how="right",suffixes=["", "_right"],) for col in columns[2:]: - view._vis_data[col] = view._vis_data[col].fillna(0) # Triggers __setitem__ - assert len(list(view._vis_data[groupby_attr.attribute])) == N_unique_vals * len( - color_attr_vals - ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`." - view._vis_data = view._vis_data.iloc[ - :, :3 - ] # Keep only the three relevant columns not the *_right columns resulting from merge + # Triggers __setitem__ + view._vis_data[col] = view._vis_data[col].fillna(0) + assert len(list(view._vis_data[groupby_attr.attribute])) == N_unique_vals * len(color_attr_vals), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`." + # Keep only the three relevant columns not the *_right columns resulting from merge + view._vis_data = view._vis_data.iloc[:, :3] else: df = pandas.DataFrame({columns[0]: attr_unique_vals}) - view._vis_data = view._vis_data.merge( - df, on=columns[0], how="right", suffixes=["", "_right"] - ) + view._vis_data = view._vis_data.merge(df, on=columns[0], how="right", suffixes=["", "_right"]) for col in columns[1:]: view._vis_data[col] = view._vis_data[col].fillna(0) - assert ( - len(list(view._vis_data[groupby_attr.attribute])) == N_unique_vals - ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`." + assert (len(list(view._vis_data[groupby_attr.attribute])) == N_unique_vals), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`." 
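Every aggregate branch above differs only in which template key it formats; quoting of identifiers is the template's job, so the executor code stays dialect-neutral. A standalone sketch rendering the same barchart_average aggregation through both dialect templates (the table and column names are hypothetical):

# Template strings copied from postgres_template and mysql_template above.
postgres_avg = 'SELECT "{groupby_attr}", AVG("{measure_attr}") as "{measure_attr}" FROM {table_name} {where_clause} GROUP BY "{groupby_attr}"'
mysql_avg = "SELECT {groupby_attr}, AVG({measure_attr}) as {measure_attr} FROM {table_name} {where_clause} GROUP BY {groupby_attr}"

args = dict(groupby_attr="Origin", measure_attr="Horsepower", table_name="car", where_clause="WHERE 1=1")
print(postgres_avg.format(**args))
# SELECT "Origin", AVG("Horsepower") as "Horsepower" FROM car WHERE 1=1 GROUP BY "Origin"
print(mysql_avg.format(**args))
# SELECT Origin, AVG(Horsepower) as Horsepower FROM car WHERE 1=1 GROUP BY Origin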
view._vis_data = view._vis_data.sort_values(by=groupby_attr.attribute, ascending=True) view._vis_data = view._vis_data.reset_index() view._vis_data = view._vis_data.drop(columns="index") @@ -381,11 +296,8 @@ def execute_binning(view: Vis, tbl: LuxSQLTable): # get filters if available where_clause, filterVars = SQLExecutor.execute_filter(view) - length_query = pandas.read_sql( - "SELECT COUNT(1) as length FROM {} {}".format(tbl.table_name, where_clause), - lux.config.SQLconnection, - ) - # need to calculate the bin edges before querying for the relevant data + length_query = pandas.read_sql(lux.config.query_templates['length_query'].format(table_name = tbl.table_name, where_clause = where_clause),lux.config.SQLconnection,) + bin_width = (attr_max - attr_min) / num_bins upper_edges = [] for e in range(1, num_bins): @@ -396,14 +308,30 @@ def execute_binning(view: Vis, tbl: LuxSQLTable): upper_edges.append(str(curr_edge)) upper_edges = ",".join(upper_edges) view_filter, filter_vars = SQLExecutor.execute_filter(view) - bin_count_query = "SELECT width_bucket, COUNT(width_bucket) FROM (SELECT width_bucket(CAST (\"{}\" AS FLOAT), '{}') FROM {} {}) as Buckets GROUP BY width_bucket ORDER BY width_bucket".format( - bin_attribute.attribute, - "{" + upper_edges + "}", - tbl.table_name, - where_clause, - ) + + #handling for non postgres case + if "cases" in lux.config.query_templates['histogram_counts']: + bucket_edges = [attr_min] + for e in range(1, num_bins): + curr_edge = attr_min + e * bin_width + bucket_edges.append(str(curr_edge)) + bucket_edges.append(attr_max) + + when_line = "WHEN {column} BETWEEN {lower_edge} AND {upper_edge} THEN {label}" + when_lines = "CASE " + for i in range(1, len(bucket_edges)): + when_lines = when_lines + when_line.format(column = bin_attribute.attribute, lower_edge = bucket_edges[i-1], upper_edge = bucket_edges[i], label = str(i-1)) + " " + + when_lines = when_lines + "end" + + #hist_query = "select width_bucket, count(width_bucket) as count from (select ({bucket_cases}) as width_bucket from {table_name} {where_clause}) as buckets group by width_bucket order by width_bucket" + bin_count_query = lux.config.query_templates['histogram_counts'].format(bucket_cases = when_lines, table_name = tbl.table_name, where_clause = where_clause) + # need to calculate the bin edges before querying for the relevant data + else: + bin_count_query = lux.config.query_templates['histogram_counts'].format(bin_attribute = bin_attribute.attribute,upper_edges = "{" + upper_edges + "}",table_name = tbl.table_name,where_clause = where_clause,) bin_count_data = pandas.read_sql(bin_count_query, lux.config.SQLconnection) + assert((len(bin_count_data.columns) ==2) & (set(['width_bucket', 'count']).issubset(bin_count_data.columns))) if not bin_count_data["width_bucket"].isnull().values.any(): # np.histogram breaks if data contain NaN @@ -414,15 +342,9 @@ def execute_binning(view: Vis, tbl: LuxSQLTable): bin_centers = np.array([math.ceil((attr_min + attr_min + bin_width) / 2)]) else: bin_centers = np.array([(attr_min + attr_min + bin_width) / 2]) - bin_centers = np.append( - bin_centers, - np.mean(np.vstack([upper_edges[0:-1], upper_edges[1:]]), axis=0), - ) + bin_centers = np.append(bin_centers,np.mean(np.vstack([upper_edges[0:-1], upper_edges[1:]]), axis=0),) if attr_type == int: - bin_centers = np.append( - bin_centers, - math.ceil((upper_edges[len(upper_edges) - 1] + attr_max) / 2), - ) + bin_centers = np.append(bin_centers,math.ceil((upper_edges[len(upper_edges) - 1] + attr_max) / 2),) else: 
bin_centers = np.append(bin_centers, (upper_edges[len(upper_edges) - 1] + attr_max) / 2) @@ -430,13 +352,9 @@ def execute_binning(view: Vis, tbl: LuxSQLTable): bucket_lables = bin_count_data["width_bucket"].unique() for i in range(0, len(bin_centers)): if i not in bucket_lables: - bin_count_data = bin_count_data.append( - pandas.DataFrame([[i, 0]], columns=bin_count_data.columns) - ) - view._vis_data = pandas.DataFrame( - np.array([bin_centers, list(bin_count_data["count"])]).T, - columns=[bin_attribute.attribute, "Number of Records"], - ) + bin_count_data = bin_count_data.append(pandas.DataFrame([[i, 0]], columns=bin_count_data.columns)) + + view._vis_data = pandas.DataFrame(np.array([bin_centers, list(bin_count_data["count"])]).T,columns=[bin_attribute.attribute, "Number of Records"],) view._vis_data = utils.pandas_to_lux(view.data) # view._vis_data.length = list(length_query["length"])[0] @@ -483,33 +401,44 @@ def execute_2D_binning(view: Vis, tbl: LuxSQLTable): x_upper_edges_string = ",".join(x_upper_edges_string) y_upper_edges_string = ",".join(y_upper_edges) - bin_count_query = "SELECT width_bucket1, width_bucket2, count(*) FROM (SELECT width_bucket(CAST (\"{}\" AS FLOAT), '{}') as width_bucket1, width_bucket(CAST (\"{}\" AS FLOAT), '{}') as width_bucket2 FROM {} {}) as foo GROUP BY width_bucket1, width_bucket2".format( - x_attribute.attribute, - "{" + x_upper_edges_string + "}", - y_attribute.attribute, - "{" + y_upper_edges_string + "}", - tbl.table_name, - where_clause, - ) + if "cases" in lux.config.query_templates['histogram_counts']: + x_bucket_edges = [x_attr_min] + y_bucket_edges = [y_attr_min] + for e in range(1, num_bins): + x_curr_edge = x_attr_min + e * x_bin_width + x_bucket_edges.append(str(x_curr_edge)) + + y_curr_edge = y_attr_min + e * y_bin_width + y_bucket_edges.append(str(y_curr_edge)) + x_bucket_edges.append(x_attr_max) + y_bucket_edges.append(y_attr_max) + + when_line = "WHEN {column} BETWEEN {lower_edge} AND {upper_edge} THEN {label}" + x_when_lines = "CASE " + y_when_lines = "CASE " + for i in range(1, len(x_bucket_edges)): + x_when_lines = x_when_lines + when_line.format(column = x_attribute.attribute, lower_edge = x_bucket_edges[i-1], upper_edge = x_bucket_edges[i], label = str(i-1)) + " " + y_when_lines = y_when_lines + when_line.format(column = y_attribute.attribute, lower_edge = y_bucket_edges[i-1], upper_edge = y_bucket_edges[i], label = str(i-1)) + " " + x_when_lines = x_when_lines + "end" + y_when_lines = y_when_lines + "end" + + #hist_query = "select width_bucket, count(width_bucket) as count from (select ({bucket_cases}) as width_bucket from {table_name} {where_clause}) as buckets group by width_bucket order by width_bucket" + bin_count_query = lux.config.query_templates['heatmap_counts'].format(bucket_cases1 = x_when_lines, bucket_cases2 = y_when_lines, table_name = tbl.table_name, where_clause = where_clause) + + else: + bin_count_query = lux.config.query_templates['heatmap_counts'].format(x_attribute = x_attribute.attribute,x_upper_edges_string = "{" + x_upper_edges_string + "}",y_attribute = y_attribute.attribute,y_upper_edges_string = "{" + y_upper_edges_string + "}",table_name = tbl.table_name,where_clause = where_clause,) # data = pandas.read_sql(bin_count_query, lux.config.SQLconnection) data = pandas.read_sql(bin_count_query, lux.config.SQLconnection) + assert((len(data.columns) == 3) & (set(['width_bucket1', 'width_bucket2', 'count']).issubset(data.columns))) # data = data[data["width_bucket1"] != num_bins - 1] # data = 
data[data["width_bucket2"] != num_bins - 1] if len(data) > 0: - data["xBinStart"] = data.apply( - lambda row: float(x_upper_edges[int(row["width_bucket1"]) - 1]) - x_bin_width, axis=1 - ) - data["xBinEnd"] = data.apply( - lambda row: float(x_upper_edges[int(row["width_bucket1"]) - 1]), axis=1 - ) - data["yBinStart"] = data.apply( - lambda row: float(y_upper_edges[int(row["width_bucket2"]) - 1]) - y_bin_width, axis=1 - ) - data["yBinEnd"] = data.apply( - lambda row: float(y_upper_edges[int(row["width_bucket2"]) - 1]), axis=1 - ) + data["xBinStart"] = data.apply(lambda row: float(x_upper_edges[int(row["width_bucket1"]) - 1]) - x_bin_width, axis=1) + data["xBinEnd"] = data.apply(lambda row: float(x_upper_edges[int(row["width_bucket1"]) - 1]), axis=1) + data["yBinStart"] = data.apply(lambda row: float(y_upper_edges[int(row["width_bucket2"]) - 1]) - y_bin_width, axis=1) + data["yBinEnd"] = data.apply(lambda row: float(y_upper_edges[int(row["width_bucket2"]) - 1]), axis=1) view._vis_data = utils.pandas_to_lux(data) @staticmethod @@ -545,13 +474,22 @@ def create_where_clause(filter_specs, view=""): where_clause.append("AND") curr_value = str(filters[f].value) curr_value = curr_value.replace("'", "''") - where_clause.extend( - [ - '"' + str(filters[f].attribute) + '"', - str(filters[f].filter_op), - "'" + curr_value + "'", - ] - ) + if lux.config.handle_quotes == True: + where_clause.extend( + [ + '"' + str(filters[f].attribute) + '"', + str(filters[f].filter_op), + "'" + curr_value + "'", + ] + ) + else: + where_clause.extend( + [ + str(filters[f].attribute), + str(filters[f].filter_op), + "'" + curr_value + "'", + ] + ) if filters[f].attribute not in filter_vars: filter_vars.append(filters[f].attribute) if view != "": @@ -565,12 +503,20 @@ def create_where_clause(filter_specs, view=""): where_clause.append("WHERE") else: where_clause.append("AND") - where_clause.extend( - [ - '"' + str(a.attribute) + '"', - "IS NOT NULL", - ] - ) + if lux.config.handle_quotes == True: + where_clause.extend( + [ + '"' + str(a.attribute) + '"', + "IS NOT NULL", + ] + ) + else: + where_clause.extend( + [ + str(a.attribute), + "IS NOT NULL", + ] + ) if where_clause == []: return ("", []) @@ -582,10 +528,7 @@ def get_filtered_size(filter_specs, tbl): clause_info = SQLExecutor.create_where_clause(filter_specs=filter_specs, view="") where_clause = clause_info[0] filter_intents = filter_specs[0] - filtered_length = pandas.read_sql( - "SELECT COUNT(1) as length FROM {} {}".format(tbl.table_name, where_clause), - lux.config.SQLconnection, - ) + filtered_length = pandas.read_sql(lux.config.query_templates['length_query'].format(table_name = tbl.table_name, where_clause = where_clause),lux.config.SQLconnection,) return list(filtered_length["length"])[0] ####################################################### @@ -632,8 +575,8 @@ def get_SQL_attributes(self, tbl: LuxSQLTable): table_name = tbl.table_name[self.table_name.index(".") + 1 :] else: table_name = tbl.table_name - attr_query = "SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS where TABLE_NAME = '{}'".format( - table_name + attr_query = lux.config.query_templates['table_attributes_query'].format( + table_name = table_name, ) attributes = list(pandas.read_sql(attr_query, lux.config.SQLconnection)["column_name"]) for attr in attributes: @@ -657,25 +600,14 @@ def compute_stats(self, tbl: LuxSQLTable): # precompute statistics tbl.unique_values = {} tbl._min_max = {} - length_query = pandas.read_sql( - "SELECT COUNT(1) as length FROM {}".format(tbl.table_name), - 
            lux.config.SQLconnection,
-        )
+        length_query = pandas.read_sql(lux.config.query_templates['length_query'].format(table_name = tbl.table_name, where_clause = ""),lux.config.SQLconnection,)
         tbl._length = list(length_query["length"])[0]
         self.get_unique_values(tbl)
         for attribute in tbl.columns:
             if tbl._data_type[attribute] == "quantitative":
-                min_max_query = pandas.read_sql(
-                    'SELECT MIN("{}") as min, MAX("{}") as max FROM {}'.format(
-                        attribute, attribute, tbl.table_name
-                    ),
-                    lux.config.SQLconnection,
-                )
-                tbl._min_max[attribute] = (
-                    list(min_max_query["min"])[0],
-                    list(min_max_query["max"])[0],
-                )
+                min_max_query = pandas.read_sql(lux.config.query_templates['min_max_query'].format(attribute = attribute, table_name = tbl.table_name),lux.config.SQLconnection,)
+                tbl._min_max[attribute] = (list(min_max_query["min"])[0],list(min_max_query["max"])[0],)
 
     def get_cardinality(self, tbl: LuxSQLTable):
         """
@@ -693,13 +625,8 @@ def get_cardinality(self, tbl: LuxSQLTable):
         """
         cardinality = {}
         for attr in list(tbl.columns):
-            card_query = 'SELECT Count(Distinct("{}")) FROM {} WHERE "{}" IS NOT NULL'.format(
-                attr, tbl.table_name, attr
-            )
-            card_data = pandas.read_sql(
-                card_query,
-                lux.config.SQLconnection,
-            )
+            card_query = lux.config.query_templates['cardinality_query'].format(attribute = attr, table_name = tbl.table_name)
+            card_data = pandas.read_sql(card_query,lux.config.SQLconnection,)
             cardinality[attr] = list(card_data["count"])[0]
         tbl.cardinality = cardinality
@@ -719,13 +646,8 @@ def get_unique_values(self, tbl: LuxSQLTable):
         """
         unique_vals = {}
         for attr in list(tbl.columns):
-            unique_query = 'SELECT Distinct("{}") FROM {} WHERE "{}" IS NOT NULL'.format(
-                attr, tbl.table_name, attr
-            )
-            unique_data = pandas.read_sql(
-                unique_query,
-                lux.config.SQLconnection,
-            )
+            unique_query = lux.config.query_templates['unique_query'].format(attribute = attr, table_name = tbl.table_name)
+            unique_data = pandas.read_sql(unique_query,lux.config.SQLconnection,)
             unique_vals[attr] = list(unique_data[attr])
         tbl.unique_values = unique_vals
@@ -751,9 +673,7 @@ def compute_data_type(self, tbl: LuxSQLTable):
         table_name = tbl.table_name
         # get the data types of the attributes in the SQL table
         for attr in list(tbl.columns):
-            datatype_query = "SELECT DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{}' AND COLUMN_NAME = '{}'".format(
-                table_name, attr
-            )
+            datatype_query = lux.config.query_templates['datatype_query'].format(table_name = table_name, attribute = attr)
             datatype = list(pandas.read_sql(datatype_query, lux.config.SQLconnection)["data_type"])[0]
             if str(attr).lower() in {"month", "year"} or "time" in datatype or "date" in datatype:
                 data_type[attr] = "temporal"
@@ -774,6 +694,7 @@ def compute_data_type(self, tbl: LuxSQLTable):
                 "smallint",
                 "smallserial",
                 "serial",
+                "double",
                 "double precision",
             }:
                 if tbl.cardinality[attr] < 13:
diff --git a/lux/interestingness/interestingness.py b/lux/interestingness/interestingness.py
index 068fa61a..51620de5 100644
--- a/lux/interestingness/interestingness.py
+++ b/lux/interestingness/interestingness.py
@@ -230,7 +230,7 @@ def deviation_from_overall(
         vdata = vis.data
         v_filter_size = get_filtered_size(filter_specs, ldf)
         v_size = len(vis.data)
-    elif lux.config.executor.name == "SQLExecutor":
+    else:
         from lux.executor.SQLExecutor import SQLExecutor
 
         v_filter_size = SQLExecutor.get_filtered_size(filter_specs, ldf)
diff --git a/lux/processor/Compiler.py b/lux/processor/Compiler.py
index 1179b39c..431e8451 100644
--- a/lux/processor/Compiler.py
+++ b/lux/processor/Compiler.py
@@ -373,8 +373,7 @@ def line_or_bar_or_geo(ldf, dimension: Clause, measure: Clause):
             if attr != "Record" and attr in ldf._min_max
         )
         # Replace scatterplot with heatmap
-        HBIN_START = 5000
-        if vis.mark == "scatter" and lux.config.heatmap and len(ldf) > HBIN_START:
+        if vis.mark == "scatter" and lux.config.heatmap and len(ldf) > lux.config._heatmap_start:
             vis._postbin = True
             ldf._message.add_unique(
                 f"Large scatterplots detected: Lux is automatically binning scatterplots to heatmaps.",
diff --git a/lux/utils/tracing_utils.py b/lux/utils/tracing_utils.py
new file mode 100644
index 00000000..18eefd88
--- /dev/null
+++ b/lux/utils/tracing_utils.py
@@ -0,0 +1,186 @@
+import inspect
+import sys
+import pickle as pkl
+import lux
+import autopep8
+import math
+import os
+
+
+class LuxTracer:
+    def profile_func(self, frame, event, arg):
+        # Profile functions should have three arguments: frame, event, and arg.
+        # frame is the current stack frame.
+        # event is a string: 'call', 'return', 'c_call', 'c_return', or 'c_exception'.
+        # arg depends on the event type.
+        # See: https://docs.python.org/3/library/sys.html#sys.settrace
+        try:
+            if event == "line":
+                # frame objects are described here: https://docs.python.org/3/library/inspect.html
+                fcode = frame.f_code
+                line_no = frame.f_lineno
+                func_name = fcode.co_name
+
+                includeMod = [
+                    os.path.join("lux", "vis"),
+                    os.path.join("lux", "vislib"),
+                    os.path.join("lux", "executor"),
+                ]
+                includeFunc = [
+                    "add_quotes",
+                    "execute",
+                    "execute_sampling",
+                    "execute_filter",
+                    "execute_binning",
+                    "execute_scatter",
+                    "execute_aggregate",
+                    "execute_2D_binning",
+                ]
+                if any(x in frame.f_code.co_filename for x in includeMod):
+                    if (
+                        func_name != "<lambda>"
+                    ):  # ignore one-liner lambda functions (repeated line events)
+                        if any(
+                            x in f"{frame.f_code.co_filename}--{func_name}--{line_no}"
+                            for x in includeFunc
+                        ):
+                            lux.config.tracer_relevant_lines.append(
+                                [frame.f_code.co_filename, func_name, line_no]
+                            )
+                            # print(f"{frame.f_code.co_filename}--{func_name}--{line_no}")
+
+        except:
+            # pass # just swallow errors to avoid interference with traced processes
+            raise  # for debugging
+        return self.profile_func
+
+    def start_tracing(self):
+        # print ("-----------start_tracing-----------")
+        # Implement python source debugger: https://docs.python.org/3/library/sys.html#sys.settrace
+        # setprofile faster than settrace (only go through I/O)
+        sys.settrace(self.profile_func)
+
+    def stop_tracing(self):
+        # print ("-----------stop_tracing-----------")
+        sys.settrace(None)
+
+    def process_executor_code(self, executor_lines):
+        selected = {}
+        selected_index = {}
+        index = 0
+        curr_for = ""
+        curr_for_len = 0
+        in_loop = False
+        loop_end = 0
+        output = ""
+        function_code = ""
+
+        for l in range(0, len(executor_lines)):
+            line = executor_lines[l]
+            filename = line[0]
+            funcname = line[1]
+            line_no = line[2] - 1
+
+            codelines = open(filename).readlines()  # TODO: do sharing of file content here
+            if funcname not in ["__init__"]:
+                code = codelines[line_no]
+                ignore_construct = [
+                    "if",
+                    "elif",
+                    "return",
+                    "try",
+                    "assert",
+                    "with",
+                ]  # prune out these control flow programming constructs
+                ignore_lux_keyword = [
+                    "self.code",
+                    "self.name",
+                    "__init__",
+                    "'''",
+                    "self.output_type",
+                    "message.add_unique",
+                    "Large scatterplots detected",
+                    "priority=",
+                    "for vis in vislist",
+                    "for view in view_collection",
+                    "execute_aggregate",
+                    "execute_binning",
+                    "execute_2D_binning",
+                ]  # Lux-specific keywords to ignore
+                whitelist = ['if clause.attribute != "Record":', "bin_attribute ="]
+                ignore = ignore_construct + ignore_lux_keyword
+                if not any(construct in code for construct in ignore) or any(
+                    construct in code for construct in whitelist
+                ):
+                    clean_code_line = codelines[line_no].lstrip()
+                    code_line = codelines[line_no].replace(" ", "", 2)
+                    if clean_code_line.lstrip() not in selected:
+                        if "def add_quotes(var_name):" in clean_code_line:
+                            clean_code_line = (
+                                "def add_quotes(var_name):\n\treturn '\"' + var_name + '\"'\n"
+                            )
+                            selected[clean_code_line] = index
+                            selected_index[index] = clean_code_line.lstrip()
+                        else:
+                            leading_spaces = len(code_line) - len(code_line.lstrip())
+                            num_tabs = math.ceil(leading_spaces / 8)
+                            clean_code_line = "\t" * num_tabs + code_line.lstrip()
+                            if clean_code_line.lstrip() not in selected:
+                                selected_index[index] = clean_code_line
+                                selected[clean_code_line.lstrip()] = index
+                        index += 1
+
+        curr_executor = lux.config.executor.name
+        if curr_executor != "PandasExecutor":
+            import_code = "from lux.utils import utils\nfrom lux.executor.SQLExecutor import SQLExecutor\nimport pandas\nimport math\n"
+            var_init_code = "tbl = 'insert your LuxSQLTable variable here'\nview = 'insert the name of your Vis object here'\n"
+        else:
+            import_code = "from lux.utils import utils\nfrom lux.executor.PandasExecutor import PandasExecutor\nimport pandas\nimport math\n"
+            var_init_code = "ldf = 'insert your LuxDataFrame variable here'\nvis = 'insert the name of your Vis object here'\nvis._vis_data = ldf\n"
+        function_code += "\t" + import_code
+
+        # need to do some preprocessing before we give the code to autopep8 for formatting
+        # there are some cases that the formatter does not handle correctly which we need to handle on our own
+        prev_line = ""
+        for key in selected_index.keys():
+            line = selected_index[key]
+            line_stripped = line.lstrip()
+            leading_spaces = len(line) - len(line_stripped)
+            if (
+                leading_spaces > 0
+                and not prev_line.lstrip().startswith("for")
+                and not line_stripped.startswith("attributes.add")
+            ):
+                leading_spaces = leading_spaces - 1
+            if prev_line != "":
+                construct_check = prev_line.lstrip().startswith(
+                    ("if", "else", "with", "for")
+                ) or line_stripped.startswith(("if", "else", "with", "for"))
+                prev_leading_spaces = len(prev_line) - len(prev_line.lstrip())
+
+                if not construct_check:
+                    if prev_leading_spaces < leading_spaces:
+                        leading_spaces = prev_leading_spaces
+                if "curr_edge" in line:
+                    leading_spaces = leading_spaces + 1
+            line = "\t" * leading_spaces + line.lstrip()
+            function_code += line
+            prev_line = line
+
+        if curr_executor != "PandasExecutor":
+            output += "def create_chart_data(tbl, view):\n"
+            function_code += "\nreturn view._vis_data"
+        else:
+            output += "def create_chart_data(ldf, vis):\n"
+            function_code += "\nreturn vis._vis_data"
+
+        # options = autopep8.parse_args(['--max-line-length', '100000', '-', "--ignore", "E231,E225,E226,E227,E228,E22"])
+        # options = autopep8.parse_args(['--max-line-length', '100000', '-', "--ignore", "E101,E128,E211,E22,E27,W191,E231"])
+        options = autopep8.parse_args(
+            ["--max-line-length", "100000", "-", "--select", "E20,E112,E113,E117"]
+        )
+        function_code = autopep8.fix_code(function_code, options)
+
+        for line in function_code.split("\n"):
+            output += "\t" + line + "\n"
+        return output
diff --git a/lux/utils/utils.py b/lux/utils/utils.py
index 0b38da79..639c9778 100644
--- a/lux/utils/utils.py
+++ b/lux/utils/utils.py
@@ -110,6 +110,10 @@ def check_if_id_like(df, attribute):
     return high_cardinality and (almost_all_vals_unique or evenly_spaced)
 
 
+def check_if_id_like_for_sql(df, attribute):
+    return df.cardinality[attribute] >= 0.98 * len(df)
+
+
 def like_nan(val):
     if isinstance(val, str):
         return val.lower() == "nan"
diff --git a/lux/vis/Vis.py b/lux/vis/Vis.py
index ea5eb9f7..a8b17c0f 100644
--- a/lux/vis/Vis.py
+++ b/lux/vis/Vis.py
@@ -238,7 +238,31 @@ def to_altair(self, standalone=False) -> str:
         renderer = AltairRenderer(output_type="Altair")
         self._code = renderer.create_vis(self, standalone)
-        return self._code
+
+        if lux.config.executor.name == "PandasExecutor":
+            function_code = "def plot_data(source_df, vis):\n"
+            function_code += "\timport altair as alt\n"
+            function_code += "\tvisData = create_chart_data(source_df, vis)\n"
+        else:
+            function_code = "def plot_data(tbl, vis):\n"
+            function_code += "\timport altair as alt\n"
+            function_code += "\tvisData = create_chart_data(tbl, vis)\n"
+
+        vis_code_lines = self._code.split("\n")
+        for i in range(2, len(vis_code_lines) - 1):
+            function_code += "\t" + vis_code_lines[i] + "\n"
+        function_code += "\treturn chart\n#plot_data(your_df, vis) this creates an Altair plot using your source data and vis specification"
+        function_code = function_code.replace("alt.Chart(tbl)", "alt.Chart(visData)")
+
+        if "mark_circle" in function_code:
+            function_code = function_code.replace("plot_data", "plot_scatterplot")
+        elif "mark_bar" in function_code:
+            function_code = function_code.replace("plot_data", "plot_barchart")
+        elif "mark_line" in function_code:
+            function_code = function_code.replace("plot_data", "plot_linechart")
+        elif "mark_rect" in function_code:
+            function_code = function_code.replace("plot_data", "plot_heatmap")
+        return function_code
 
     def to_matplotlib(self) -> str:
         """
@@ -314,6 +338,21 @@ def to_code(self, language="vegalite", **kwargs):
             return self.to_matplotlib()
         elif language == "matplotlib_svg":
             return self._to_matplotlib_svg()
+        elif language == "python":
+            lux.config.tracer.start_tracing()
+            lux.config.executor.execute(lux.vis.VisList.VisList(input_lst=[self]), self._source)
+            lux.config.tracer.stop_tracing()
+            self._trace_code = lux.config.tracer.process_executor_code(lux.config.tracer_relevant_lines)
+            lux.config.tracer_relevant_lines = []
+            return self._trace_code
+        elif language == "SQL":
+            if self._query:
+                return self._query
+            else:
+                warnings.warn(
+                    "The data for this Vis was not collected via a SQL database. Use the 'python' parameter to view the code used to generate the data.",
+                    stacklevel=2,
+                )
         else:
             warnings.warn(
                 "Unsupported plotting backend. Lux currently only support 'altair', 'vegalite', or 'matplotlib'",
diff --git a/lux/vislib/altair/AltairRenderer.py b/lux/vislib/altair/AltairRenderer.py
index 2137e9e7..81388e52 100644
--- a/lux/vislib/altair/AltairRenderer.py
+++ b/lux/vislib/altair/AltairRenderer.py
@@ -124,9 +124,17 @@ def create_vis(self, vis, standalone=True):
                 var_name for var_name, var_val in callers_local_vars if var_val is var
             ]
             all_vars.extend(possible_vars)
+            for possible_var in all_vars:
+                if possible_var[0] != "_":
+                    print(possible_var)
+
             found_variable = [
                 possible_var for possible_var in all_vars if possible_var[0] != "_"
-            ][0]
+            ]
+            if len(found_variable) > 0:
+                found_variable = found_variable[0]
+            else:
+                found_variable = "df"
         else:  # if vis._source was not set when the Vis was created
             found_variable = "df"
         if standalone:
diff --git a/pyproject.toml b/pyproject.toml
index f8f8a67b..b7a13980 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,2 +1,3 @@
 [tool.black]
-line-length = 105
\ No newline at end of file
+line-length = 105
+exclude = 'Executor\.py$'
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 701d702d..2cac52ce 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,5 @@ pandas>=1.2.0,<1.3.0
 scikit-learn>=0.22
 matplotlib>=3.0.0
 lux-widget>=0.1.4
+autopep8>=1.5
 iso3166
diff --git a/tests/test_export.py b/tests/test_export.py
new file mode 100644
index 00000000..28b7f38e
--- /dev/null
+++ b/tests/test_export.py
@@ -0,0 +1,103 @@
+# Copyright 2019-2020 The Lux Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .context import lux
+import pytest
+import pandas as pd
+import numpy as np
+import psycopg2
+from lux.vis.Vis import Vis
+from lux.executor.PandasExecutor import PandasExecutor
+
+
+def test_scatter_code_export(global_var):
+    df = pytest.car_df
+
+    vis = Vis([lux.Clause("Horsepower"), lux.Clause("Acceleration")], df)
+    PandasExecutor.execute([vis], df)
+    code = vis.to_code("python")
+    try:
+        exec(code, globals())
+        create_chart_data(df, vis)
+    except:
+        assert False
+
+
+def test_color_scatter_code_export(global_var):
+    df = pytest.car_df
+
+    vis = Vis([lux.Clause("Horsepower"), lux.Clause("Acceleration"), lux.Clause("Origin")], df)
+    PandasExecutor.execute([vis], df)
+    code = vis.to_code("python")
+    try:
+        exec(code, globals())
+        create_chart_data(df, vis)
+    except:
+        assert False
+
+
+def test_histogram_code_export(global_var):
+    df = pytest.car_df
+
+    vis = Vis([lux.Clause("Horsepower")], df)
+    PandasExecutor.execute([vis], df)
+    code = vis.to_code("python")
+    try:
+        exec(code, globals())
+        create_chart_data(df, vis)
+    except:
+        assert False
+
+
+def test_barchart_code_export(global_var):
+    df = pytest.car_df
+
+    vis = Vis([lux.Clause("Origin")], df)
+    PandasExecutor.execute([vis], df)
+    code = vis.to_code("python")
+    try:
+        exec(code, globals())
+        create_chart_data(df, vis)
+    except:
+        assert False
+
+
+def test_color_barchart_code_export(global_var):
+    df = pytest.car_df
+
+    vis = Vis([lux.Clause("Origin"), lux.Clause("Cylinders")], df)
+    PandasExecutor.execute([vis], df)
+    code = vis.to_code("python")
+    try:
+        exec(code, globals())
+        create_chart_data(df, vis)
+    except:
+        assert False
+
+
+def test_heatmap_code_export(global_var):
+    df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv")
+    lux.config._heatmap_start = 100
+
+    vis = Vis(["price", "longitude"], df)
+    PandasExecutor.execute([vis], df)
+    code = vis.to_code("python")
+
+    try:
+        exec(code, globals())
+        create_chart_data(df, vis)
+    except:
+        assert False
+
+    lux.config._heatmap_start = 5000
diff --git a/tests_sql/conftest.py b/tests_sql/conftest.py
index 8ee3ddbb..0e7e03f9 100644
--- a/tests_sql/conftest.py
+++ b/tests_sql/conftest.py
@@ -1,9 +1,14 @@
 import pytest
 import pandas as pd
+import psycopg2
+import lux
 
 
 @pytest.fixture(scope="session")
 def global_var():
+    connection = psycopg2.connect("host=localhost dbname=postgres user=postgres password=lux")
+    lux.config.set_SQL_connection(connection)
+
     url = "https://github.com/lux-org/lux-datasets/blob/master/data/olympic.csv?raw=true"
     pytest.olympic = pd.read_csv(url)
     pytest.car_df = pd.read_csv("lux/data/car.csv")
diff --git a/tests_sql/test_sql_compiler.py b/tests_sql/test_sql_compiler.py
index 56d4d0f5..e6306f08 100644
--- a/tests_sql/test_sql_compiler.py
+++ b/tests_sql/test_sql_compiler.py
@@ -17,13 +17,9 @@
 import pandas as pd
 from lux.vis.Vis import Vis
 from lux.vis.VisList import VisList
-import psycopg2
 
 
 def test_underspecified_no_vis(global_var, test_recs):
-    connection = psycopg2.connect("host=localhost dbname=postgres user=postgres password=lux")
-    lux.config.set_SQL_connection(connection)
-
     no_vis_actions = ["Correlation", "Distribution", "Occurrence", "Temporal"]
     sql_df = lux.LuxSQLTable(table_name="cars")
 
@@ -38,11 +34,12 @@ def test_underspecified_no_vis(global_var, test_recs):
 def test_underspecified_single_vis(global_var, test_recs):
     one_vis_actions = ["Enhance", "Filter", "Generalize"]
+    lux.config.heatmap = False
     sql_df = lux.LuxSQLTable(table_name="cars")
     sql_df.set_intent([lux.Clause(attribute="milespergal"), lux.Clause(attribute="weight")])
     test_recs(sql_df, one_vis_actions)
     assert len(sql_df.current_vis) == 1
-    assert sql_df.current_vis[0].mark == "scatter"
+    # assert sql_df.current_vis[0].mark == "scatter"
     for attr in sql_df.current_vis[0]._inferred_intent:
         assert attr.data_model == "measure"
     for attr in sql_df.current_vis[0]._inferred_intent:
@@ -344,28 +341,28 @@ def test_autoencoding_color_line_chart(global_var):
     check_attribute_on_channel(vis, "origin", "color")
 
 
-def test_autoencoding_color_scatter_chart(global_var):
-    # test for sql executor
-    sql_df = lux.LuxSQLTable(table_name="cars")
-    vis = Vis(
-        [
-            lux.Clause(attribute="horsepower"),
-            lux.Clause(attribute="acceleration"),
-            lux.Clause(attribute="origin"),
-        ],
-        sql_df,
-    )
-    check_attribute_on_channel(vis, "origin", "color")
-
-    vis = Vis(
-        [
-            lux.Clause(attribute="horsepower"),
-            lux.Clause(attribute="acceleration", channel="color"),
-            lux.Clause(attribute="origin"),
-        ],
-        sql_df,
-    )
-    check_attribute_on_channel(vis, "acceleration", "color")
+# def test_autoencoding_color_scatter_chart(global_var):
+#     # test for sql executor
+#     sql_df = lux.LuxSQLTable(table_name="cars")
+#     vis = Vis(
+#         [
+#             lux.Clause(attribute="horsepower"),
+#             lux.Clause(attribute="acceleration"),
+#             lux.Clause(attribute="origin"),
+#         ],
+#         sql_df,
+#     )
+#     check_attribute_on_channel(vis, "origin", "color")
+
+#     vis = Vis(
+#         [
+#             lux.Clause(attribute="horsepower"),
+#             lux.Clause(attribute="acceleration", channel="color"),
+#             lux.Clause(attribute="origin"),
+#         ],
+#         sql_df,
+#     )
+#     check_attribute_on_channel(vis, "acceleration", "color")
 
 
 def test_populate_options(global_var):
diff --git a/tests_sql/test_sql_executor.py b/tests_sql/test_sql_executor.py
index 2be37f43..e8e62401 100644
--- a/tests_sql/test_sql_executor.py
+++ b/tests_sql/test_sql_executor.py
@@ -22,7 +22,7 @@
 import psycopg2
 
 
-def test_lazy_execution():
+def test_lazy_execution(global_var):
     tbl = lux.LuxSQLTable()
     tbl.set_SQL_table("cars")
 
@@ -37,7 +37,7 @@
     assert type(vis.data) == lux.core.frame.LuxDataFrame
 
 
-def test_selection():
+def test_selection(global_var):
     tbl = lux.LuxSQLTable()
     tbl.set_SQL_table("cars")
 
@@ -50,7 +50,7 @@
     assert all(vislist[2].data.columns == ["year", "acceleration"])
 
 
-def test_aggregation():
+def test_aggregation(global_var):
     tbl = lux.LuxSQLTable()
     tbl.set_SQL_table("cars")
 
@@ -79,7 +79,7 @@
     assert int(result_df[result_df["origin"] == "Europe"]["horsepower"]) == 133
 
 
-def test_colored_bar_chart():
+def test_colored_bar_chart(global_var):
     from lux.vis.Vis import Vis
     from lux.vis.Vis import Clause
 
@@ -100,7 +100,7 @@
     )  # Not color_carsdinality*group_by_carsdinality since some combinations have 0 values
 
 
-def test_colored_line_chart():
+def test_colored_line_chart(global_var):
     from lux.vis.Vis import Vis
     from lux.vis.Vis import Clause
 
@@ -122,7 +122,7 @@
     )  # Not color_carsdinality*group_by_carsdinality since some combinations have 0 values
 
 
-def test_filter():
+def test_filter(global_var):
     tbl = lux.LuxSQLTable()
     tbl.set_SQL_table("cars")
 
@@ -144,7 +144,7 @@
     assert filter_output[1] == ["origin"]
 
 
-def test_inequalityfilter():
+def test_inequalityfilter(global_var):
     tbl = lux.LuxSQLTable()
     tbl.set_SQL_table("cars")
 
@@ -170,7 +170,7 @@
     assert filter_output[1] == ["horsepower"]
 
 
-def test_binning():
+def test_binning(global_var):
     tbl = lux.LuxSQLTable()
     tbl.set_SQL_table("cars")
 
@@ -179,7 +179,7 @@
     assert len(vis.data) == nbins
 
 
-def test_record():
+def test_record(global_var):
     tbl = lux.LuxSQLTable()
     tbl.set_SQL_table("cars")
 
@@ -187,7 +187,7 @@
     assert len(vis.data) == len(tbl.unique_values["cylinders"])
 
 
-def test_filter_aggregation_fillzero_aligned():
+def test_filter_aggregation_fillzero_aligned(global_var):
     tbl = lux.LuxSQLTable()
     tbl.set_SQL_table("cars")
 
@@ -202,7 +202,7 @@
     assert result[result["cylinders"] == 8]["milespergal"].values[0] == 0
 
 
-def test_exclude_attribute():
+def test_exclude_attribute(global_var):
     tbl = lux.LuxSQLTable()
     tbl.set_SQL_table("cars")
 
@@ -215,7 +215,7 @@
     assert vis.get_attr_by_channel("y")[0].attribute != "year"
 
 
-def test_null_values():
+def test_null_values(global_var):
     # checks that the SQLExecutor has filtered out any None or Null values from its metadata
     tbl = lux.LuxSQLTable()
     tbl.set_SQL_table("aug_test_table")
diff --git a/tests_sql/test_sql_export.py b/tests_sql/test_sql_export.py
new file mode 100644
index 00000000..ab21f5cf
--- /dev/null
+++ b/tests_sql/test_sql_export.py
@@ -0,0 +1,108 @@
+# Copyright 2019-2020 The Lux Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .context import lux
+import pytest
+import pandas as pd
+import numpy as np
+import psycopg2
+from lux.vis.Vis import Vis
+from lux.executor.SQLExecutor import SQLExecutor
+
+
+def test_scatter_code_export(global_var):
+    tbl = lux.LuxSQLTable()
+    tbl.set_SQL_table("cars")
+
+    vis = Vis([lux.Clause("horsepower"), lux.Clause("acceleration")], tbl)
+    SQLExecutor.execute([vis], tbl)
+    code = vis.to_code("python")
+    try:
+        exec(code, globals())
+        create_chart_data(tbl, vis)
+    except:
+        assert False
+
+
+def test_color_scatter_code_export(global_var):
+    tbl = lux.LuxSQLTable()
+    tbl.set_SQL_table("cars")
+
+    vis = Vis([lux.Clause("horsepower"), lux.Clause("acceleration"), lux.Clause("origin")], tbl)
+    SQLExecutor.execute([vis], tbl)
+    code = vis.to_code("python")
+    try:
+        exec(code, globals())
+        create_chart_data(tbl, vis)
+    except:
+        assert False
+
+
+def test_histogram_code_export(global_var):
+    tbl = lux.LuxSQLTable()
+    tbl.set_SQL_table("cars")
+
+    vis = Vis([lux.Clause("horsepower")], tbl)
+    SQLExecutor.execute([vis], tbl)
+    code = vis.to_code("python")
+    try:
+        exec(code, globals())
+        create_chart_data(tbl, vis)
+    except:
+        assert False
+
+
+def test_barchart_code_export(global_var):
+    tbl = lux.LuxSQLTable()
+    tbl.set_SQL_table("cars")
+
+    vis = Vis([lux.Clause("origin")], tbl)
+    SQLExecutor.execute([vis], tbl)
+    code = vis.to_code("python")
+    try:
+        exec(code, globals())
+        create_chart_data(tbl, vis)
+    except:
+        assert False
+
+
+def test_color_barchart_code_export(global_var):
+    tbl = lux.LuxSQLTable()
+    tbl.set_SQL_table("cars")
+
+    vis = Vis([lux.Clause("origin"), lux.Clause("cylinders")], tbl)
+    SQLExecutor.execute([vis], tbl)
+    code = vis.to_code("python")
+    try:
+        exec(code, globals())
+        create_chart_data(tbl, vis)
+    except:
+        assert False
+
+
+# def test_heatmap_code_export(global_var):
+#     tbl = lux.LuxSQLTable()
+#     tbl.set_SQL_table("airbnb")
+#     lux.config._heatmap_start = 100
+
+#     vis = Vis(["price", "longitude"], tbl)
+#     SQLExecutor.execute([vis], tbl)
+#     code = vis.to_code("python")
+#     try:
+#         exec(code, globals())
+#         create_chart_data(tbl, vis)
+#     except:
+#         assert False
+
+#     lux.config._heatmap_start = 5000
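
The SQLExecutor changes above replace hard-coded Postgres SQL with named templates looked up from lux.config.query_templates and filled in with str.format. A minimal sketch of that mechanism, assuming illustrative template strings; the shipped templates are loaded from files such as postgres_query_template.txt via lux.config.read_query_template and may differ:

# Hypothetical template strings standing in for the contents of a query
# template file; only the named placeholders matter to the executor.
query_templates = {
    "length_query": "SELECT COUNT(1) as length FROM {table_name} {where_clause}",
    "cardinality_query": 'SELECT COUNT(DISTINCT("{attribute}")) as count FROM {table_name} WHERE "{attribute}" IS NOT NULL',
}

# str.format fills only the named fields, so switching dialects (e.g. Postgres
# to MySQL) means swapping the template file rather than editing executor code.
print(query_templates["cardinality_query"].format(attribute="cylinders", table_name="car"))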
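
The Compiler change above turns the scatter-to-heatmap cutoff from a hard-coded HBIN_START = 5000 into the configurable lux.config._heatmap_start, which the new heatmap export test relies on. A usage sketch (dataset URL taken from the tests; 5000 restores the prior default):

import lux
import pandas as pd
from lux.vis.Vis import Vis

df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv")
lux.config._heatmap_start = 100   # any scatterplot over 100 rows is binned into a heatmap
vis = Vis(["price", "longitude"], df)  # now compiles with vis._postbin = True
lux.config._heatmap_start = 5000  # restore the default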
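
lux/utils/tracing_utils.py powers the new to_code("python") export: LuxTracer registers profile_func with sys.settrace, collects (filename, function, line) triples for "line" events that fall inside whitelisted executor modules, and later reassembles those lines into standalone source. A self-contained sketch of the underlying tracing technique, with toy names rather than Lux internals:

import sys

collected = []

def trace_lines(frame, event, arg):
    # sys.settrace invokes this with "call"/"line"/"return" events; returning
    # the function itself keeps line-level tracing enabled for the frame.
    if event == "line":
        collected.append((frame.f_code.co_name, frame.f_lineno))
    return trace_lines

def demo(x):
    y = x * 2
    return y + 1

sys.settrace(trace_lines)
demo(3)
sys.settrace(None)
print(collected)  # the line numbers executed inside demo()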
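
process_executor_code hands the stitched-together trace lines to autopep8 with a deliberately narrow fix list (indentation codes E112/E113/E117 plus the E20 whitespace family) and an effectively unlimited line length, so statements are normalized but never re-wrapped. A small sketch of that call pattern on made-up input:

import autopep8

messy = "def f():\n        x = 1\n        return x\n"  # over-indented body
options = autopep8.parse_args(
    ["--max-line-length", "100000", "-", "--select", "E20,E112,E113,E117"]
)
print(autopep8.fix_code(messy, options))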
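
check_if_id_like_for_sql mirrors the pandas ID heuristic using only precomputed metadata: an attribute is treated as ID-like when its distinct count covers at least 98% of the rows, which avoids pulling values out of the database. A toy illustration with a stand-in table object (FakeTable is hypothetical):

class FakeTable:
    def __init__(self, n, cardinality):
        self._n = n
        self.cardinality = cardinality  # attribute -> distinct count, as on LuxSQLTable

    def __len__(self):
        return self._n

def check_if_id_like_for_sql(df, attribute):
    return df.cardinality[attribute] >= 0.98 * len(df)

tbl = FakeTable(1000, {"user_id": 999, "origin": 3})
print(check_if_id_like_for_sql(tbl, "user_id"))  # True: nearly every row is unique
print(check_if_id_like_for_sql(tbl, "origin"))   # False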
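
The export tests above also document the intended round trip: to_code("python") returns the source of a create_chart_data function recovered from the trace, and to_altair now wraps its chart code in a plot_* function (renamed per mark) that calls create_chart_data. A usage sketch against the bundled car dataset, assuming the Vis compiles to a scatter mark so the generated function is named plot_scatterplot:

import lux
import pandas as pd
from lux.vis.Vis import Vis

df = pd.read_csv("lux/data/car.csv")
vis = Vis(["Horsepower", "Acceleration"], df)

data_code = vis.to_code("python")  # traces the executor, returns create_chart_data source
exec(data_code, globals())         # defines create_chart_data(ldf, vis)
chart_data = create_chart_data(df, vis)

plot_code = vis.to_altair()        # returns plot_scatterplot(source_df, vis) source
exec(plot_code, globals())
chart = plot_scatterplot(df, vis)  # rebuilds the data via create_chart_data, returns the Altair chart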
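
For SQL-backed data, to_code("SQL") simply returns the query stored on vis._query during execution; on pandas-backed data it warns and points to the "python" export instead. A sketch using the same connection string and table as the SQL tests:

import lux
import psycopg2
from lux.vis.Vis import Vis

connection = psycopg2.connect("host=localhost dbname=postgres user=postgres password=lux")
lux.config.set_SQL_connection(connection)

tbl = lux.LuxSQLTable(table_name="cars")
vis = Vis(["horsepower", "acceleration"], tbl)
print(vis.to_code("SQL"))  # the SQL Lux ran to fetch this Vis's data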
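
The AltairRenderer fix above hardens the trick of recovering the user's variable name by inspecting the caller's frame: collect names bound to the object, skip underscore-prefixed names, and fall back to "df" instead of raising IndexError when nothing matches. A standalone sketch of that pattern:

import inspect

def find_name(var):
    # Walk the caller's locals for names bound to this exact object.
    callers_local_vars = inspect.currentframe().f_back.f_locals.items()
    candidates = [name for name, val in callers_local_vars if val is var and name[0] != "_"]
    return candidates[0] if candidates else "df"  # same fallback as create_vis

my_table = [1, 2, 3]
print(find_name(my_table))  # "my_table"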