diff --git a/examples/Postgres_Executor_Example.ipynb b/examples/Postgres_Executor_Example.ipynb index 5220e8ab..a3bdf3d7 100644 --- a/examples/Postgres_Executor_Example.ipynb +++ b/examples/Postgres_Executor_Example.ipynb @@ -13,7 +13,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can upload the car dataset to a local postgres database using the upload_car_data.py script in the lux/data folder. You will need to update the name of the database and the login credentials in that file." + "This demo requires that you have a local PostgreSQL database already set up. If you have not done this yet, you can download the PostgreSQL installer here: https://www.postgresql.org/download/. Follow the instructions to get your database environment set up." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have your PostgreSQL environment set up, you can upload the example [car dataset](https://github.com/lux-org/lux-datasets/blob/master/data/car.csv) to your database using the script found [here](https://github.com/thyneb19/lux/blob/Database-Executor/lux/data/upload_car_data.py)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To connect Lux to your PostgreSQL database, you will first need to create a psycopg2 connection. After that you will be able to specify this connection in the Lux config, and connect a Lux DataFrame to a table as shown below." 
] }, { @@ -26,11 +40,18 @@ "import psycopg2\n", "import pandas as pd\n", "\n", - "connection = psycopg2.connect(\"host=localhost dbname=postgres_db user=postgres password=lux\")\n", + "connection = psycopg2.connect(\"host=localhost dbname=postgres user=postgres password=lux\")\n", "\n", "sql_df = lux.LuxDataFrame()\n", - "sql_df.set_SQL_connection(connection, \"car\")\n", - "sql_df" + "lux.config.set_SQL_connection(connection)\n", + "sql_df.set_SQL_table(\"cars\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once the Lux DataFrame has been connected to a database table, the parameters necessary to run Lux's recommendation system will automatically be populated." ] }, { @@ -39,19 +60,51 @@ "metadata": {}, "outputs": [], "source": [ + "#you can view the variable datatypes here\n", "sql_df.data_type" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that the connection between your DataFrame and your database has been established, you can leverage all of Lux's visual recommendation tools. For a more in-depth look at Lux's functions, check out the main repository [here](https://github.com/lux-org/lux)." 
+ ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "#call the Lux DataFrame to view general variable distributions and relationships.\n", + "#You will see that the DataFrame contains the columns of your database table, but is otherwise empty.\n", + "#Data is processed as much as possible on the database end, and is only brought in locally when needed to create visualizations.\n", + "sql_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#you can specify intents just the same as the default Lux system\n", + "from lux.vis import Clause\n", + "\n", + "#here we specify that we are interested in a graph containing the variables 'milespergal' and 'cylinders'\n", + "#we also specify that we want to apply a filter 'horsepower > 150' to this visualization\n", "sql_df.set_intent([\"milespergal\", 'cylinders', Clause(attribute =\"horsepower\", filter_op=\">\", value=150)])\n", "sql_df" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also use Lux's Vis package to generate visualizations without having to pull in or process data from your database manually. Instead, you can specify visualization channels and create graphs as shown below." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -61,12 +114,21 @@ "from lux.vis.Vis import Vis\n", "from lux.vis.Vis import Clause\n", "\n", - "x_clause = Clause(attribute = \"milespergal\", channel = \"x\")\n", - "y_clause = Clause(attribute = \"weight\", channel = \"y\")\n", + "#Create a new Lux Clause for each variable you want to use in your graph\n", + "#Specify how you want to use the variable in the graph via the channel parameter.\n", + "#The channel parameter will specify whether or not a variable is used on the x or y axis, or used to color datapoints\n", + "x_clause = Clause(attribute = \"acceleration\", channel = \"x\")\n", + "y_clause = Clause(attribute = \"milespergal\", channel = \"y\")\n", "color_clause = Clause(attribute = 'cylinders', channel = \"color\")\n", - "filter_clause = Clause(attribute =\"horsepower\", filter_op=\">\", value=150)\n", "\n", - "new_vis = Vis([x_clause, y_clause, color_clause])\n", + "#you can also create filters on your data using Lux Clauses like so\n", + "filter_clause = Clause(attribute =\"origin\", filter_op=\"=\", value='USA')\n", + "\n", + "#to create the graph, create a Lux Vis object with the list of your Clauses as the parameter\n", + "new_vis = Vis([x_clause, y_clause, color_clause, filter_clause])\n", + "\n", + "#to fetch the data necessary for the graph, use the refresh_source function.\n", + "#the refresh_source function takes in a Lux DataFrame, in this case you can specify the one connected to your database table\n", "new_vis.refresh_source(sql_df)\n", "new_vis" ] diff --git a/lux/data/upload_car_data.py b/lux/data/upload_car_data.py index 4c853324..af50bb66 100644 --- a/lux/data/upload_car_data.py +++ b/lux/data/upload_car_data.py @@ -2,17 +2,17 @@ import psycopg2 import csv -conn = psycopg2.connect("host=localhost dbname=postgres_db user=postgres password=lux") +conn = psycopg2.connect("host=localhost dbname=postgres user=postgres password=lux") cur = conn.cursor() cur.execute( """ - DROP TABLE 
IF EXISTS car + DROP TABLE IF EXISTS cars """ ) # create car table in postgres database cur.execute( """ - CREATE TABLE car( + CREATE TABLE cars( name text, milespergal numeric, cylinders integer, @@ -28,11 +28,14 @@ ) # open car.csv and read data into database -with open("lux/data/car.csv", "r") as f: - reader = csv.reader(f) - next(reader) # Skip the header row. - i = 0 - for row in reader: - cur.execute("INSERT INTO car VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", row) +import urllib.request +target_url = "https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/car.csv" +for line in urllib.request.urlopen(target_url): + decoded = line.decode("utf-8") + print(decoded.split(",")) + if "Name,MilesPerGal,Cylinders" not in decoded: + cur.execute( + "INSERT INTO cars VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", decoded.split(",") + ) conn.commit() diff --git a/lux/executor/SQLExecutor.py b/lux/executor/SQLExecutor.py index 05188238..868d1c01 100644 --- a/lux/executor/SQLExecutor.py +++ b/lux/executor/SQLExecutor.py @@ -28,19 +28,13 @@ def __repr__(self): def execute(view_collection: VisList, ldf: LuxDataFrame): """ Given a VisList, fetch the data required to render the view - 1) Apply filters - 2) Retreive relevant attribute - 3) return a DataFrame with relevant results + 1) Generate Necessary WHERE clauses + 2) Query necessary data, applying appropriate aggregation for the chart type + 3) populates vis' data with a DataFrame with relevant results """ for view in view_collection: - # Select relevant data based on attribute information - attributes = set([]) - for clause in view._inferred_intent: - if clause.attribute: - if clause.attribute != "Record": - attributes.add(clause.attribute) - # for lazy execution: if mark is not specified, need to compile the vis before execution + # choose execution method depending on vis mark type if view.mark == "": view.refresh_source(ldf) elif view.mark == "scatter": @@ -61,6 +55,26 @@ def 
execute(view_collection: VisList, ldf: LuxDataFrame): @staticmethod def execute_scatter(view: Vis, ldf: LuxDataFrame): + """ + Given a scatterplot vis and a Lux DataFrame, fetch the data required to render the vis. + 1) Generate WHERE clause for the SQL query + 2) Check number of datapoints to be included in the query + 3) If the number of datapoints exceeds 10000, perform a random sample from the original data + 4) Query datapoints needed for the scatterplot visualization + 5) return a DataFrame with relevant results + + Parameters + ---------- + view: lux.Vis + lux.Vis object that represents the scatterplot visualization. + ldf : lux.core.frame + LuxDataFrame with specified intent. + + Returns + ------- + None + """ + attributes = set([]) for clause in view._inferred_intent: if clause.attribute: @@ -106,7 +120,7 @@ def execute_aggregate(view: Vis, ldf: LuxDataFrame, isFiltered=True): ldf : lux.core.frame LuxDataFrame with specified intent. isFiltered: boolean - boolean that represents whether a vis has had a filter applied + boolean that represents whether a vis has had a filter applied to its data Returns ------- None @@ -146,6 +160,7 @@ def execute_aggregate(view: Vis, ldf: LuxDataFrame, isFiltered=True): "SELECT COUNT(*) as length FROM {} {}".format(ldf.table_name, where_clause), lux.config.SQLconnection, ) + # generates query for colored barchart case if has_color: count_query = "SELECT {}, {}, COUNT({}) FROM {} {} GROUP BY {}, {}".format( groupby_attr.attribute, @@ -159,6 +174,7 @@ def execute_aggregate(view: Vis, ldf: LuxDataFrame, isFiltered=True): view._vis_data = pandas.read_sql(count_query, lux.config.SQLconnection) view._vis_data = view._vis_data.rename(columns={"count": "Record"}) view._vis_data = utils.pandas_to_lux(view._vis_data) + # generates query for normal barchart case else: count_query = "SELECT {}, COUNT({}) FROM {} {} GROUP BY {}".format( groupby_attr.attribute, @@ -171,7 +187,7 @@ def execute_aggregate(view: Vis, ldf: LuxDataFrame, 
isFiltered=True): view._vis_data = view._vis_data.rename(columns={"count": "Record"}) view._vis_data = utils.pandas_to_lux(view._vis_data) view._vis_data.length = list(length_query["length"])[0] - # aggregate barchart case, need aggregate data for each group + # aggregate barchart case, need aggregate data (mean, sum, max) for each group else: where_clause, filterVars = SQLExecutor.execute_filter(view) @@ -179,9 +195,10 @@ def execute_aggregate(view: Vis, ldf: LuxDataFrame, isFiltered=True): "SELECT COUNT(*) as length FROM {} {}".format(ldf.table_name, where_clause), lux.config.SQLconnection, ) + # generates query for colored barchart case if has_color: if agg_func == "mean": - mean_query = "SELECT {}, {}, AVG({}) as {} FROM {} {} GROUP BY {}, {}".format( + agg_query = "SELECT {}, {}, AVG({}) as {} FROM {} {} GROUP BY {}, {}".format( groupby_attr.attribute, color_attr.attribute, measure_attr.attribute, @@ -191,11 +208,11 @@ def execute_aggregate(view: Vis, ldf: LuxDataFrame, isFiltered=True): groupby_attr.attribute, color_attr.attribute, ) - view._vis_data = pandas.read_sql(mean_query, lux.config.SQLconnection) + view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) view._vis_data = utils.pandas_to_lux(view._vis_data) if agg_func == "sum": - mean_query = "SELECT {}, {}, SUM({}) as {} FROM {} {} GROUP BY {}, {}".format( + agg_query = "SELECT {}, {}, SUM({}) as {} FROM {} {} GROUP BY {}, {}".format( groupby_attr.attribute, color_attr.attribute, measure_attr.attribute, @@ -205,10 +222,10 @@ def execute_aggregate(view: Vis, ldf: LuxDataFrame, isFiltered=True): groupby_attr.attribute, color_attr.attribute, ) - view._vis_data = pandas.read_sql(mean_query, lux.config.SQLconnection) + view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) view._vis_data = utils.pandas_to_lux(view._vis_data) if agg_func == "max": - mean_query = "SELECT {}, {}, MAX({}) as {} FROM {} {} GROUP BY {}, {}".format( + agg_query = "SELECT {}, {}, MAX({}) as {} FROM {} 
{} GROUP BY {}, {}".format( groupby_attr.attribute, color_attr.attribute, measure_attr.attribute, @@ -218,11 +235,12 @@ def execute_aggregate(view: Vis, ldf: LuxDataFrame, isFiltered=True): groupby_attr.attribute, color_attr.attribute, ) - view._vis_data = pandas.read_sql(mean_query, lux.config.SQLconnection) + view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) view._vis_data = utils.pandas_to_lux(view._vis_data) + # generates query for normal barchart case else: if agg_func == "mean": - mean_query = "SELECT {}, AVG({}) as {} FROM {} {} GROUP BY {}".format( + agg_query = "SELECT {}, AVG({}) as {} FROM {} {} GROUP BY {}".format( groupby_attr.attribute, measure_attr.attribute, measure_attr.attribute, @@ -230,10 +248,10 @@ def execute_aggregate(view: Vis, ldf: LuxDataFrame, isFiltered=True): where_clause, groupby_attr.attribute, ) - view._vis_data = pandas.read_sql(mean_query, lux.config.SQLconnection) + view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) view._vis_data = utils.pandas_to_lux(view._vis_data) if agg_func == "sum": - mean_query = "SELECT {}, SUM({}) as {} FROM {} {} GROUP BY {}".format( + agg_query = "SELECT {}, SUM({}) as {} FROM {} {} GROUP BY {}".format( groupby_attr.attribute, measure_attr.attribute, measure_attr.attribute, @@ -241,10 +259,10 @@ def execute_aggregate(view: Vis, ldf: LuxDataFrame, isFiltered=True): where_clause, groupby_attr.attribute, ) - view._vis_data = pandas.read_sql(mean_query, lux.config.SQLconnection) + view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) view._vis_data = utils.pandas_to_lux(view._vis_data) if agg_func == "max": - mean_query = "SELECT {}, MAX({}) as {} FROM {} {} GROUP BY {}".format( + agg_query = "SELECT {}, MAX({}) as {} FROM {} {} GROUP BY {}".format( groupby_attr.attribute, measure_attr.attribute, measure_attr.attribute, @@ -252,7 +270,7 @@ def execute_aggregate(view: Vis, ldf: LuxDataFrame, isFiltered=True): where_clause, groupby_attr.attribute, ) - 
view._vis_data = pandas.read_sql(mean_query, lux.config.SQLconnection) + view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) view._vis_data = utils.pandas_to_lux(view._vis_data) result_vals = list(view._vis_data[groupby_attr.attribute]) # create existing group by attribute combinations if color is specified @@ -389,6 +407,18 @@ def execute_binning(view: Vis, ldf: LuxDataFrame): @staticmethod def execute_2D_binning(view: Vis, ldf: LuxDataFrame): + """ + Binning of data points for generating 2D heatmaps + Parameters + ---------- + view: lux.Vis + lux.Vis object that represents a visualization + ldf : lux.core.frame + LuxDataFrame with specified intent. + Returns + ------- + None + """ import numpy as np x_attribute = list(filter(lambda x: x.channel == "x", view._inferred_intent))[0] @@ -407,11 +437,6 @@ def execute_2D_binning(view: Vis, ldf: LuxDataFrame): # get filters if available where_clause, filterVars = SQLExecutor.execute_filter(view) - # length_query = pandas.read_sql( - # "SELECT COUNT(*) as length FROM {} {}".format(ldf.table_name, where_clause), - # lux.config.SQLconnection, - # ) - # need to calculate the bin edges before querying for the relevant data x_bin_width = (x_attr_max - x_attr_min) / num_bins y_bin_width = (y_attr_max - y_attr_min) / num_bins @@ -476,10 +501,10 @@ def execute_2D_binning(view: Vis, ldf: LuxDataFrame): view._vis_data = utils.pandas_to_lux(output) @staticmethod - # takes in a view and returns an appropriate SQL WHERE clause that based on the filters specified in the view's _inferred_intent def execute_filter(view: Vis): """ - Helper function for converting a filter specification to a SQL where clause + Helper function to convert a Vis' filter specification to a SQL where clause. + Takes in a Vis object and returns an appropriate SQL WHERE clause based on the filters specified in the vis' _inferred_intent. 
Parameters ---------- @@ -488,8 +513,10 @@ def execute_filter(view: Vis): Returns ------- - list: list of lists - List containing the list of components to be used in the SQL where clause, and the list of variables used in this clause + where_clause: string + String representation of a SQL WHERE clause + filter_vars: list of strings + list of variables that have been used as filters """ where_clause = [] filters = utils.get_filter_specs(view._inferred_intent) @@ -516,10 +543,23 @@ def execute_filter(view: Vis): return (where_clause, filter_vars) ####################################################### - ########## Metadata, type, model schema ########### + ########## Metadata, type, model schema ############### ####################################################### def compute_dataset_metadata(self, ldf: LuxDataFrame): + """ + Function which computes the metadata required for the Lux recommendation system. + Populates the metadata parameters of the specified Lux DataFrame. + + Parameters + ---------- + ldf: lux.LuxDataFrame + lux.LuxDataFrame object whose metadata will be calculated + + Returns + ------- + None + """ self.get_SQL_attributes(ldf) for attr in list(ldf.columns): ldf[attr] = None @@ -534,6 +574,19 @@ def compute_dataset_metadata(self, ldf: LuxDataFrame): self.compute_data_model(ldf) def get_SQL_attributes(self, ldf: LuxDataFrame): + """ + Retrieves the names of variables within a specified Lux DataFrame's Postgres SQL table. + Uses these variables to populate the Lux DataFrame's columns list. + + Parameters + ---------- + ldf: lux.LuxDataFrame + lux.LuxDataFrame object whose columns will be populated + + Returns + ------- + None + """ if "." 
in ldf.table_name: table_name = ldf.table_name[self.table_name.index(".") + 1 :] else: @@ -546,6 +599,19 @@ def get_SQL_attributes(self, ldf: LuxDataFrame): ldf[attr] = None def compute_stats(self, ldf: LuxDataFrame): + """ + Function which computes the min and max values for each variable within the specified Lux DataFrame's SQL table. + Populates the metadata parameters of the specified Lux DataFrame. + + Parameters + ---------- + ldf: lux.LuxDataFrame + lux.LuxDataFrame object whose metadata will be calculated + + Returns + ------- + None + """ # precompute statistics ldf.unique_values = {} ldf._min_max = {} @@ -571,6 +637,19 @@ def compute_stats(self, ldf: LuxDataFrame): ) def get_cardinality(self, ldf: LuxDataFrame): + """ + Function which computes the cardinality for each variable within the specified Lux DataFrame's SQL table. + Populates the metadata parameters of the specified Lux DataFrame. + + Parameters + ---------- + ldf: lux.LuxDataFrame + lux.LuxDataFrame object whose metadata will be calculated + + Returns + ------- + None + """ cardinality = {} for attr in list(ldf.columns): card_query = pandas.read_sql( @@ -582,6 +661,19 @@ def get_cardinality(self, ldf: LuxDataFrame): ldf.cardinality = cardinality def get_unique_values(self, ldf: LuxDataFrame): + """ + Function which collects the unique values for each variable within the specified Lux DataFrame's SQL table. + Populates the metadata parameters of the specified Lux DataFrame. + + Parameters + ---------- + ldf: lux.LuxDataFrame + lux.LuxDataFrame object whose metadata will be calculated + + Returns + ------- + None + """ unique_vals = {} for attr in list(ldf.columns): unique_query = pandas.read_sql( @@ -593,6 +685,19 @@ def get_unique_values(self, ldf: LuxDataFrame): ldf.unique_values = unique_vals def compute_data_type(self, ldf: LuxDataFrame): + """ + Function which determines the equivalent Pandas data type of each variable within the specified Lux DataFrame's SQL table. 
+ Populates the metadata parameters of the specified Lux DataFrame. + + Parameters + ---------- + ldf: lux.LuxDataFrame + lux.LuxDataFrame object whose metadata will be calculated + + Returns + ------- + None + """ data_type_lookup = {} sql_dtypes = {} self.get_cardinality(ldf) @@ -654,6 +759,20 @@ def compute_data_type(self, ldf: LuxDataFrame): ldf.data_type = data_type def compute_data_model(self, ldf: LuxDataFrame): + """ + Function which computes the data models for each variable within the specified Lux DataFrame's SQL table. + Also creates a reverse look up table for variables' data models. + Populates the metadata parameters of the specified Lux DataFrame. + + Parameters + ---------- + ldf: lux.LuxDataFrame + lux.LuxDataFrame object whose metadata will be calculated + + Returns + ------- + None + """ ldf.data_model = { "measure": ldf.data_type["quantitative"], "dimension": ldf.data_type["ordinal"] + ldf.data_type["nominal"] + ldf.data_type["temporal"], diff --git a/lux/processor/Validator.py b/lux/processor/Validator.py index 5b18ed45..dbc1912f 100644 --- a/lux/processor/Validator.py +++ b/lux/processor/Validator.py @@ -93,7 +93,10 @@ def validate_clause(clause): else: vals = [clause.value] for val in vals: - if val not in series.values: + if ( + lux.config.executor.name == "PandasExecutor" + and val not in series.values + ): warn_msg = f"\n- The input value '{val}' does not exist for the attribute '{clause.attribute}' for the DataFrame." return warn_msg