Skip to content

Commit

Permalink
Adding logic for removing invalid views in compiler, and updated comp…
Browse files Browse the repository at this point in the history
…iler tests. (#10)

* Adding code in Compiler.py that adds a step for removing invalid views during view collection generation (e.g., year by year). Adding old test cases back to test_compiler.py and adding showMore calls to the test cases.

* Adding/fixing comments about invalid view removal.

* made test_showMore reusable across test modules

Co-authored-by: Doris Lee <dorisjunglinlee@gmail.com>
  • Loading branch information
jaywoo123 and dorisjlee committed Jun 4, 2020
1 parent a5b295b commit 578d446
Show file tree
Hide file tree
Showing 2 changed files with 211 additions and 119 deletions.
35 changes: 35 additions & 0 deletions lux/compiler/Compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ def compile(ldf: LuxDataFrame, viewCollection: ViewCollection, enumerateCollecti
# Output : DataObj/DataObjectCollection
# compiledCollection = []
viewCollection = Compiler.expandUnderspecified(ldf, viewCollection) # autofill data type/model information

viewCollection = Compiler.removeAllInvalid(viewCollection)

for view in viewCollection:
Compiler.determineEncoding(ldf, view) # autofill viz related information
return viewCollection
Expand Down Expand Up @@ -111,6 +114,8 @@ def expandUnderspecified(ldf, viewCollection):
ldf : lux.luxDataFrame.LuxDataFrame
LuxDataFrame with underspecified context
viewCollection : list[lux.view.View]
List of lux.View objects that will have their underspecified Spec details filled out.
Returns
-------
views: list[lux.View]
Expand All @@ -136,6 +141,36 @@ def expandUnderspecified(ldf, viewCollection):
chartTitle = spec.value
view.title = f"{spec.attribute} {spec.filterOp} {chartTitle}"
return views

@staticmethod
def removeAllInvalid(viewCollection):
    """
    Given an expanded view collection, remove all views that are invalid.
    A view is invalid when it contains more than one temporal attribute
    (e.g., Year by Year) or when the same attribute appears in more than one
    of its specs (overlapping attributes).

    Parameters
    ----------
    viewCollection : list[lux.view.View]
        expanded lux.View objects; each view's `specLst` entries are read for
        their `attribute` and `dataType`.

    Returns
    -------
    views: ViewCollection
        new ViewCollection holding only the valid views, in their original order.
    """
    validViews = []

    for view in viewCollection:
        attributes = [spec.attribute for spec in view.specLst]
        numTemporalSpecs = sum(1 for spec in view.specLst if spec.dataType == "temporal")
        allDistinctSpecs = len(set(attributes)) == len(attributes)
        # Both conditions must hold for a view to be kept. Joining them with
        # `or` would keep views that violate only one of the two rules
        # (e.g., a non-temporal attribute plotted against itself).
        if numTemporalSpecs <= 1 and allDistinctSpecs:
            validViews.append(view)

    return ViewCollection(validViews)

@staticmethod
def determineEncoding(ldf: LuxDataFrame,view: View):
'''
Expand Down
295 changes: 176 additions & 119 deletions tests/test_compiler.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,69 @@
from .context import lux
import pytest
import pandas as pd
def test_underspecifiedSingleVis():

def test_underspecifiedNoVis(test_showMore):
    """Contexts without any attribute spec should compile to an empty view collection."""
    expectedActions = ["Correlation", "Distribution", "Category"]
    df = pd.read_csv("lux/data/car.csv")

    # Case 1: no context has been set at all.
    test_showMore(df, expectedActions)
    assert len(df.viewCollection) == 0

    # Case 2: the context consists of a single filter and nothing else.
    originFilter = lux.Spec(attribute = "Origin", filterOp="=", value="USA")
    df.setContext([originFilter])
    test_showMore(df, expectedActions)
    assert len(df.viewCollection) == 0

def test_underspecifiedSingleVis(test_showMore):
    """A context of MilesPerGal and Weight should compile to one scatter view of quantitative measures."""
    expectedActions = ["Enhance", "Filter", "Generalize"]
    df = pd.read_csv("lux/data/car.csv")
    context = [lux.Spec(attribute = "MilesPerGal"), lux.Spec(attribute = "Weight")]
    df.setContext(context)

    assert len(df.viewCollection) == 1
    assert df.viewCollection[0].mark == "scatter"
    # Every spec of the single view must be a quantitative measure.
    assert all(spec.dataModel == "measure" for spec in df.viewCollection[0].specLst)
    assert all(spec.dataType == "quantitative" for spec in df.viewCollection[0].specLst)

    test_showMore(df, expectedActions)

def test_underspecifiedVisCollection(test_showMore):
    """Check the view collections compiled from contexts with attribute lists and wildcards."""
    multipleViewActions = ["View Collection"]

    df = pd.read_csv("lux/data/car.csv")
    df["Year"] = pd.to_datetime(df["Year"], format='%Y') # change pandas dtype for the column "Year" to datetype

    # Explicit list of three attributes, each paired with Year on the x channel.
    df.setContext([lux.Spec(attribute = ["Horsepower","Weight","Acceleration"]),lux.Spec(attribute = "Year",channel="x")])
    assert len(df.viewCollection)==3
    assert df.viewCollection[0].mark == "line"
    for vc in df.viewCollection:
        assert (vc.getAttrByChannel("x")[0].attribute == "Year")
    test_showMore(df,multipleViewActions)

    # Wildcard attribute paired with Year on the x channel.
    df.setContext([lux.Spec(attribute = "?"),lux.Spec(attribute = "Year",channel="x")])
    assert len(df.viewCollection) == len(list(df.columns))-1 # we remove year by year so its 8 vis instead of 9
    for vc in df.viewCollection:
        assert (vc.getAttrByChannel("x")[0].attribute == "Year")
    test_showMore(df,multipleViewActions)

    # Wildcard restricted to quantitative attributes, paired with Year.
    df.setContext([lux.Spec(attribute = "?",dataType="quantitative"),lux.Spec(attribute = "Year")])
    # NOTE(review): the right-hand list has one entry per view, so both sides
    # always have the same length and the expected count (5) is not verified.
    assert len(df.viewCollection) == len([view.getAttrByDataType("quantitative") for view in df.viewCollection]) # should be 5
    test_showMore(df,multipleViewActions)

    # Wildcard restricted to measures, with MilesPerGal pinned to the y channel.
    df.setContext([lux.Spec(attribute = "?", dataModel="measure"),lux.Spec(attribute="MilesPerGal",channel="y")])
    for vc in df.viewCollection:
        # Asserted rather than printed so that a wrong channel fails the test.
        assert (vc.getAttrByChannel("y")[0].attribute == "MilesPerGal")
    test_showMore(df,multipleViewActions)

    # Two measure wildcards.
    df.setContext([lux.Spec(attribute = "?", dataModel="measure"),lux.Spec(attribute = "?", dataModel="measure")])
    # NOTE(review): same tautology as above; the original author expected 25
    # views here, but this comparison does not pin that number.
    assert len(df.viewCollection) == len([view.getAttrByDataModel("measure") for view in df.viewCollection])
    test_showMore(df,multipleViewActions)

@pytest.fixture
def test_showMore():
    """Fixture returning a helper that runs showMore() on a dataframe and checks its recommendations."""
    def checkShowMore(df, actions):
        # Generate recommendations, then require at least one, and require
        # every recommendation's action to be among the expected `actions`.
        df.showMore()
        assert (len(df._recInfo) > 0)
        assert all(rec["action"] in actions for rec in df._recInfo)
    return checkShowMore

def test_parse():
df = pd.read_csv("lux/data/car.csv")
df.setContext([lux.Spec("Origin=?"),lux.Spec(attribute = "MilesPerGal")])
Expand All @@ -44,6 +78,11 @@ def test_underspecifiedVisCollection_Zval():
df.setContext([lux.Spec(attribute = "Origin", filterOp="=",value="?"),lux.Spec(attribute = "MilesPerGal")])
assert len(df.viewCollection)==3

#does not work
# df = pd.read_csv("lux/data/cars.csv")
# df.setContext([lux.Spec(attribute = ["Origin","Cylinders"], filterOp="=",value="?"),lux.Spec(attribute = ["Horsepower"]),lux.Spec(attribute = "Weight")])
# assert len(df.viewCollection) == 8

def test_sortBar():
from lux.compiler.Compiler import Compiler
from lux.view.View import View
Expand All @@ -61,119 +100,137 @@ def test_sortBar():
assert view.mark == "bar"
assert view.specLst[1].sort == 'ascending'

def test_specifiedVisCollection():
    """Check the number and titles of views compiled from an explicit list of Origin filter values."""
    df = pd.read_csv("lux/data/cars.csv")
    df["Year"] = pd.to_datetime(df["Year"], format='%Y') # change pandas dtype for the column "Year" to datetype

    # One attribute pair combined with two Origin values.
    df.setContext(
        [lux.Spec(attribute="Horsepower"),lux.Spec(attribute="Brand"), lux.Spec(attribute = "Origin",value=["Japan","USA"])])
    assert len(df.viewCollection) == 2

    # Two alternative attributes combined with two Origin values.
    df.setContext(
        [lux.Spec(attribute=["Horsepower","Weight"]),lux.Spec(attribute="Brand"), lux.Spec(attribute = "Origin",value=["Japan","USA"])])
    assert len(df.viewCollection) == 4

    # test if z axis has been filtered correctly
    chartTitles = [view.title for view in df.viewCollection.collection]
    # Each title gets its own membership check: `"a" and "b" in titles`
    # only tests "b", because the non-empty literal "a" is always truthy.
    assert "Origin = USA" in chartTitles
    assert "Origin = Japan" in chartTitles
    assert "Origin = Europe" not in chartTitles


def test_specifiedChannelEnforcedVisCollection():
    """MilesPerGal pinned to the x channel must be on x in every view generated from a wildcard."""
    df = pd.read_csv("lux/data/cars.csv")
    df["Year"] = pd.to_datetime(df["Year"], format='%Y') # change pandas dtype for the column "Year" to datetype
    context = [lux.Spec(attribute="?"), lux.Spec(attribute="MilesPerGal",channel="x")]
    df.setContext(context)
    for compiledView in df.viewCollection:
        checkAttributeOnChannel(compiledView, "MilesPerGal", "x")

def test_autoencodingScatter():
    """Check channel assignment for MilesPerGal vs. Weight under none, partial and full channel specification."""
    df = pd.read_csv("lux/data/cars.csv")
    df["Year"] = pd.to_datetime(df["Year"], format='%Y') # change pandas dtype for the column "Year" to datetype

    # Each scenario: (extra Spec kwargs for MilesPerGal, extra Spec kwargs for Weight,
    #                 expected MilesPerGal channel, expected Weight channel)
    scenarios = [
        ({}, {}, "x", "y"),                              # No channel specified
        ({"channel": "y"}, {}, "y", "x"),                # Partial channel specified
        ({"channel": "y"}, {"channel": "x"}, "y", "x"),  # Full channel specified
    ]
    for mpgKwargs, weightKwargs, mpgChannel, weightChannel in scenarios:
        df.setContext([lux.Spec(attribute="MilesPerGal", **mpgKwargs), lux.Spec(attribute="Weight", **weightKwargs)])
        view = df.viewCollection[0]
        checkAttributeOnChannel(view, "MilesPerGal", mpgChannel)
        checkAttributeOnChannel(view, "Weight", weightChannel)

    # Duplicate channel specified
    with pytest.raises(ValueError):
        # Should throw error because there should not be columns with the same channel specified
        df.setContext([lux.Spec(attribute="MilesPerGal", channel="x"), lux.Spec(attribute="Weight", channel="x")])


# def test_autoencodingHistogram():
# dataset = lux.Dataset("lux/data/cars.csv",schema=[{"Year":{"dataType":"date"}}])

# # Partial channel specified
# dobj = lux.DataObj(dataset,[lux.Column("MilesPerGal",channel="y")])
# assert dobj.compiled.getByColumnName("MilesPerGal")[0].channel == "y"

# dobj = lux.DataObj(dataset,[lux.Column("MilesPerGal", channel="x")])
# assert dobj.compiled.getByColumnName("MilesPerGal")[0].channel == "x"
# assert dobj.compiled.getByColumnName("count()")[0].channel == "y"

# def test_autoencodingLineChart():
# dataset = lux.Dataset("lux/data/cars.csv",schema=[{"Year":{"dataType":"date"}}])
# dobj = lux.DataObj(dataset,[lux.Column("Year"),lux.Column("Acceleration")])
# checkAttributeOnChannel(dobj,"Year","x")
# checkAttributeOnChannel(dobj,"Acceleration","y")
# # Partial channel specified
# dobj = lux.DataObj(dataset,[lux.Column("Year", channel="y"),lux.Column("Acceleration")])
# checkAttributeOnChannel(dobj,"Year","y")
# checkAttributeOnChannel(dobj,"Acceleration","x")

# # Full channel specified
# dobj = lux.DataObj(dataset,[lux.Column("Year", channel="y"),lux.Column("Acceleration", channel="x")])
# checkAttributeOnChannel(dobj,"Year","y")
# checkAttributeOnChannel(dobj,"Acceleration","x")
# # Duplicate channel specified
# with pytest.raises(ValueError):
# # Should throw error because there should not be columns with the same channel specified
# dobj = lux.DataObj(dataset,[lux.Column("Year", channel="x"),lux.Column("Acceleration", channel="x")])

# def test_autoencodingColorLineChart():
# dataset = lux.Dataset("lux/data/cars.csv",schema=[{"Year":{"dataType":"date"}}])
# dobj = lux.DataObj(dataset,[lux.Column("Year"),lux.Column("Acceleration"),lux.Column("Origin")])
# checkAttributeOnChannel(dobj,"Year","x")
# checkAttributeOnChannel(dobj,"Acceleration","y")
# checkAttributeOnChannel(dobj,"Origin","color")
# def test_autoencodingColorScatterChart():
# dataset = lux.Dataset("lux/data/cars.csv",schema=[{"Year":{"dataType":"date"}}])
# dobj = lux.DataObj(dataset,[lux.Column("Horsepower"),lux.Column("Acceleration"),lux.Column("Origin")])
# checkAttributeOnChannel(dobj,"Origin","color")
# dobj = lux.DataObj(dataset,[lux.Column("Horsepower"),lux.Column("Acceleration",channel="color"),lux.Column("Origin")])
# checkAttributeOnChannel(dobj,"Acceleration","color")
# def test_populateOptions():
# from lux.compiler.Compiler import Compiler
# dataset = lux.Dataset("lux/data/cars.csv",schema=[{"Year":{"dataType":"date"}}])
# dobj = lux.DataObj(dataset,[lux.Column("?"),lux.Column("MilesPerGal")])
# colLst = list(map(lambda x: x.columnName, Compiler.populateOptions(dobj, dobj.spec[0])))
# assert listEqual(colLst, list(dobj.dataset.df.columns))
# dobj = lux.DataObj(dataset,[lux.Column("?",dataModel="measure"),lux.Column("MilesPerGal")])
# colLst = list(map(lambda x: x.columnName, Compiler.populateOptions(dobj, dobj.spec[0])))
# assert listEqual(colLst,['Acceleration','Weight','Horsepower','MilesPerGal','Displacement'])

# def listEqual(l1,l2):
# l1.sort()
# l2.sort()
# return l1==l2
# def checkAttributeOnChannel(dobj,attrName,channelName):
# assert dobj.compiled.getByColumnName(attrName)[0].channel == channelName
def test_autoencodingHistogram():
    """A single attribute pinned to the y channel must stay on y in the compiled view."""
    df = pd.read_csv("lux/data/cars.csv")
    df["Year"] = pd.to_datetime(df["Year"], format='%Y') # change pandas dtype for the column "Year" to datetype

    pinnedSpec = lux.Spec(attribute="MilesPerGal",channel="y")
    df.setContext([pinnedSpec])
    firstView = df.viewCollection[0]
    checkAttributeOnChannel(firstView, "MilesPerGal", "y")

    # Disabled case kept from the original (noted there as "Record instead of count"):
    # df.setContext([lux.Spec(attribute="MilesPerGal",channel="x")])
    # assert df.viewCollection[0].getAttrByChannel("x")[0].attribute == "MilesPerGal"
    # assert df.viewCollection[0].getAttrByChannel("y")[0].attribute == "count()"

def test_autoencodingLineChart():
    """Check channel assignment for Year vs. Acceleration under none, partial and full channel specification."""
    df = pd.read_csv("lux/data/cars.csv")
    df["Year"] = pd.to_datetime(df["Year"], format='%Y') # change pandas dtype for the column "Year" to datetype

    # Each scenario: (extra Spec kwargs for Year, extra Spec kwargs for Acceleration,
    #                 expected Year channel, expected Acceleration channel)
    scenarios = [
        ({}, {}, "x", "y"),                              # No channel specified
        ({"channel": "y"}, {}, "y", "x"),                # Partial channel specified
        ({"channel": "y"}, {"channel": "x"}, "y", "x"),  # Full channel specified
    ]
    for yearKwargs, accelKwargs, yearChannel, accelChannel in scenarios:
        df.setContext([lux.Spec(attribute="Year", **yearKwargs), lux.Spec(attribute="Acceleration", **accelKwargs)])
        view = df.viewCollection[0]
        checkAttributeOnChannel(view, "Year", yearChannel)
        checkAttributeOnChannel(view, "Acceleration", accelChannel)

    with pytest.raises(ValueError):
        # Should throw error because there should not be columns with the same channel specified
        df.setContext([lux.Spec(attribute="Year", channel="x"), lux.Spec(attribute="Acceleration", channel="x")])

def test_autoencodingColorLineChart():
    """Year, Acceleration and Origin should land on the x, y and color channels respectively."""
    df = pd.read_csv("lux/data/cars.csv")
    df["Year"] = pd.to_datetime(df["Year"], format='%Y') # change pandas dtype for the column "Year" to datetype
    context = [lux.Spec(attribute=name) for name in ("Year", "Acceleration", "Origin")]
    df.setContext(context)

    view = df.viewCollection[0]
    expectedChannels = [("Year", "x"), ("Acceleration", "y"), ("Origin", "color")]
    for attrName, channelName in expectedChannels:
        checkAttributeOnChannel(view, attrName, channelName)

def test_autoencodingColorScatterChart():
    """Check which attribute ends up on the color channel, with and without an explicit color assignment."""
    df = pd.read_csv("lux/data/cars.csv")
    df["Year"] = pd.to_datetime(df["Year"], format='%Y') # change pandas dtype for the column "Year" to datetype

    # No channel given: Origin is expected on color.
    horsepower = lux.Spec(attribute="Horsepower")
    acceleration = lux.Spec(attribute="Acceleration")
    origin = lux.Spec(attribute="Origin")
    df.setContext([horsepower, acceleration, origin])
    checkAttributeOnChannel(df.viewCollection[0], "Origin", "color")

    # Acceleration explicitly pinned to color: it is expected to stay there.
    horsepower = lux.Spec(attribute="Horsepower")
    acceleration = lux.Spec(attribute="Acceleration",channel="color")
    origin = lux.Spec(attribute="Origin")
    df.setContext([horsepower, acceleration, origin])
    checkAttributeOnChannel(df.viewCollection[0], "Acceleration", "color")

def test_populateOptions():
    """Check the attribute options that Compiler.populateWildcardOptions produces for wildcard specs."""
    from lux.compiler.Compiler import Compiler
    df = pd.read_csv("lux/data/cars.csv")

    def collectAttributes():
        # Gather the distinct attribute names over every option list under "attributes".
        found = set()
        for optionLst in Compiler.populateWildcardOptions(df)["attributes"]:
            found.update(option.attribute for option in optionLst)
        return found

    # Unconstrained wildcard: the options are compared against all dataframe columns.
    df.setContext([lux.Spec(attribute="?"), lux.Spec(attribute="MilesPerGal")])
    assert listEqual(list(collectAttributes()), list(df.columns))

    # Wildcard constrained to measures: compared against the five listed columns.
    df.setContext([lux.Spec(attribute="?",dataModel="measure"), lux.Spec(attribute="MilesPerGal")])
    assert listEqual(list(collectAttributes()), ['Acceleration', 'Weight', 'Horsepower', 'MilesPerGal', 'Displacement'])

def listEqual(l1,l2):
    """
    Return True when l1 and l2 contain the same elements, ignoring order.

    Sorted copies are compared, so the caller's sequences are left untouched
    and any iterable of mutually comparable items is accepted.
    """
    return sorted(l1) == sorted(l2)

def checkAttributeOnChannel(view,attrName,channelName):
    """Assert that the first spec `view` reports for `channelName` has attribute `attrName`."""
    specsOnChannel = view.getAttrByChannel(channelName)
    firstSpec = specsOnChannel[0]
    assert firstSpec.attribute == attrName

0 comments on commit 578d446

Please sign in to comment.