In [1]:
cd ..

/Users/dorislee/Desktop/Research/lux/lux


In [2]:
import lux

### Load in a dataset of 392 different cars from 1970-1982: 

In [3]:
# Load the cars CSV into a Lux dataset; the schema override declares `Year` with dataType "date".
dataset = lux.Dataset(
    "lux/data/car.csv",
    schema=[{"Year": {"dataType": "date"}}],
)

In [4]:
# Preview the first rows of the frame held by the dataset.
dataset.df.head()

Unnamed: 0,Name,MilesPerGal,Cylinders,Displacement,Horsepower,Weight,Acceleration,Year,Origin
0,chevrolet chevelle malibu,18.0,8,307.0,130,3504,12.0,1970,USA
1,buick skylark 320,15.0,8,350.0,165,3693,11.5,1970,USA
2,plymouth satellite,18.0,8,318.0,150,3436,11.0,1970,USA
3,amc rebel sst,16.0,8,304.0,150,3433,12.0,1970,USA
4,ford torino,17.0,8,302.0,140,3449,10.5,1970,USA


In [5]:
# Wrap the whole dataset with no column specification.
dobj = lux.DataObj(dataset)

# Ask Lux for recommendations; the rendered widget below lists them (e.g. 'Distribution').
result = dobj.showMore()

# Render the result as an interactive widget.
result.display()

LuxWidget(recommendations=[{'action': 'Distribution', 'description': 'Show univariate count distributions of d…

### We would expect cars with higher acceleration to have higher horsepower, but we are actually seeing the opposite trend.

### Let's learn more about whether there are additional factors that are affecting this relationship.

In [6]:
# Focus on Acceleration and Horsepower, both explicitly declared as measures,
# then ask Lux for further recommendations around that view.
dobj = lux.DataObj(
    dataset,
    [
        lux.Column("Acceleration", dataModel="measure"),
        lux.Column("Horsepower", dataModel="measure"),
    ],
)
result = dobj.showMore()
result.display()

LuxWidget(current_view={'config': {'view': {'continuousWidth': 400, 'continuousHeight': 300}, 'mark': {'toolti…

### In Enhance, every added variable (shown as color) except `MilesPerGal` follows the same trend: values are highest at the upper-left end and decrease towards the bottom-right. This makes sense because cars with higher horsepower and acceleration tend to be heavyweight cars, so they use more gas and are less fuel-efficient.

### Now, given these three other variables, let's look at what `Displacement` and `Weight` are like for cars with different numbers of `Cylinders`.

In [7]:
# Look at Weight and Displacement (passed together as one column spec) against Cylinders.
dobj = lux.DataObj(
    dataset,
    [
        lux.Column(["Weight", "Displacement"]),
        lux.Column("Cylinders"),
    ],
)
dobj.display()

LuxWidget(current_view={'config': {'view': {'continuousWidth': 400, 'continuousHeight': 300}}, 'data': {'name'…

### The Count distribution shows that there are not many cars with 3 or 5 cylinders, so let's clean up the data to remove them.

In [8]:
import pandas as pd
# Inspect the rows whose Cylinders value is 3.
dataset.df.loc[dataset.df["Cylinders"] == 3]

Unnamed: 0,Name,MilesPerGal,Cylinders,Displacement,Horsepower,Weight,Acceleration,Year,Origin
70,mazda rx2 coupe,19.0,3,70.0,97,2330,13.5,1972,Japan
110,maxda rx3,18.0,3,70.0,90,2124,13.5,1973,Japan
241,mazda rx-4,21.5,3,80.0,110,2720,13.5,1977,Japan
331,mazda rx-7 gs,23.7,3,70.0,100,2420,12.5,1980,Japan


In [9]:
# Inspect the rows whose Cylinders value is 5.
dataset.df.loc[dataset.df["Cylinders"] == 5]

Unnamed: 0,Name,MilesPerGal,Cylinders,Displacement,Horsepower,Weight,Acceleration,Year,Origin
272,audi 5000,20.3,5,131.0,103,2830,15.9,1978,Europe
295,mercedes benz 300d,25.4,5,183.0,77,3530,20.1,1979,Europe
325,audi 5000s (diesel),36.4,5,121.0,67,2950,19.9,1980,Europe


In [10]:
# Keep only the rows whose Cylinders value is neither 3 nor 5,
# then hand the filtered frame back to the dataset.
newdf = dataset.df[~dataset.df["Cylinders"].isin([3, 5])]
dataset.set_df(newdf)

### After cleaning up the data, we are able to validate what we saw earlier: `Weight` and `Displacement` increase as the number of `Cylinders` increases.

In [11]:
# Same Weight/Displacement-by-Cylinders specification as before, now on the filtered data.
dobj = lux.DataObj(
    dataset,
    [
        lux.Column(["Weight", "Displacement"]),
        lux.Column("Cylinders"),
    ],
)
dobj.display()

LuxWidget(current_view={'config': {'view': {'continuousWidth': 400, 'continuousHeight': 300}}, 'data': {'name'…

### Now we want to understand how these measure variables depend on the dimension variables in our dataset

In [12]:
# Pair Weight and Displacement with "?" restricted to dataModel="dimension"
# (presumably a wildcard over every dimension column -- confirm against the Lux API).
dobj = lux.DataObj(
    dataset,
    [
        lux.Column(["Weight", "Displacement"]),
        lux.Column("?", dataModel="dimension"),
    ],
)
dobj.display()

LuxWidget(recommendations=[{'action': 'Vis Collection', 'vspec': [{'config': {'view': {'continuousWidth': 400,…

### The `Name` column has too much info, so let's drop that column and add in `Brand` information loaded from an external table. 

In [13]:
# Read the external car-brand lookup table (path is relative to the working directory).
brand_df = pd.read_csv("lux/data/car_brand.csv")

In [14]:
# Preview the first rows of the brand lookup table.
brand_df.head()

Unnamed: 0,Name,Brand
0,chevrolet chevelle malibu,chevrolet
1,buick skylark 320,buick
2,plymouth satellite,plymouth
3,amc rebel sst,amc
4,ford torino,ford


In [15]:
# Preview the car frame again, for comparison with the brand table before merging.
dataset.df.head()

Unnamed: 0,Name,MilesPerGal,Cylinders,Displacement,Horsepower,Weight,Acceleration,Year,Origin
0,chevrolet chevelle malibu,18.0,8,307.0,130,3504,12.0,1970,USA
1,buick skylark 320,15.0,8,350.0,165,3693,11.5,1970,USA
2,plymouth satellite,18.0,8,318.0,150,3436,11.0,1970,USA
3,amc rebel sst,16.0,8,304.0,150,3433,12.0,1970,USA
4,ford torino,17.0,8,302.0,140,3449,10.5,1970,USA


In [16]:
# Attach `Brand` to every car by looking it up on `Name`, then drop `Name`.
# - The join key is spelled out instead of being inferred from whichever column names the two
#   frames happen to share.
# - Exact duplicate (Name, Brand) lookup rows are removed first: car names are not guaranteed
#   to be unique, and a repeated name in the lookup would otherwise multiply the matching cars.
# - validate="many_to_one" makes pandas raise a MergeError if a name still maps to more than
#   one lookup row, rather than silently inflating the row count.
brand_lookup = brand_df[["Name", "Brand"]].drop_duplicates()
df = dataset.df.merge(brand_lookup, on="Name", how="inner", validate="many_to_one")
df = df.drop(columns=["Name"])
dataset.set_df(df)

### Now that we have added `Brand`, let's take a look at basic information about it.

In [17]:
# Specify only the newly added Brand column and render the resulting view.
dobj = lux.DataObj(dataset,[lux.Column("Brand")])
dobj.display()

LuxWidget(current_view={'config': {'view': {'continuousWidth': 400, 'continuousHeight': 300}}, 'data': {'name'…

### Now going back to what we had earlier:

In [18]:
# Repeat the Weight/Displacement-versus-dimension specification now that the dataset has Brand.
# ("?" with dataModel="dimension" is presumably a wildcard over dimension columns -- confirm.)
dobj = lux.DataObj(
    dataset,
    [
        lux.Column(["Weight", "Displacement"]),
        lux.Column("?", dataModel="dimension"),
    ],
)
dobj.display()

LuxWidget(recommendations=[{'action': 'Vis Collection', 'vspec': [{'config': {'view': {'continuousWidth': 400,…

### The `Year` information shows that `Weight` and `Displacement` have been on the decline over the years.

### What car `Brand`s are contributing to the decline of `Displacement`?

In [19]:
# Displacement against Year, with a Row spec on Brand using "?"
# (presumably one view per brand value -- confirm against the Lux API).
dobj = lux.DataObj(
    dataset,
    [
        lux.Column("Year"),
        lux.Column("Displacement"),
        lux.Row("Brand", "?"),
    ],
)
dobj.display()

LuxWidget(current_view={'config': {'view': {'continuousWidth': 400, 'continuousHeight': 300}}, 'data': {'name'…

### Pontiac has a really straight decline in displacement; let's search for other patterns that look like that.

In [20]:
# Reference view: Displacement (y) over Year (x), restricted to Brand == "pontiac".
query = lux.DataObj(
    dataset,
    [
        lux.Column("Year", channel="x"),
        lux.Column("Displacement", channel="y"),
        lux.Row("Brand", "pontiac"),
    ],
)

# Candidate views: the same axes with Brand left as "?".
dobj = lux.DataObj(
    dataset,
    [
        lux.Column("Year", channel="x"),
        lux.Column("Displacement", channel="y"),
        lux.Row("Brand", "?"),
    ],
)

# Search the candidates for patterns similar to the query (topK=5),
# then display the result with the query as the current view.
result = dobj.similarPattern(query, topK=5)
result.display(currentView=query)

LuxWidget(current_view={'config': {'view': {'continuousWidth': 400, 'continuousHeight': 300}}, 'data': {'name'…