# The power of expressions

# What happens on the "server" side...



In [1]:
import lightgbm
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score

import vaex 
import vaex.ml

## Read in the relevant files

In [2]:
# ### Read and convert to HDF5 all CSV files in the relevant directory
# (one-off conversion step, kept for reference)
# NOTE(review): this example writes 'airline-data.hdf5', while the cell below opens
# './airline-data/my2008.hdf5' -- confirm which file is the intended input.
# ds = vaex.open(path='./airline-data/20*.csv', convert='airline-data.hdf5')

# Open the 2008 airline data from the HDF5 file.
ds = vaex.open('./airline-data/my2008.hdf5')
# NOTE(review): machine-specific alternative dataset path; remove or move to a config cell.
# ds = vaex.open('/Users/jovan/Desktop/airlines-test.hdf5')

# Work on a 20% active fraction of the data. The order matters: the fraction is set
# in place first, then trim() is called and its result rebinds `ds`.
# NOTE(review): presumably trim() returns a DataFrame cut down to the active range -- confirm.
ds.set_active_fraction(value=0.2)
ds = ds.trim()


### Train / test split (identical to the split in the previous notebook)

In [7]:
# Hold out a quarter of the rows (test_size=0.25) as the test set.
# NOTE(review): no shuffle or seed is passed here; the heading says this split matches
# the previous notebook, which assumes the split is deterministic -- confirm.
ds_train, ds_test = ds.ml.train_test_split(test_size=0.25)



In [8]:
### Take a look inside: display the test DataFrame as it is before any saved state is applied
ds_test

#,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,UniqueCarrier,FlightNum,DepDelay,Origin,Dest,Distance,Cancelled,LateAircraftDelay,random_index
0,2008.0,7.0,21.0,1.0,1325.0,b'AS',852.0,7.0,b'HNL',b'SEA',2677.0,0.0,,3837252.0
1,2008.0,12.0,6.0,6.0,1733.0,b'9E',2918.0,-5.0,b'DTW',b'ITH',353.0,0.0,0.0,6177194.0
2,2008.0,2.0,9.0,6.0,1345.0,b'XE',2790.0,50.0,b'RDU',b'EWR',416.0,0.0,48.0,1489537.0
3,2008.0,11.0,30.0,7.0,1802.0,b'US',1511.0,-7.0,b'PHX',b'LAX',370.0,0.0,,6487323.0
4,2008.0,6.0,25.0,3.0,955.0,b'US',320.0,22.0,b'LAS',b'ORD',1515.0,0.0,0.0,762415.0
350481,2008.0,1.0,8.0,2.0,1530.0,b'XE',2370.0,-2.0,b'CLE',b'MHT',544.0,0.0,,3368336.0
350482,2008.0,9.0,21.0,7.0,1000.0,b'WN',2190.0,-2.0,b'LAX',b'SJC',308.0,0.0,,4427900.0
350483,2008.0,5.0,30.0,5.0,2010.0,b'CO',501.0,11.0,b'EWR',b'FLL',1065.0,0.0,0.0,5655604.0
350484,2008.0,4.0,28.0,1.0,2300.0,b'US',1738.0,,b'PHL',b'BOS',280.0,1.0,,720409.0
350485,2008.0,10.0,21.0,2.0,920.0,b'MQ',3766.0,-3.0,b'ABI',b'DFW',158.0,0.0,,5406704.0


### Load the state _into_ the test set: _everything_ done on the train set is applied seamlessly to the test set

In [9]:
# Load the saved state from a JSON file into ds_test (in place; the return value is not used).
# NOTE(review): absolute, machine-specific path -- the notebook will not run on another
# machine as-is; consider a path relative to the project/data directory.
ds_test.state_load('/Users/jovan/Desktop/deployable_model.json')

In [10]:
### Take a look inside again, now that the saved state has been loaded into ds_test
# NOTE(review): in the recorded output the last row index drops from 350485 to 343625
# and encoded / PCA / scaled / label / prediction columns appear -- presumably the
# state carries a row filter in addition to the derived columns; confirm.
ds_test

#,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,UniqueCarrier,FlightNum,DepDelay,Origin,Dest,Distance,Cancelled,random_index,LateAircraftDelay,label_encoded_UniqueCarrier,label_encoded_Origin,label_encoded_Dest,PCA_0,PCA_1,PCA_2,PCA_3,PCA_4,standard_scaled_PCA_0,standard_scaled_PCA_1,standard_scaled_PCA_2,standard_scaled_PCA_3,standard_scaled_PCA_4,label,lightgbm_prediction
0,2008.0,7.0,21.0,1.0,1325.0,b'AS',852.0,7.0,b'HNL',b'SEA',2677.0,0.0,3837252.0,0.0,3,129,257,-1562.3013429814723,1785.740065860375,-124.58078446717738,46.36674054093148,84.64342090638226,-0.7936537047589853,3.3928475383421186,-0.2687726405135926,0.5715244511817371,1.063459647273336,0,0.1266875884686422
1,2008.0,12.0,6.0,6.0,1733.0,b'9E',2918.0,-5.0,b'DTW',b'ITH',353.0,0.0,6177194.0,0.0,0,87,145,741.1854823227839,-327.57556915042363,-387.77239939200547,-36.15482562292628,41.82610772749638,0.3765244180334988,-0.62238283424388,-0.8365865743150755,-0.4456506243623745,0.5255030727065751,0,0.2205403698233799
2,2008.0,2.0,9.0,6.0,1345.0,b'XE',2790.0,50.0,b'RDU',b'EWR',416.0,0.0,1489537.0,48.0,18,236,97,608.0759184748521,-250.1155574162359,-3.1094668151635214,18.018256126845262,-101.83255339638076,0.3089043657552639,-0.4752113532670742,-0.0067084150262442,0.2220961366705621,-1.2794238483772415,1,0.2432930103394599
3,2008.0,11.0,30.0,7.0,1802.0,b'US',1511.0,-7.0,b'PHX',b'LAX',370.0,0.0,6487323.0,0.0,16,220,158,-659.7818561024641,-465.258973511156,-442.53229964890295,56.31364783974468,-54.9606026704416,-0.335171135057195,-0.8839767853942142,-0.9547264868967384,0.6941317483216197,-0.690524821704491,0,0.3087838606636262
4,2008.0,6.0,25.0,3.0,955.0,b'US',320.0,22.0,b'LAS',b'ORD',1515.0,0.0,762415.0,0.0,16,155,211,-1964.9511300656247,602.7600940994254,332.7643831407328,42.24278814959143,37.22807406300146,-0.9982009879546216,1.1452244033572585,0.717911372192196,0.5206919018227933,0.4677333936620223,1,0.2180929414213601
343621,2008.0,8.0,29.0,5.0,1940.0,b'NW',347.0,-4.0,b'DTW',b'SFO',2079.0,0.0,5906700.0,0.0,12,87,258,-2001.5130971364724,1094.1525023840456,-690.5693689250226,29.183643169068887,112.55112104180738,-1.0167745754058604,2.078853857431924,-1.4898457538022989,0.3597226255522914,1.4140918951719013,0,0.2989093074601004
343622,2008.0,1.0,8.0,2.0,1530.0,b'XE',2370.0,-2.0,b'CLE',b'MHT',544.0,0.0,3368336.0,0.0,18,61,186,176.33113489720688,-182.3617412571648,-193.53971753555453,-22.45792364967265,89.52055512813443,0.0895767382548964,-0.3464813254408615,-0.4175457808261819,-0.2768202452624387,1.124735944753158,0,0.2356001230859758
343623,2008.0,9.0,21.0,7.0,1000.0,b'WN',2190.0,-2.0,b'LAX',b'SJC',308.0,0.0,4427900.0,0.0,17,157,263,24.04971474829048,-396.07844221053654,353.7599978578769,100.68918281886612,65.85058572425268,0.0122173262502301,-0.752536045606916,0.7632076578984369,1.241112255878532,0.8273465310967362,0,0.1457983327098334
343624,2008.0,5.0,30.0,5.0,2010.0,b'CO',501.0,11.0,b'EWR',b'FLL',1065.0,0.0,5655604.0,0.0,5,96,105,-1739.1668376676623,98.90052830071892,-689.1606718094546,-71.69460443079095,8.81799436225712,-0.8835018993677444,0.187907759029956,-1.4868066073957769,-0.8837200754671886,0.1107892506437807,0,0.4377101354625936
343625,2008.0,10.0,21.0,2.0,920.0,b'MQ',3766.0,-3.0,b'ABI',b'DFW',158.0,0.0,5406704.0,0.0,11,1,82,1607.604066158838,-373.1617116579994,428.0092912404298,-135.94874110404936,76.47581122330607,0.8166676221744124,-0.7089950094121751,0.9233943088659052,-1.6757276604282727,0.960841827487468,0,0.0968772552530551


### Check the performance in the standard manner

In [11]:
# Materialise the ground-truth labels once and reuse them for both metrics,
# instead of evaluating the 'label' expression separately for each score.
y_true = ds_test.evaluate('label')

# Predicted scores from the 'lightgbm_prediction' column of ds_test.
test_pred = ds_test.lightgbm_prediction.values

# Hard class predictions: round each score to the nearest integer (0 or 1
# for scores in [0, 1]). Accuracy needs class labels, whereas ROC-AUC is
# computed from the raw scores.
y_pred = np.round(test_pred).astype(np.int8)

print('Performance of the classifier on the test set:')
print('Accuracy:', accuracy_score(y_true, y_pred))
print('ROC-AUC:', roc_auc_score(y_true, test_pred))

Performance of the classifier on the test set:
Accuracy: 0.815566924505
ROC-AUC: 0.690878473434
