# Data science meet-up: Utrecht 21-June-2018

# `Vaex` demo on Machine Learning

# Goal: predict if a flight is likely to be delayed

In [1]:
import lightgbm
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score

import vaex 
import vaex.ml

### Read in the data

- If `Pandas` can read it, `vaex` can read it too!
- Most efficint is to use the **hdf5** file format
- Easy to convert to **hdf5** with `vaex` even if the data is in multiple text (csv) files

In [2]:
### Read and convert to HDF5 all CSV files in the relevant directory
# ds = vaex.open(path='./airline-data/20*.csv', convert='airline-data.hdf5')


# If the file exists, just open the hdf5 version
ds = vaex.open(path='./airline-data/demo-2008.hdf5')


### What is in the dataset

In [3]:
ds.info()

column,type,unit,description,expression
Year,float64,,,
Month,float64,,,
DayofMonth,float64,,,
DayOfWeek,float64,,,
CRSDepTime,float64,,,
UniqueCarrier,bytes80,,,
FlightNum,float64,,,
DepDelay,float64,,,
Origin,bytes24,,,
Dest,bytes24,,,

#,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,UniqueCarrier,FlightNum,DepDelay,Origin,Dest,Distance,Cancelled,LateAircraftDelay,random_index
0,2008.0,7.0,21.0,1.0,1325.0,b'AS',852.0,7.0,b'HNL',b'SEA',2677.0,0.0,,3837252.0
1,2008.0,12.0,6.0,6.0,1733.0,b'9E',2918.0,-5.0,b'DTW',b'ITH',353.0,0.0,0.0,6177194.0
2,2008.0,2.0,9.0,6.0,1345.0,b'XE',2790.0,50.0,b'RDU',b'EWR',416.0,0.0,48.0,1489537.0
3,2008.0,11.0,30.0,7.0,1802.0,b'US',1511.0,-7.0,b'PHX',b'LAX',370.0,0.0,,6487323.0
4,2008.0,6.0,25.0,3.0,955.0,b'US',320.0,22.0,b'LAS',b'ORD',1515.0,0.0,0.0,762415.0
1401941,2008.0,3.0,27.0,4.0,1645.0,b'AA',743.0,-3.0,b'DFW',b'SNA',1205.0,0.0,,3893528.0
1401942,2008.0,8.0,2.0,6.0,2011.0,b'US',587.0,-7.0,b'PHX',b'GEG',1020.0,0.0,,1165052.0
1401943,2008.0,2.0,22.0,5.0,2045.0,b'WN',1559.0,84.0,b'ALB',b'BWI',288.0,0.0,78.0,4066956.0
1401944,2008.0,3.0,24.0,1.0,1255.0,b'XE',215.0,-3.0,b'COS',b'SMF',911.0,0.0,,5127319.0
1401945,2008.0,10.0,9.0,4.0,635.0,b'WN',1572.0,0.0,b'ORF',b'BWI',159.0,0.0,,2509035.0


### Split the data into train and test samples

In [3]:
# Ordered splitting
ds_train, ds_test = ds.ml.train_test_split(test_size=0.25)

# Random splitting
# ds.split_random(test_size=0.25)



### Check for NaNs

In [4]:
# Check for Nans
print('%-19s %-15s %-10s' % ('Feature', 'NaNs?', 'dtype'))
print()
for i,v in enumerate(ds.column_names):
    try:
        print('%-19s %-15s %-10s' % (v, np.isnan(ds_train.evaluate(v)).any(), ds_train[v].dtype))
    except:
        print('%-19s %-15s %-10s' % (v, 'Not a num type', ds_train[v].dtype))

Feature             NaNs?           dtype     

Year                False           float64   
Month               False           float64   
DayofMonth          False           float64   
DayOfWeek           False           float64   
CRSDepTime          False           float64   
UniqueCarrier       Not a num type  |S10      
FlightNum           False           float64   
DepDelay            True            float64   
Origin              Not a num type  |S3       
Dest                Not a num type  |S3       
Distance            False           float64   
Cancelled           False           float64   
LateAircraftDelay   True            float64   
random_index        False           float64   


What to do with the missing/NaN values?

LateAircraftDelay: NaN if it does not happen, so safe to fill with 0.

DepDelay: one idea is to fill with the mean delay.. but let's first check whether this is different for the cancelled and the departed flights:

### Deal with NaNs

In [5]:
# Fill the NaNs of the "LateAircraftDelay" column with 0s:
ds_train = ds_train.fillna(value=0, column_names=['LateAircraftDelay'])

### Basic stats for the DepDelay:
mean_departed  = ds_train.mean('DepDelay', selection='Cancelled==0')
std_departed   = ds_train.std('DepDelay', selection='Cancelled==0')
mean_cancelled = ds_train.mean('DepDelay', selection='Cancelled==1')
std_cancelled  = ds_train.std('DepDelay', selection='Cancelled==1')
mean_all       = ds_train.mean('DepDelay', selection=None)
std_all        = ds_train.std('DepDelay', selection=None)
print('Mean departure delay for departed flights is %2i +/- %2i minutes.' % (mean_departed, std_departed))
print('Mean departure delay for cancelled flights is %2i +/- %2i minutes.' % (mean_cancelled, std_cancelled))
print('Mean departure delay all flights is %2i +/- %2i minutes.' % (mean_all, std_all))
print()

# For starters, best thing seems to drop the flights with missing DepDelay entry.
ds_train = ds_train.dropna(column_names=['DepDelay'])

Mean departure delay for departed flights is  9 +/- 35 minutes.
Mean departure delay for cancelled flights is 42 +/- 88 minutes.
Mean departure delay all flights is  9 +/- 35 minutes.



### One-hot encoding / Label encoding

In [6]:
# ds_train.ml.one_hot_encoding(expression='UniqueCarrier')
# ds_train.ml.one_hot_encoding(expression='Origin')
# ds_train.ml.one_hot_encoding(expression='Dest')

# Scikit-learn like-API
label_encoder = vaex.ml.LabelEncoder(features=['UniqueCarrier','Origin','Dest'])
label_encoder.fit(ds_train)
ds_train = label_encoder.transform(ds_train)

# # Faster vaex-API
# label_encoder = ds_train.ml.label_encoder(features=['UniqueCarrier','Origin','Dest'])
# label_encoder.transform(ds_train)

### PCA transformation

In [11]:
### Figure out which columns to use for the PCA as input
allcols = np.array(ds_train.get_column_names(virtual=True, hidden=True, strings=True))
exclude = ['Cancelled','__original_DepDelay', '__original_LateAircraftDelay', 'DepDelay',
           'Origin','Dest', 'UniqueCarrier', 'random_index']
pca_features = allcols[np.in1d(allcols, exclude, invert=True)].tolist()

# The PCA: Scikit-learn like API
pca = vaex.ml.PCA(n_components=5, features=pca_features)
pca.fit(ds_train, progress=True)
ds_train = pca.transform(ds_train)

# # The PCA: faster vaex-API
# pca = ds_train.ml.pca(n_components=5, features=pca_features, progress=True)
# ds_train = pca.transform(ds_train)

[########################################]:  100.00% elapsed time  :        1s =  0.0m =  0.0h
[########################################]:  100.00% elapsed time  :        0s =  0.0m =  0.0h
 

### Scaling: MinMaxScaler / StandardScaler

In [13]:
# Features to scale
scale_features = ['PCA_0', 'PCA_1', 'PCA_2', 'PCA_3', 'PCA_4']

# Scaling the data (Standard Scaler): Scikit-learn like API
scaler = vaex.ml.StandardScaler(features=scale_features)
scaler.fit(ds_train)
ds_train = scaler.transform(ds_train)

# # Scaling the data (Standard Scaler): Faster vaex-API
# scaler = ds_train.ml.standard_scaler(features=scale_features)
# ds_train = scaler.transform(ds_train)

## Predictions with `LightGBM`

### Create the target variable

In [14]:
# Significant delay if it is longer than 15 minutes
ds_train['label'] = (ds.DepDelay > 15)*1

In [15]:
# The target label to predict
label = 'label'

# Features to train on
train_features = ['standard_scaled_PCA_0', 'standard_scaled_PCA_1', 
                  'standard_scaled_PCA_2', 'standard_scaled_PCA_3', 
                  'standard_scaled_PCA_4']

# Define the LightGBM parameters
params = {
    'boosting': 'gbdt',
    'max_depth': 15,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'application': 'binary',
    'metric': 'binary_logloss',
    'min_data_in_leaf': 20,
    'subsample': 0.80,
    'colsample_bytree': 0.80,
    'reg_lambda': 1.5,
    'reg_alpha': 0.5,
}

### Training the LightGBM model directly from vaex
booster = ds_train.ml.lightgbm_model(label=label, param=params, num_round=1000, features=train_features)

### Evaluate on the training set

In [16]:
# Classical way - outputs a numpy array
# train_pred = booster.predict(ds_train)

# Create a virtual column with the predicted probabilities of the classes
ds_train_pred = booster.transform(ds_train)

# Still, we can easily extract the numpy array for the vaex dataset:
train_pred = ds_train_pred.lightgbm_prediction.values

# See the performance of the model
print('Performance of the classifier on the training set:')
print('Accuracy:', accuracy_score(ds_train.evaluate('label'), np.round(train_pred).astype(np.int8)))
print('ROC-AUC:', roc_auc_score(ds_train.evaluate('label'), train_pred))

Performance of the classifier on the training set:
Accuracy: 0.816690740441
ROC-AUC: 0.717267970462


## The `vaex` _state_: all the pipeline you need

In [17]:
ds_train_pred.state_write('/Users/jovan/Desktop/vaex-demo/deployable_model.json')