# Data science meet-up: _PyParis_ 14-November-2018

# `Vaex` demo on Machine Learning

# Goal: predict if a flight is likely to be delayed

In [1]:
import vaex
import vaex.ml
import vaex.ml.lightgbm

import numpy as np
import pylab as plt
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve

import warnings
# Ignore every warning from here on (presumably to keep the demo output short).
# NOTE(review): this also hides deprecation warnings from numpy / vaex / sklearn.
warnings.simplefilter('ignore')

### Read in the data

- If `Pandas` can read it, `vaex` can read it too!
- The most efficient option is to use **Apache Arrow** (or **hdf5**)
- Easy to convert to **Apache Arrow** and **hdf5** with `vaex` even if the data is in multiple text (csv) files

In [2]:
# One-off conversion: read all CSV files in the relevant directory and convert them to HDF5 or Arrow
# df = vaex.open(path='./airline-data/20*.csv', convert='airline-data.hdf5')
# df = vaex.open(path='./airline-data/20*.csv', convert='airline-data.arrow')

# Alternative: if the converted file exists, just open the hdf5 version
# df = vaex.open(path='../airline-data/demo-2008.hdf5')

# Open the already-converted arrow version (the file must exist)
demo_file = '../airline-data/demo-2008.arrow'
df = vaex.open(path=demo_file)

### What is in the dataset

In [3]:
# Show the column overview (name, type, unit, description) and a preview of the first/last rows
df.info()

column,type,unit,description,expression
Year,float64,,,
Month,float64,,,
DayofMonth,float64,,,
DayOfWeek,float64,,,
CRSDepTime,float64,,,
UniqueCarrier,bytes80,,,
FlightNum,float64,,,
DepDelay,float64,,,
Origin,bytes24,,,
Dest,bytes24,,,

#,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,UniqueCarrier,FlightNum,DepDelay,Origin,Dest,Distance,Cancelled,LateAircraftDelay,random_index
0,2008.0,7.0,21.0,1.0,1325.0,b'AS',852.0,7.0,b'HNL',b'SEA',2677.0,0.0,,3837252.0
1,2008.0,12.0,6.0,6.0,1733.0,b'9E',2918.0,-5.0,b'DTW',b'ITH',353.0,0.0,0.0,6177194.0
2,2008.0,2.0,9.0,6.0,1345.0,b'XE',2790.0,50.0,b'RDU',b'EWR',416.0,0.0,48.0,1489537.0
3,2008.0,11.0,30.0,7.0,1802.0,b'US',1511.0,-7.0,b'PHX',b'LAX',370.0,0.0,,6487323.0
4,2008.0,6.0,25.0,3.0,955.0,b'US',320.0,22.0,b'LAS',b'ORD',1515.0,0.0,0.0,762415.0
1401941,2008.0,3.0,27.0,4.0,1645.0,b'AA',743.0,-3.0,b'DFW',b'SNA',1205.0,0.0,,3893528.0
1401942,2008.0,8.0,2.0,6.0,2011.0,b'US',587.0,-7.0,b'PHX',b'GEG',1020.0,0.0,,1165052.0
1401943,2008.0,2.0,22.0,5.0,2045.0,b'WN',1559.0,84.0,b'ALB',b'BWI',288.0,0.0,78.0,4066956.0
1401944,2008.0,3.0,24.0,1.0,1255.0,b'XE',215.0,-3.0,b'COS',b'SMF',911.0,0.0,,5127319.0
1401945,2008.0,10.0,9.0,4.0,635.0,b'WN',1572.0,0.0,b'ORF',b'BWI',159.0,0.0,,2509035.0


### Split the data into train and test samples

In [4]:
# Ordered splitting: a quarter of `df` becomes the test sample, the rest is the training sample
df_train, df_test = df.ml.train_test_split(test_size=0.25)

# Random splitting
# df.split_random(test_size=0.25)

# Export the test sample for later - the file format is figured out from the extension
for test_path in ('../airline-data/df_test.arrow', '../airline-data/df_test.hdf5'):
    df_test.export(path=test_path)

### View the data types and check for missing values

In [5]:
# Per-column summary of the training sample (dtype, count, missing, mean, std, min, max);
# as the last expression of the cell it is rendered as the cell output.
df_train.describe()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,UniqueCarrier,FlightNum,DepDelay,Origin,Dest,Distance,Cancelled,LateAircraftDelay,random_index
dtype,float64,float64,float64,float64,float64,|S10,float64,float64,|S3,|S3,float64,float64,float64,float64
count,1051460,1051460,1051460,1051460,1051460,--,1051460,1030859,--,--,1051460,1051460,228614,1051460
missing,0,0,0,0,0,--,0,20601,--,--,0,0,822846,0
mean,2008,6.37757,15.7405,3.92348,1326.07,--,2223.72,9.95374,--,--,726.714,0.0197506,20.5796,3.50363e+06
std,0,3.40464,8.79158,1.99,463.828,--,1963.05,35.5277,--,--,562.687,0.139142,39.001,2.02306e+06
min,2008,1,1,1,1,--,1,-67,--,--,24,0,0,2
max,2008,12,31,7,2359,--,9743,2467,--,--,4962,1,1254,7.00972e+06


What to do with the missing/NaN values?

LateAircraftDelay: NaN if it does not happen, so safe to fill with 0.

DepDelay: one idea is to fill the missing values with the mean delay, but let's first check whether the delay differs between the cancelled and the departed flights:

In [6]:
# Summarise DepDelay separately for the cancelled and the departed flights
cancelled_stats = df_train[df_train.Cancelled == 1].describe()['DepDelay']
departed_stats = df_train[df_train.Cancelled == 0].describe()['DepDelay']

print('Departure delay for CANCELLED flights:')
print(cancelled_stats)
print()
print('Departure delay for DEPARTED flights:')
print(departed_stats)

Departure delay for CANCELLED flights:
dtype      float64
count          166
missing      20601
mean       42.9398
std          88.74
min            -20
max            676
Name: DepDelay, dtype: object

Departure delay for DEPARTED flights:
dtype      float64
count      1030693
missing          0
mean       9.94843
std        35.5102
min            -67
max           2467
Name: DepDelay, dtype: object


In [7]:
# For starters, the simplest option is to drop the flights with a missing DepDelay entry.
# Only DepDelay is checked, so rows with a missing LateAircraftDelay are kept.
df_train = df_train.dropna(column_names=['DepDelay'])

### One-hot encoding / Label encoding

In [8]:
# Categorical columns to be label encoded
categorical_features = ['UniqueCarrier', 'Origin', 'Dest']

# Scikit-learn like-API: fit on the training sample, then add the encoded columns to it
label_encoder = vaex.ml.LabelEncoder(features=categorical_features)
label_encoder.fit(df_train)
df_train = label_encoder.transform(df_train)

# vaex-API (shorter)
# label_encoder = df_train.ml.label_encoder(features=['UniqueCarrier','Origin','Dest'])
# df_train = label_encoder.transform(df_train)

### PCA transformation

In [9]:
### Figure out which columns to use for the PCA as input
# Start from every column of the training sample (virtual ones included) and
# leave out the columns listed in `exclude`.
allcols = np.array(df_train.get_column_names(virtual=True, strings=True))
exclude = ['Cancelled', 'DepDelay','Origin','Dest', 'UniqueCarrier', 'random_index', 'LateAircraftDelay']
# np.isin is the supported replacement for np.in1d (deprecated in recent NumPy releases)
pca_features = allcols[np.isin(allcols, exclude, invert=True)].tolist()

# The PCA: Scikit-learn like API
pca = vaex.ml.PCA(n_components=5, features=pca_features)
pca.fit(df_train, progress=True)
df_train = pca.transform(df_train)

# # The PCA: faster vaex-API
# pca = df_train.ml.pca(n_components=5, features=pca_features, progress=True)
# df_train = pca.transform(df_train)

[########################################]:  100.00% elapsed time  :        0s =  0.0m =  0.0h
[########################################]:  100.00% elapsed time  :        0s =  0.0m =  0.0h
 

### Scaling: MinMaxScaler / StandardScaler

In [10]:
# Features to scale: the five PCA components
scale_features = [
    'PCA_0',
    'PCA_1',
    'PCA_2',
    'PCA_3',
    'PCA_4',
]

# Scaling the data (Standard Scaler): Scikit-learn like API.
# Fit on the training sample, then add the scaled columns to it.
scaler = vaex.ml.StandardScaler(features=scale_features)
scaler.fit(df_train)
df_train = scaler.transform(df_train)

# # Scaling the data (Standard Scaler): vaex-API (Shorter)
# scaler = df_train.ml.standard_scaler(features=scale_features)
# df_train = scaler.transform(df_train)

In [11]:
# Preview the first 10 rows, including the label-encoded, PCA and scaled columns added above
df_train.head(10)

#,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,UniqueCarrier,FlightNum,DepDelay,Origin,Dest,Distance,Cancelled,LateAircraftDelay,random_index,label_encoded_UniqueCarrier,label_encoded_Origin,label_encoded_Dest,PCA_0,PCA_1,PCA_2,PCA_3,PCA_4,standard_scaled_PCA_0,standard_scaled_PCA_1,standard_scaled_PCA_2,standard_scaled_PCA_3,standard_scaled_PCA_4
0,2008.0,11.0,26.0,3.0,700.0,b'F9',769.0,-9.0,b'DEN',b'LAS',629.0,0.0,,85996.0,8,80,156,1422.2623594867482,210.5541921183016,-645.135242667279,35.437974809160856,-62.90226390302372,0.7225134901882433,0.4000461101707819,-1.3918497089354378,0.4368157962340757,-0.7903074867926151
1,2008.0,2.0,28.0,4.0,1155.0,b'WN',1594.0,8.0,b'PHX',b'SAN',304.0,0.0,,4180892.0,17,220,248,568.5134813544995,474.6332765407769,-206.04052359437105,-127.96217740255057,-5.867351728151881,0.2888065319964867,0.901787773149954,-0.4445229834415241,-1.5772882257225338,-0.073717728276885
2,2008.0,2.0,25.0,1.0,1715.0,b'DL',1810.0,-9.0,b'LAX',b'SLC',590.0,0.0,,3909313.0,6,157,266,386.0532553341304,208.38632877719544,375.4701034717029,-99.01293004608692,-62.33876020134211,0.1961161968813213,0.3959272404001716,0.810059534292986,-1.220453824138108,-0.7832275954398824
3,2008.0,10.0,19.0,7.0,1600.0,b'CO',1676.0,1.0,b'IAH',b'PHL',1324.0,0.0,,412797.0,5,136,220,597.9956771641521,-512.6304567447355,311.260200121125,-39.98495842474462,-52.20150835248582,0.303783574769757,-0.9739811784919448,0.6715296116061142,-0.4928628553338266,-0.6558626083226493
4,2008.0,2.0,14.0,4.0,610.0,b'OO',4087.0,21.0,b'MSP',b'SLC',991.0,0.0,0.0,25735.0,14,199,266,-1837.380497421412,-513.0826591974245,-674.8343655813935,-119.98099759241988,-38.60425063610675,-0.9333947334968644,-0.9748403484300396,-1.4559242050251933,-1.4789105550902848,-0.4850259180936594
5,2008.0,2.0,4.0,1.0,1303.0,b'EV',4453.0,-7.0,b'CAE',b'ATL',191.0,0.0,,2731238.0,7,50,18,-2286.029131265478,294.2558596541588,-37.15385680880807,156.3135516579839,4.689002312348421,-1.161309568017157,0.5590765534768649,-0.0801577426949507,1.92675311998946,0.0589128817167852
6,2008.0,1.0,6.0,7.0,1440.0,b'WN',2032.0,26.0,b'LAS',b'BNA',1588.0,0.0,,3445562.0,17,155,35,271.91023851084714,-822.4902858551108,172.91294282261285,96.82883238429784,75.7381900500801,0.1381312063375466,-1.5627047659289588,0.3730517467063041,1.1935321852937586,0.9515787655112196
7,2008.0,12.0,10.0,3.0,859.0,b'B6',717.0,12.0,b'JFK',b'SJU',1597.0,0.0,,5415374.0,4,150,265,1578.902463088658,-734.0140644677964,-418.2290040369873,-80.33060994625228,-75.4594548362393,0.802087126656908,-1.394602825746655,-0.902309901921536,-0.990171688269566,-0.9480767210272444
8,2008.0,7.0,22.0,2.0,2020.0,b'AA',1952.0,39.0,b'SJU',b'FLL',1046.0,0.0,0.0,5562161.0,1,264,105,294.7874934371267,-236.7073509176029,711.9523668137422,-32.67672973996927,123.25936409443231,0.1497529195836718,-0.4497362604407932,1.5360045909577822,-0.4027801192522032,1.5486373973965042
9,2008.0,5.0,31.0,6.0,940.0,b'US',1433.0,0.0,b'CLT',b'LAX',2125.0,0.0,,3974301.0,16,63,158,923.9832067002452,-1326.72311129501,-292.71761488794465,64.50374080539413,-77.42215125647633,0.4693862050804975,-2.5207307183370613,-0.6315248340760433,0.7950864306360139,-0.9727361462842712


## Predictions with `LightGBM`

### Create the target variable

In [12]:
# Significant delay if it is longer than 15 minutes.
# The expression is built from df_train itself (not the unsplit df), so the label
# is tied to the same rows it is attached to; `* 1` turns the boolean into 0/1.
df_train['label'] = (df_train.DepDelay > 15) * 1

In [13]:
# The target label to predict
label = 'label'

# Features to train on: the standard-scaled PCA components
train_features = ['standard_scaled_PCA_0', 'standard_scaled_PCA_1',
                  'standard_scaled_PCA_2', 'standard_scaled_PCA_3',
                  'standard_scaled_PCA_4']

# Define the LightGBM parameters
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction) only
# takes effect when `bagging_freq` is non-zero - confirm whether bagging is intended here.
params = {
    'boosting': 'gbdt',
    'max_depth': 15,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'application': 'binary',
    'metric': 'binary_logloss',
    'min_data_in_leaf': 20,
    'subsample': 0.80,
    'colsample_bytree': 0.80,
    'reg_lambda': 1.5,
    'reg_alpha': 0.5,
}

# Training the LightGBM model: Scikit-learn like API
booster = vaex.ml.lightgbm.LightGBMModel(features=train_features, num_round=1000, param=params)
booster.fit(df_train, label=df_train[label])

# # Training the LightGBM model directly from vaex
# booster = df_train.ml.lightgbm_model(label=label, param=params, num_round=1000, features=train_features)

### Evaluate on the training set

In [14]:
# Classical way - outputs a numpy array
# train_pred = booster.predict(df_train)

# Add the predicted class probabilities as a virtual column
df_train_pred = booster.transform(df_train)

# The numpy array can still be extracted easily from the vaex dataset
train_pred = df_train_pred.lightgbm_prediction.values

# Ground truth, and the predictions rounded to hard 0/1 classes
y_true = df_train.evaluate('label')
y_hat = np.round(train_pred).astype(np.int8)

# See the performance of the model
print('Performance of the classifier on the training set:')
print('Accuracy:', accuracy_score(y_true, y_hat))
print('ROC-AUC:', roc_auc_score(y_true, train_pred))

Performance of the classifier on the training set:
Accuracy: 0.8152317630248171
ROC-AUC: 0.7130886334911247


## The `vaex` _state_: all the pipeline you need

In [15]:
# Write the state of the DataFrame (the pipeline built above) to a JSON file.
# NOTE(review): absolute, machine-specific path - consider a location relative to the notebook.
state_file = '/Users/jovan/Desktop/vaex-demo/deployable_model.json'
df_train_pred.state_write(state_file)