# Out of core dataframes for Python
## Data science meetup

In [None]:
import vaex
import numpy as np
import matplotlib.pylab as plt
np.warnings.filterwarnings('ignore')
%matplotlib inline

# Step 0: reading in data
vaex reads 'anything':
 * `ds = vaex.open('super_fast.hdf5')`
 * `ds = vaex.from_pandas(df)`
 * `ds = vaex.from_ascii('takes_hours.asc')`
 * `ds = vaex.from_csv('this_may_be_slow.csv')`
 * `ds = vaex.from_arrays(x=x, y=y)`

In [None]:
%%time
# Open the taxi dataset (the cell magic above times the whole cell), then
# shell out to `ls -lh` to show the file's size on disk.
ds = vaex.open("/Users/maartenbreddels/datasets/nytaxi/nyc_taxi2015.hdf5")
!ls -lh /Users/maartenbreddels/datasets/nytaxi/nyc_taxi2015.hdf5

In [None]:
ds

## Pandas-like, but uses expressions

In [None]:
ds.trip_distance.values

In [None]:
# Size of a single column: number of entries in the dataset times the
# per-element size of the column's dtype, shown with thousands separators.
bytes_per_column = len(ds) * ds.trip_distance.values.dtype.itemsize
format(bytes_per_column, ",")

In [None]:
ds.trip_distance

In [None]:
np.log10(ds.trip_distance)

# Virtual columns
![Meme about expressions](./meme-expressions.jpg)

In [None]:
ds.mean(ds.tip_amount)

In [None]:
ds.tip_amount/ds.total_amount

In [None]:
ds.mean(ds.tip_amount/ds.total_amount)

In [None]:
ds['tip_percentage'] = ds.tip_amount/ds.total_amount

In [None]:
ds.info()

In [None]:
ds.mean(ds.tip_percentage)

# Lazy is good
See item 10/11 of ["10 Things I Hate About pandas"](http://wesmckinney.com/blog/apache-arrow-pandas-internals/) by Wes McKinney
> When you write df[df.c < 0].d.sum(), pandas creates a temporary DataFrame df[df.c < 0] then sums the d column of that temporary object. If df contains a lot of columns, this is ridiculously wasteful.

In [None]:
# this would cost 46+GB RAM using Pandas
# and wouldn't be possible on this laptop
print(ds[ds.tip_amount < 10].tip_amount.mean())

# (Binned) Statistics
Strong focus on statistics (on regular grids)
## 0 dimensional

In [None]:
ds.count()

In [None]:
ds.count(ds.pickup_latitude)

In [None]:
ds.mean(ds.pickup_latitude)

## 1 dimensional

In [None]:
ds.count(binby=ds.pickup_latitude, limits=[40.5, 41])

In [None]:
# `_` is IPython's "last output" variable, so this plots whatever the most
# recently executed cell returned (intended: the 1-d binned counts from the
# cell directly above). Run that cell immediately before this one.
plt.plot(_)

## 2 dimensional

In [None]:
counts2d = ds.count(binby=[ds.pickup_longitude, ds.pickup_latitude], shape=128)
print(counts2d.shape)

In [None]:
plt.imshow(np.log10(counts2d+1).T, origin='lower')

In [None]:
limits = ds.limits([ds.pickup_longitude, ds.pickup_latitude], "98%")
limits

In [None]:
%%time
ds.plot(ds.pickup_longitude, ds.pickup_latitude, f="log1p",
        limits=limits, figsize=(10,8), shape=512, colormap="viridis")

In [None]:
ds.plot(ds.pickup_longitude, ds.pickup_latitude,
        z='trip_distance:0,30,3',
        what=[vaex.stat.count(), vaex.stat.mean(ds.tip_amount)],
        visual=dict(column="z", row='what'),
        f="log1p",
        limits=limits, figsize=(16,10), shape=512, colormap="viridis")

In [None]:
# %%timeit
# counts2d = ds.count(binby=["pickup_longitude", "pickup_latitude"], shape=128, limits=limits)#, limits=[[-90, 90], [-180, 180]])

## Where to pick up customers?

In [None]:
ds.plot(ds.pickup_longitude, ds.pickup_latitude, what=vaex.stat.mean(ds.total_amount),
        vmin=0, vmax=50, shape=512, figsize=(10,8), limits=limits, colormap="Greys")

In [None]:
ds.trip_distance.minmax()

In [None]:
ds.plot1d(ds.trip_distance, limits=[0, 50])

In [None]:
# no memory copy! and not wasting 46 GB of memory
ds_clean = ds[(ds.trip_distance > 0) & (ds.trip_distance < 40)]

In [None]:
ds_clean.total_amount/ds_clean.trip_distance

In [None]:
ds_clean['ratio'] = ds_clean.total_amount / ds_clean.trip_distance
ds_clean.ratio

In [None]:
ds_clean.plot(ds_clean.pickup_longitude, ds_clean.pickup_latitude,
         what=vaex.stat.mean(ds_clean.ratio),
         vmin=0, vmax=15,
         shape=512, figsize=(10,8), limits=limits, colormap="Greys")

# Interactive

In [None]:
import vaex
# Use pyplot rather than the top-level `pylab` module, which is a discouraged
# legacy interface; only pyplot functions are used via `plt` in this notebook.
import matplotlib.pyplot as plt
ds = vaex.open("/Users/maartenbreddels/datasets/nytaxi/nyc_taxi2015.hdf5")
# Longitude/latitude ranges used as limits by the map plots below.
limits = ds.limits([ds.pickup_longitude, ds.pickup_latitude], "98%")

In [None]:
ds.plot_widget(ds.pickup_longitude, ds.pickup_latitude, selection=[None, 'JFK', 'LaG'],
         controls_selection=True, f='log1p',
         shape=512, figsize=(10,8), limits=limits, colormap="afmhot")#, backend='ipyleaflet')

In [None]:
ds.selection_histories

In [None]:
ds.col.pickup_hour

In [None]:
# Use a dedicated name for the histogram range: `limits` already holds the
# longitude/latitude limits used by the map plots, and overwriting it would
# break those cells if they are re-run afterwards.
tip_limits = [1, 20]
plt.figure(figsize=(10,7))
# Same histogram range for both selections so the two curves are comparable.
ds.plot1d(ds.tip_amount, selection='LaG', n=True, limits=tip_limits)
ds.plot1d(ds.tip_amount, selection='JFK', n=True, limits=tip_limits)
plt.legend()

# Heavy calculations

In [None]:
def arc_distance(theta_1, phi_1, theta_2, phi_2, radius=6400):
    """Return the great-circle distance between two points on a sphere.

    Implements the haversine formula. Only arithmetic operators and NumPy
    ufuncs are applied to the arguments, so scalars and NumPy arrays are
    handled alike.

    Parameters
    ----------
    theta_1, theta_2
        Latitude of the first and second point, in degrees.
    phi_1, phi_2
        Longitude of the first and second point, in degrees.
    radius
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6400 (a rounded Earth radius in kilometres).

    Returns
    -------
    The distance along the sphere's surface, in the unit of ``radius``.
    """
    # Haversine term: sin^2(dlat / 2) + cos(lat1) * cos(lat2) * sin^2(dlon / 2),
    # with the degree -> radian conversion done inline.
    haversine = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
                 + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    # arctan2 formulation of 2 * arcsin(sqrt(haversine)): the central angle
    # between the two points, in radians.
    central_angle = 2 * np.arctan2(np.sqrt(haversine), np.sqrt(1-haversine))
    return central_angle * radius

In [None]:
# distance Groningen - Utrecht
arc_distance(53.1739086, 6.5990374, 52.0842715, 5.0124523)

In [None]:
# arc_distance takes (latitude, longitude) pairs: its first and third
# arguments are the ones that appear in the cos() terms of the haversine
# formula. Passing longitude first would give wrong distances.
arc_distance_expression = arc_distance(ds.pickup_latitude, ds.pickup_longitude, ds.dropoff_latitude, ds.dropoff_longitude)

In [None]:
arc_distance_expression

In [None]:
# arc_distance takes (latitude, longitude) pairs: its first and third
# arguments are the ones that appear in the cos() terms of the haversine
# formula. Passing longitude first would give wrong distances.
ds['arc_distance'] = arc_distance(ds.pickup_latitude, ds.pickup_longitude, ds.dropoff_latitude, ds.dropoff_longitude)

In [None]:
ds

In [None]:
%%time
ds.sum(ds.total_amount)

In [None]:
%%time
ds.sum(ds.arc_distance)

In [None]:
ds.arc_distance

In [None]:
ds['arc_distance_jit'] = ds.arc_distance.jit_numba()

In [None]:
%%time
ds.sum(ds.arc_distance_jit)

In [None]:
# if you've got RAM to waste (I don't on this machine)
# TODO: optimize materialize
# ds.materialize(ds.arc_distance, inplace=True)

In [None]:
# %%timeit -n1 -r3
# ds.sum(ds.arc_distance)

# Remote datasets

In [None]:
import vaex
import numpy as np

In [None]:
ds_remote = vaex.open('ws://gaia:9005/nyc_taxi2015')

In [None]:
ds_remote

In [None]:
ds_remote.total_amount / 100

In [None]:
def arc_distance(theta_1, phi_1, theta_2, phi_2, radius=6400):
    """Return the great-circle distance between two points on a sphere.

    Implements the haversine formula. Only arithmetic operators and NumPy
    ufuncs are applied to the arguments, so scalars and NumPy arrays are
    handled alike.

    Parameters
    ----------
    theta_1, theta_2
        Latitude of the first and second point, in degrees.
    phi_1, phi_2
        Longitude of the first and second point, in degrees.
    radius
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6400 (a rounded Earth radius in kilometres).

    Returns
    -------
    The distance along the sphere's surface, in the unit of ``radius``.
    """
    # Haversine term: sin^2(dlat / 2) + cos(lat1) * cos(lat2) * sin^2(dlon / 2),
    # with the degree -> radian conversion done inline.
    haversine = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
                 + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    # arctan2 formulation of 2 * arcsin(sqrt(haversine)): the central angle
    # between the two points, in radians.
    central_angle = 2 * np.arctan2(np.sqrt(haversine), np.sqrt(1-haversine))
    return central_angle * radius

In [None]:
# arc_distance takes (latitude, longitude) pairs: its first and third
# arguments are the ones that appear in the cos() terms of the haversine
# formula. Passing longitude first would give wrong distances.
ds_remote['arc_distance'] = arc_distance(ds_remote.pickup_latitude,
                                         ds_remote.pickup_longitude,
                                         ds_remote.dropoff_latitude,
                                         ds_remote.dropoff_longitude)

In [None]:
ds_remote.arc_distance

In [None]:
ds_remote['arc_distance_jit'] = ds_remote.arc_distance.jit_numba()

In [None]:
ds_remote.arc_distance_jit

In [None]:
%%time
ds_remote.mean('arc_distance', progress=True)

In [None]:
%%time
ds_remote.mean('arc_distance_jit', progress=True)

In [None]:
# Build the plot from ds_remote's own columns rather than from the local `ds`,
# which is a different dataset object and is not set up in this section.
ds_remote.plot_widget(ds_remote.pickup_longitude, ds_remote.pickup_latitude, backend='ipyleaflet', f='log', shape=400)