```sh
conda create -n rapids-0.19 -c rapidsai -c nvidia -c conda-forge \
    cudf=0.19 python=3.8 cudatoolkit=11.2
```

In [1]:
import pandas as pd
import numpy as np
import math
import json

In [2]:
cpu_types = {
    'TRIP_ID': np.uint64,
    'CALL_TYPE':     str,
    'ORIGIN_CALL':   np.uint32,
    'ORIGIN_STAND':  np.uint32,
    'TAXI_ID':       np.uint64,
    'TIMESTAMP':     np.uint64,
    'DAY_TYPE':      str,
    'MISSING_DATA':  bool,
    'POLYLINE':      str,
}

panda_types = {
    'TRIP_ID': np.uint64,
    'CALL_TYPE':     str,
    'ORIGIN_CALL':   str,
    'ORIGIN_STAND':  str,
    'TAXI_ID':       np.uint64,
    'TIMESTAMP':     np.uint64,
    'DAY_TYPE':      str,
    'MISSING_DATA':  bool,
    'POLYLINE':      str,
}

In [3]:
train_df = pd.read_csv("../data/train.csv", dtype=panda_types)

# train_df = np.genfromtxt("../data/train.csv", delimiter=',')

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1710670 entries, 0 to 1710669
Data columns (total 9 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   TRIP_ID       uint64
 1   CALL_TYPE     object
 2   ORIGIN_CALL   object
 3   ORIGIN_STAND  object
 4   TAXI_ID       uint64
 5   TIMESTAMP     uint64
 6   DAY_TYPE      object
 7   MISSING_DATA  bool  
 8   POLYLINE      object
dtypes: bool(1), object(5), uint64(3)
memory usage: 106.0+ MB


In [5]:
train_df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [6]:
train_df['MISSING_DATA'].value_counts()

False    1710660
True          10
Name: MISSING_DATA, dtype: int64

## Total Rides By Each Taxi

In [7]:
taxi_counts = train_df['TAXI_ID'].value_counts()
taxi_counts

20000080    10746
20000403     9238
20000066     8449
20000364     7821
20000483     7729
            ...  
20000911        2
20000264        2
20000931        1
20000970        1
20000940        1
Name: TAXI_ID, Length: 448, dtype: int64

## Some Stats

In [8]:
print(taxi_counts.mean())
print("Taxis with more than 1000 trips:", (taxi_counts > 1000).sum())

max_trips = taxi_counts.max()
print("Max Trips By A Single Taxi (ID):", max_trips)

3818.4598214285716
Taxis with more than 1000 trips: 429
Max Trips By A Single Taxi (ID): 10746


## Total Rides By Each Taxi; Then Bucketed By Bins of Size 1000

In [9]:
ubound = int(math.ceil(max_trips / 1000))
taxi_counts.groupby(pd.cut(taxi_counts, bins=list(np.arange(ubound) * 1000))).size()

TAXI_ID
(0, 1000]         19
(1000, 2000]      31
(2000, 3000]      95
(3000, 4000]     108
(4000, 5000]      81
(5000, 6000]      77
(6000, 7000]      26
(7000, 8000]       8
(8000, 9000]       1
(9000, 10000]      1
Name: TAXI_ID, dtype: int64

In [10]:
head = train_df.head()

In [11]:
head['POLYLINE'].map(lambda s: np.array(json.loads(s)).shape[0])

0    23
1    19
2    65
3    43
4    29
Name: POLYLINE, dtype: int64

In [12]:
train_df['POLYLINE_shape_0'] = train_df['POLYLINE'].apply(lambda s: np.array(json.loads(s)).shape[0])

In [13]:
shapes = train_df['POLYLINE_shape_0'].to_numpy()

## Trip Polyline Length

In [14]:
for pow in range(4, 16):
    thres = 2 ** pow
    print(thres, (shapes >= thres ).sum())


16 1587085
32 1169847
64 351293
128 46124
256 8926
512 1957
1024 336
2048 43
4096 0
8192 0
16384 0
32768 0


In [15]:
# Prepare Dataset Experiments

from numpy.lib.stride_tricks import sliding_window_view

test = head['POLYLINE'].apply(lambda s: sliding_window_view(np.array(json.loads(s)), window_shape=4, axis=0))


In [16]:
exploded = test.explode()
print(test.iloc[0])
print(test.size)
print(exploded.iloc[0])
print(exploded.iloc[1])
print(exploded.size)

[[[-8.618643 -8.618499 -8.620326 -8.622153]
  [41.141412 41.141376 41.14251  41.143815]]

 [[-8.618499 -8.620326 -8.622153 -8.623953]
  [41.141376 41.14251  41.143815 41.144373]]

 [[-8.620326 -8.622153 -8.623953 -8.62668 ]
  [41.14251  41.143815 41.144373 41.144778]]

 [[-8.622153 -8.623953 -8.62668  -8.627373]
  [41.143815 41.144373 41.144778 41.144697]]

 [[-8.623953 -8.62668  -8.627373 -8.630226]
  [41.144373 41.144778 41.144697 41.14521 ]]

 [[-8.62668  -8.627373 -8.630226 -8.632746]
  [41.144778 41.144697 41.14521  41.14692 ]]

 [[-8.627373 -8.630226 -8.632746 -8.631738]
  [41.144697 41.14521  41.14692  41.148225]]

 [[-8.630226 -8.632746 -8.631738 -8.629938]
  [41.14521  41.14692  41.148225 41.150385]]

 [[-8.632746 -8.631738 -8.629938 -8.62911 ]
  [41.14692  41.148225 41.150385 41.151213]]

 [[-8.631738 -8.629938 -8.62911  -8.629128]
  [41.148225 41.150385 41.151213 41.15124 ]]

 [[-8.629938 -8.62911  -8.629128 -8.628786]
  [41.150385 41.151213 41.15124  41.152203]]

 [[-8.6291

In [17]:
test_list = head['POLYLINE'].apply(lambda s: json.loads(s))

In [18]:
exploded = test_list.explode()
print(test_list.iloc[0])
print(test_list.size)
print(exploded.iloc[0])
print(exploded.iloc[1])
print(exploded.size)

[[-8.618643, 41.141412], [-8.618499, 41.141376], [-8.620326, 41.14251], [-8.622153, 41.143815], [-8.623953, 41.144373], [-8.62668, 41.144778], [-8.627373, 41.144697], [-8.630226, 41.14521], [-8.632746, 41.14692], [-8.631738, 41.148225], [-8.629938, 41.150385], [-8.62911, 41.151213], [-8.629128, 41.15124], [-8.628786, 41.152203], [-8.628687, 41.152374], [-8.628759, 41.152518], [-8.630838, 41.15268], [-8.632323, 41.153022], [-8.631144, 41.154489], [-8.630829, 41.154507], [-8.630829, 41.154516], [-8.630829, 41.154498], [-8.630838, 41.154489]]
5
[-8.618643, 41.141412]
[-8.618499, 41.141376]
179
