```sh
conda create -n rapids-0.19 -c rapidsai -c nvidia -c conda-forge \
    cudf=0.19 python=3.8 cudatoolkit=11.2
```

In [1]:
import cudf
import pandas as pd
import numpy as np
import math
import json

In [2]:
# Explicit per-column dtypes handed to the CSV reader so that no column
# type is left to inference. Identifier and timestamp columns are unsigned
# integers; the categorical flags and the raw POLYLINE text stay as strings.
cpu_types = dict(
    TRIP_ID=np.uint64,
    CALL_TYPE=str,
    ORIGIN_CALL=np.uint32,
    ORIGIN_STAND=np.uint32,
    TAXI_ID=np.uint64,
    TIMESTAMP=np.uint64,
    DAY_TYPE=str,
    MISSING_DATA=bool,
    POLYLINE=str,
)

In [3]:
# Load the training CSV through cudf, forcing the column dtypes declared in
# cpu_types instead of letting the reader infer them.
train_df = cudf.read_csv(
    "../data/train.csv",
    dtype=cpu_types,
)

In [4]:
# Print the column dtypes and memory footprint to confirm the requested
# dtypes were applied on load.
train_df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 1710670 entries, 0 to 1710669
Data columns (total 9 columns):
 #   Column        Dtype
---  ------        -----
 0   TRIP_ID       uint64
 1   CALL_TYPE     object
 2   ORIGIN_CALL   uint32
 3   ORIGIN_STAND  uint32
 4   TAXI_ID       uint64
 5   TIMESTAMP     uint64
 6   DAY_TYPE      object
 7   MISSING_DATA  bool
 8   POLYLINE      object
dtypes: bool(1), object(3), uint32(2), uint64(3)
memory usage: 1.8+ GB


In [5]:
# Preview the first rows of the loaded frame.
train_df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [6]:
# Tally the True/False values of the MISSING_DATA column.
train_df['MISSING_DATA'].value_counts()

False    1710660
True          10
Name: MISSING_DATA, dtype: int32

## Total Rides By Each Taxi

In [7]:
# Number of rows (one row per trip) for each distinct TAXI_ID.
# The bare expression on the last line displays the resulting Series.
taxi_counts = train_df['TAXI_ID'].value_counts()
taxi_counts

20000080    10746
20000403     9238
20000066     8449
20000364     7821
20000483     7729
            ...  
20000911        2
20000264        2
20000970        1
20000931        1
20000940        1
Name: TAXI_ID, Length: 448, dtype: int32

## Summary Statistics: Trips per Taxi

In [8]:
# Summary statistics over the per-taxi trip counts held in taxi_counts.
# Every printed figure carries a label so the output can be read on its own.
print("Mean trips per taxi:", taxi_counts.mean())
print("Taxis with more than 1000 trips:", (taxi_counts > 1000).sum())

# max_trips is the largest trip count of any single taxi. It is a number of
# trips, not a taxi ID, so the label does not mention an ID.
max_trips = taxi_counts.max()
print("Max Trips By A Single Taxi:", max_trips)

3818.4598214285716
Taxis with more than 1000 trips: 429
Max Trips By A Single Taxi (ID): 10746


## Total Rides per Taxi, Bucketed into Bins of 1,000 Rides

In [9]:
# Disabled cell: intended to bucket the per-taxi trip counts into bins of
# width 1000 and report how many taxis fall into each bin.
# NOTE(review): presumably commented out because pd.cut does not accept a
# cudf Series directly -- TODO confirm; converting taxi_counts to pandas
# first may be required.
# NOTE(review): np.arange(ubound) * 1000 tops out at (ubound - 1) * 1000,
# which is always below max_trips, so the busiest taxi would fall outside
# every bin; np.arange(ubound + 1) * 1000 would cover it.
# ubound = int(math.ceil(max_trips / 1000))
# taxi_counts.groupby(pd.cut(taxi_counts, bins=list(np.arange(ubound) * 1000))).size()

In [10]:
# Keep the first few rows as a small sample for the POLYLINE parsing below.
head = train_df.head()

In [25]:
# Each POLYLINE value is a JSON-encoded list of coordinate pairs. Decode the
# sampled rows one at a time and report the resulting array shape.
polyline_strings = head["POLYLINE"].to_array()
for polyline_json in polyline_strings:
    print(polyline_json)
    coords = np.array(json.loads(polyline_json))
    print(coords.shape)

[[-8.618643,41.141412],[-8.618499,41.141376],[-8.620326,41.14251],[-8.622153,41.143815],[-8.623953,41.144373],[-8.62668,41.144778],[-8.627373,41.144697],[-8.630226,41.14521],[-8.632746,41.14692],[-8.631738,41.148225],[-8.629938,41.150385],[-8.62911,41.151213],[-8.629128,41.15124],[-8.628786,41.152203],[-8.628687,41.152374],[-8.628759,41.152518],[-8.630838,41.15268],[-8.632323,41.153022],[-8.631144,41.154489],[-8.630829,41.154507],[-8.630829,41.154516],[-8.630829,41.154498],[-8.630838,41.154489]]
(23, 2)
[[-8.639847,41.159826],[-8.640351,41.159871],[-8.642196,41.160114],[-8.644455,41.160492],[-8.646921,41.160951],[-8.649999,41.161491],[-8.653167,41.162031],[-8.656434,41.16258],[-8.660178,41.163192],[-8.663112,41.163687],[-8.666235,41.1642],[-8.669169,41.164704],[-8.670852,41.165136],[-8.670942,41.166576],[-8.66961,41.167962],[-8.668098,41.168988],[-8.66664,41.170005],[-8.665767,41.170635],[-8.66574,41.170671]]
(19, 2)
[[-8.612964,41.140359],[-8.613378,41.14035],[-8.614215,41.140278],[-8

AttributeError: 'Series' object has no attribute 'transform'