In [None]:
%matplotlib inline

import sys
import os

# The notebook lives one level below the project root; put the parent directory
# on sys.path (once) so that the `core` package can be imported.
project_root = os.path.join(os.getcwd(), './../')
if not any(entry.endswith('./../') for entry in sys.path):
    sys.path.insert(0, project_root)

import requests
import io
import os
import s3fs
import glob

import dask.dataframe as dd
import pandas as pd
import numpy as np

from dask import delayed
from distributed import Client
from distributed import progress, wait
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from core.data_acquisition import TaxiData

# Cleaning routine from the project's TaxiData helper (applied per partition
# further down via `map_partitions`)
clean_df = TaxiData.clean_df

# Anonymous (credential-free) handle on S3
fs = s3fs.S3FileSystem(anon=True)

# Connect to the dask scheduler, restart it for a clean state, and display
# the client summary as the cell output
SCHEDULER_ADDRESS = 'tcp://0.0.0.0:8786'
client = Client(SCHEDULER_ADDRESS)
client.restart()
client

In [None]:
!pip install graphviz

In [None]:
import time
import random

@delayed
def add(x, y):
    """Return ``x + y`` after sleeping for a random fraction of a second."""
    pause = random.random()
    time.sleep(pause)
    return x + y

@delayed
def subtract_1(x):
    """Return ``x - 1`` after sleeping for a random fraction of a second."""
    pause = random.random()
    time.sleep(pause)
    return x - 1

@delayed
def summer(arr):
    """Return the sum of ``arr`` after sleeping for a random fraction of a second."""
    pause = random.random()
    time.sleep(pause)
    result = sum(arr)
    return result
    
# Eight independent add -> subtract_1 chains, reduced by a single summer task
sub_results = [subtract_1(add(i, i*2)) for i in range(8)]
total = summer(sub_results)

# Render the task graph (nothing has been executed yet)
total.visualize()

In [None]:
# Execute the delayed task graph built above and return the final sum
total.compute()

## Read in and Look at Taxi Data...

In [None]:
# Lazily open every 2015 taxi CSV in the public bucket (anonymous S3 access)
TAXI_2015_GLOB = 's3://dask-data/nyc-taxi/2015/*.csv'
df = dd.read_csv(TAXI_2015_GLOB, storage_options={'anon': True})
df

## Nothing is actually computed yet, just metadata sampling; let's persist it to cluster RAM

In [None]:
# Parse both timestamp columns; values that cannot be parsed become NaT
for column in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df[column] = dd.to_datetime(df[column], yearfirst=True, errors='coerce')

# Load the frame into cluster memory and show a progress bar while it loads
df = client.persist(df)
progress(df)

In [None]:
# Peek at the first rows of the persisted frame
df.head()

Summary stats...

In [None]:
# Per-column count of non-null values; this is still lazy, so displaying it
# only shows the dask structure, not the numbers
count = df.count()
count

In [None]:
# Materialise the counts and show the number of non-null VendorID values
count.compute().VendorID

In [None]:
# Summary statistics of the total fare, cast to int for readability
df.total_amount.describe().compute().astype(int)

## How many are below 0 and how many are above the 99% quantile?

In [None]:
# Count trips with a negative fare and trips above the 99% fare quantile.
# Rows are counted through VendorID, masked wherever the condition is false.
fare = df.total_amount
vendor = df.VendorID

negative_fares = vendor.where(fare < 0).count().compute()
print('Count of fares below  0: ', negative_fares)

_99th_quantile = fare.quantile(q=0.99).compute()
print('99% quantile: ', _99th_quantile)

high_fares = vendor.where(fare > _99th_quantile).count().compute()
print('Count of fares over  99%: ', high_fares)

## Let's limit the data to fares between 0 and 70

In [None]:
# Keep only trips whose total fare lies between 0 and 70 (inclusive).
# Boolean indexing actually drops the other rows; `df.where(...)` would keep
# them as all-NaN rows and upcast integer columns to float.
df = df[df.total_amount.between(0, 70)]

# Number of remaining trips (non-null VendorID values)
df.count().compute().VendorID

---
# A little bit of analysis...

## Do more passengers == longer distances?

In [None]:
# Mean trip distance for each passenger count
grouped = (
    df.groupby(df.passenger_count)
      .trip_distance
      .mean()
      .compute()
)

fig, ax = plt.subplots(figsize=(10, 8))
grouped.plot.barh(ax=ax)
ax.set_title('Passenger Count vs Mean Distance Traveled')
ax.set_xlabel('Distance')
plt.show()

### Average speed by hour, maybe?

In [None]:
# Calculate speed of trip
speed = (df.trip_distance / ((df.tpep_dropoff_datetime - df.tpep_pickup_datetime).astype('timedelta64[m]').astype(float) / 60))

# Replace inf values with NaNs
speed = speed.map(lambda val: val if val not in [np.inf, -np.inf] else np.NaN)

# Replace extreme values with NaNs
low = speed.quantile(q=0.01).compute()
high = speed.quantile(q=0.9).compute()
speed = speed.map(lambda val: val if low < val < high and not pd.isnull(val) else np.NaN)

# Assign columns
df['speed'] = speed
df['hour'] = df.tpep_pickup_datetime.dt.hour

df = df.persist()
progress(df)

In [None]:
# Mean speed for each pickup hour
speed_by_hour = df.groupby('hour').speed.mean().compute()

fig, ax = plt.subplots(figsize=(10, 8))
speed_by_hour.plot.line(ax=ax)
ax.set_xlabel('Hour')
ax.set_ylabel('Speed')
ax.set_title('Hour of day and average speed')
ax.grid(True)
plt.show()

## Does faster speed == better tip?

In [None]:


# Only rows with a usable speed value
tmp = df.dropna(subset=['speed'])
tmp = tmp.persist()

# Bucket speeds into bins of 5 (0, 5, 10, ...), truncating toward zero.
# The earlier expression int(5 * (float(val) / 5)) multiplied and divided by 5
# before truncating, so it only dropped the fractional part and never binned.
tmp['speed_rounded'] = (tmp.speed / 5).astype(int) * 5
tips_by_speed = tmp.groupby('speed_rounded').tip_amount.mean().compute()

plt.figure(figsize=(10, 8))
tips_by_speed.plot.line()
plt.xlabel('Speed')
plt.ylabel('Tip Amount')
plt.title('Speed vs Tip Amount')
plt.grid(True)
plt.show()

## When are the best tip fractions?

In [None]:
# Discard rows without a positive tip and fare, then express the tip as a
# fraction of the fare
has_tip_and_fare = (df.tip_amount > 0) & (df.fare_amount > 0)
df2 = df[has_tip_and_fare]
df2 = df2.assign(tip_fraction=df2.tip_amount / df2.fare_amount)

# Mean tip fraction per pickup weekday and per pickup hour
pickup_time = df2.tpep_pickup_datetime
dayofweek = df2.groupby(pickup_time.dt.dayofweek).tip_fraction.mean().compute()
hour = df2.groupby(pickup_time.dt.hour).tip_fraction.mean().compute()

In [None]:
# Mean tip fraction by pickup hour
plt.figure(figsize=(10, 8))
hour.plot.line()
plt.ylabel('Tip fraction of Fare')
plt.xlabel('Hour')
plt.title('Fraction of Tip by Hour')
plt.show()

print('----- By Day of Week -----')

# Mean tip fraction by pickup weekday
plt.figure(figsize=(10, 8))
ax = dayofweek.plot.line()
# pandas' `dt.dayofweek` numbers Monday as 0 and Sunday as 6. Pin one tick per
# day before labelling, instead of relying on the default tick positions.
ax.set_xticks(range(7))
ax.set_xticklabels(['Mon', 'Tu', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
ax.set_ylabel('Tip fraction of Fare')
ax.set_xlabel('Day of week')
ax.set_title('Fraction of Tip by Day of Week')
plt.show()

## ML with Dask


In [None]:
# Restart the dask cluster before the ML section
# (presumably to release the data persisted above - confirm)
client.restart()

In [None]:
# Columns that are not used as model features
exclude_cols = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'VendorID', 'tip_amount',
                'RateCodeID', 'store_and_fwd_flag', 'payment_type']

X = dd.read_csv('s3://dask-data/nyc-taxi/2015/*.csv',
                storage_options={'anon': True})

# Feature columns = everything not excluded; drop rows missing any of them
# NOTE(review): total_amount stays among the features; if it includes the tip
# it leaks the label - confirm.
columns = [col for col in X.columns if col not in exclude_cols]
X = X.dropna(subset=columns)

# Binary target: 1 when the tip exceeds 10, else 0
y = (X.tip_amount > 10).astype(int)

# Restrict X to the feature columns and load both into cluster memory
X = X[columns].persist()
y = y.persist()

progress(X)
X.head()

### Randomly split the data between training and testing

In [None]:
# 70/30 split of features and target using the same seed for both.
# NOTE(review): keeping X and y rows aligned relies on both collections having
# identical partitioning and the same random_state - confirm.
xTrain, xTest = (part.persist() for part in X.random_split([0.70, 0.30], random_state=1234))
yTrain, yTest = (part.persist() for part in y.random_split([0.70, 0.30], random_state=1234))
progress(xTrain)

### Train the model and make predictions

In [None]:
import dask.dataframe as dd
import pandas as pd
from dask_ml.linear_model import LogisticRegression

# Fit a logistic regression (at most 4 solver iterations) on the training
# split, passing the underlying dask arrays
train_features = xTrain.values
train_labels = yTrain.values

clf = LogisticRegression(max_iter=4)
clf.fit(train_features, train_labels)

In [None]:
# Score the fitted model on the held-out split and materialise the result
clf.score(xTest.values, yTest.values).compute()

## InfluxDB

In [None]:
# Load only the November 2015 file (anonymous S3 access)
df = dd.read_csv('s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-11.csv',
                 storage_options={'anon': True})

# Parse both timestamp columns; values that cannot be parsed become NaT
for column in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df[column] = dd.to_datetime(df[column], yearfirst=True, errors='coerce')

# Load the frame into cluster memory and show a progress bar while it loads
df = client.persist(df)
progress(df)

## Data processing script

In [None]:
!pip install influxdb

In [None]:
from influxdb import InfluxDBClient

# Use a dedicated name for the InfluxDB connection: `client` is the dask
# distributed Client and is still needed by the cells that follow.
# Credentials can be supplied through the environment; the defaults keep the
# previous hard-coded values. Arguments: host, port, username, password, database.
influx_client = InfluxDBClient(
    'localhost',
    8086,
    os.environ.get('INFLUXDB_USER', 'admin'),
    os.environ.get('INFLUXDB_PASSWORD', 'admin'),
    os.environ.get('INFLUXDB_DATABASE', 'admin'),
)

points = []
batch_size = 1000

# Only the drop-off time and the distance are written
chunk = df.loc[:, ['tpep_dropoff_datetime', 'trip_distance']]

for _, row in chunk.iterrows():

    dropoff_time = row['tpep_dropoff_datetime']
    distance = row['trip_distance']

    # Timestamps were parsed with errors='coerce', so NaT can occur; such rows
    # (and rows without a distance) cannot form a valid point, so skip them.
    if pd.isnull(dropoff_time) or pd.isnull(distance):
        continue

    point = {
        "measurement": "distance",
        "tags": {
            "year": 2015,
            "month": 11
        },
        "time": dropoff_time,
        "fields": {
            "value": distance
        }
    }

    points.append(point)

    # Flush in batches to bound memory use and request size
    if len(points) >= batch_size:
        influx_client.write_points(points)
        points.clear()

# Flush the final partial batch, if there is one
if points:
    influx_client.write_points(points)

In [None]:
import sys
# Clean every yellow-cab file year by year, write each year to S3 and keep a
# running total of rows. `client` must be the dask distributed Client here.
counts = []

# List the source bucket once instead of once per year
all_files = fs.ls('nyc-tlc/trip data/')

# Loop through years 2009-2017
for year in range(2009, 2018):

    # Get only files pertaining to this year
    files = [f for f in all_files if str(year) in f and 'yellow' in f]

    # Nothing to do for this year; without this guard the previous year's
    # frame would be written again (or `main_df` would be undefined)
    if not files:
        continue

    # Process files in parallel. (client is asynchronous)
    monthly_frames = []
    for file in files:

        # Read everything as strings and let `clean_df` handle each partition.
        # NOTE(review): `error_bad_lines` was removed in pandas 2.0 in favour of
        # `on_bad_lines='skip'`; switch once the environment's pandas allows it.
        month_df = dd.read_csv('s3://' + file,
                               dtype='object',
                               error_bad_lines=False,
                               blocksize=int(128e6))
        month_df = month_df.map_partitions(clean_df)
        monthly_frames.append(client.persist(month_df))

    # Yearly dataframe merging (`DataFrame.append` is deprecated/removed)
    main_df = dd.concat(monthly_frames)

    # Write year's df to S3
    main_df.to_csv('s3://milesg-taxi-data-east/yellow-{year}-*.csv.gz'.format(year=year), compression='gzip')
    counts.append(main_df.passenger_count.count().compute())
    sys.stdout.write('\rYear: {} - Total {}'.format(year, sum(counts)))

    # Clear from cluster memory
    client.cancel(main_df)


In [None]:
# Restart the dask cluster (expects `client` to be the distributed Client)
client.restart()

In [None]:
# Shut the dask cluster down (expects `client` to be the distributed Client)
client.shutdown()