# Introduction to hvplot

## A familiar and high-level API for data exploration and visualization

![image](https://hvplot.holoviz.org/assets/diagram.svg)

`.hvplot()` is a powerful and interactive Pandas-like .plot() API

**By replacing .plot() with .hvplot() you get an interactive figure.**

In [None]:
import pandas as pd

In [None]:
# note: this needs to be in its own cell to avoid race conditions
import hvplot
# Imported for its side effect: the cells below call `.hvplot` on pandas objects.
import hvplot.pandas # noqa

# Select Bokeh as the plotting backend used by hvplot in this notebook.
hvplot.extension('bokeh')

In [None]:
# delete once we have public read bucket
import gcsfs
import json

# Read the token from the local JSON file. The context manager closes the
# file handle right away instead of leaving it open until garbage collection.
with open("credentials.json") as credentials_file:
    token = json.load(credentials_file)

fs = gcsfs.GCSFileSystem(token=token)
# Handed to `pd.read_parquet` below so it can use the same token.
storage_options = {"token": token}

## Let's read in 1 year of data from parquet into pandas

In [None]:
# Columns requested from the parquet dataset (passed to `pd.read_parquet`).
columns = [
    'MONTH',
    'DAY_OF_MONTH',
    'DAY_OF_WEEK',
    'FL_DATE',
    'OP_CARRIER',
    'TAIL_NUM',
    'OP_CARRIER_FL_NUM',
    'ORIGIN',
    'DEST',
    'CRS_DEP_TIME',
    'DEP_TIME',
    'DEP_DELAY',
    'ARR_TIME',
    'ARR_DELAY',
    'CANCELLED',
    'CANCELLATION_CODE',
    'DIVERTED',
    'AIR_TIME',
    'FLIGHTS',
    'DISTANCE',
    'CARRIER_DELAY',
    'WEATHER_DELAY',
    'NAS_DELAY',
    'SECURITY_DELAY',
    'LATE_AIRCRAFT_DELAY',
    'DIV_ARR_DELAY',
]

In [None]:
# Load only the requested columns, keeping rows where YEAR equals 2022.
# The path is a plain string: it has no placeholders, so no f-string is needed.
flights = pd.read_parquet(
    "gcs://quansight-datasets/airline-ontime-performance/parquet_by_year",
    filters=[('YEAR', '=', 2022)],
    columns=columns,
    storage_options=storage_options,
)

In [None]:
# Report the row count, shrink the dataset to four carriers, then report again.
print(f"Before : {len(flights)} rows")
flights = flights.loc[flights['OP_CARRIER'].isin(['AA', 'UA', 'WN', 'DL'])]
print(f"After = {len(flights)} rows")

## hvplot as a pandas.plot replacement, easy interactivity

In [None]:
# Mean departure delay per flight date, drawn with the regular pandas `.plot()`.
(
    flights
    .groupby('FL_DATE')["DEP_DELAY"]
    .mean()
    .plot()
)

In [None]:
# Same aggregation as the pandas version, with `.plot()` swapped for `.hvplot()`.
(
    flights
    .groupby('FL_DATE')["DEP_DELAY"]
    .mean()
    .hvplot()
)

In [None]:
# Exercise
# Add Arrival Delay to the plot
# See if the day of week or day of month makes a difference
# how about looking at the maximum delay or cumulative delay

# solution: mean departure and arrival delay per month, one line per column
(
    flights
    .groupby('MONTH')[["DEP_DELAY", "ARR_DELAY"]]
    .mean()
    .hvplot()
)

## Ok so I get zoom, meh.

In [None]:
# Restrict to four carriers, then draw one departure-delay histogram per carrier.
flight_subset = flights.loc[flights['OP_CARRIER'].isin(['AA', 'UA', 'WN', 'DL'])]
flight_subset.hvplot.hist(
    'DEP_DELAY',
    by='OP_CARRIER',
    bins=20,
    bin_range=(-20, 100),
    width=300,
    subplots=True,
)

## Easy Widgets

In [None]:
# `groupby` (instead of `by`) gives a single histogram with a carrier selector widget.
flights.hvplot.hist(
    'DEP_DELAY',
    groupby='OP_CARRIER',
    bins=20,
    bin_range=(-20, 100),
    width=300,
)

In [None]:
import numpy as np

In [None]:
# Min / mean / max departure delay per (day of week, carrier).
# Named aggregation pins the output column names to 'amin', 'mean', 'amax'.
# Passing np.min / np.max instead yields labels that vary with the installed
# NumPy/pandas versions, and the area plot below requires 'amin' and 'amax'.
delays = flights.groupby(['DAY_OF_WEEK', 'OP_CARRIER'])['DEP_DELAY'].agg(
    amin='min',
    mean='mean',
    amax='max',
)

In [None]:
# Show the first rows of the per-day, per-carrier delay summary.
delays.head()

## Compose Plots 

In [None]:
# Area spanning the 'amin' to 'amax' columns for each day of week, one carrier at a time.
min_max_plot = delays.hvplot.area(
    x='DAY_OF_WEEK',
    y='amin',
    y2='amax',
    alpha=0.2,
    groupby="OP_CARRIER",
)

In [None]:
# Line for the 'mean' column per day of week, selectable by carrier.
mean_plot = delays['mean'].hvplot.line(
    x='DAY_OF_WEEK',
    groupby="OP_CARRIER",
)

In [None]:
# `+` composes the two plots into a layout, shown next to each other.
min_max_plot + mean_plot

In [None]:
# `*` overlays the two plots on the same axes.
min_max_plot * mean_plot

## Use hvplot data explorer to get a feel for the data set

In [None]:
# exercise (this needs work)
# Build the hvplot explorer for the flights frame and keep a handle to it;
# the bare `explorer` on the last line displays it as the cell output.
explorer = hvplot.explorer(flights)
explorer

In [None]:
# save settings from explorer after picking some options
# (run this only after the explorer cell above has been executed)
explorer.plot_code()

In [None]:
# look for columns associated with Delays (i.e. "DEL")
# do this one with the explorer
# A dedicated name is used so the `columns` list that drives the parquet load
# is not overwritten; otherwise re-running the load cell would fetch only these.
delay_columns = [col for col in flights.columns if "DEL" in col]
flights.hvplot.violin(
    y=delay_columns,
    group_label='Type of Delay',
    value_label='Delay in Minutes',
    invert=True,
    groupby="OP_CARRIER",
)

In [None]:
# exercise look at cancellations and reasons
# The aggregations must be passed as a list. Extra positional arguments to
# `agg` are forwarded to the first function, not treated as more aggregations.
# Result: per flight date, the number of non-null CANCELLED values ('count')
# and their total ('sum').
flights.groupby('FL_DATE')['CANCELLED'].agg(['count', 'sum']).hvplot()

In [None]:
# exercise
# Integer-divide the scheduled departure time by 100 to get the hour
# (presumably the values are HHMM-encoded; confirm against the dataset docs).
flights['DEP_HOUR'] = flights['CRS_DEP_TIME'].astype(int).floordiv(100)

In [None]:
# Mean departure delay for each scheduled departure hour, as a bar chart.
(
    flights
    .groupby('DEP_HOUR')['DEP_DELAY']
    .mean()
    .hvplot.bar()
)

## geoplot

In [None]:
# Airport reference table, read over HTTP.
# TODO: should probably download this
airports = pd.read_csv(
    'https://raw.githubusercontent.com/ip2location/ip2location-iata-icao/master/iata-icao.csv'
)

In [None]:
#airports = airports.set_index('iata')

In [None]:
# Mean departure delay per origin airport (a Series indexed by ORIGIN).
airport_delays = (
    flights
    .groupby('ORIGIN')['DEP_DELAY']
    .mean()
)

In [None]:
# Join the per-origin delays to the airports table by matching ORIGIN against `iata`.
airport_delays = pd.merge(
    left=airport_delays,
    right=airports,
    left_on='ORIGIN',
    right_on='iata',
)

In [None]:
# need to install geoviews for this to work
# NOTE(review): confirm the airports CSV names its coordinate columns exactly
# 'Longitude' / 'Latitude' (check the casing in the file header).
airport_delays.hvplot.points(
    'Longitude',
    'Latitude',
    geo=True,
    c='DEP_DELAY',
    alpha=0.2,
    xlim=(-180, -30),
    ylim=(0, 72),
    tiles='ESRI',
)

## Panel example later? once we have Dask.