# Homework 6

**Group Members:**  
Phoebe Ling (ycling2)  
Shaojun Zheng (shaojun3)

In [1]:
# import usual things
import pandas as pd
import bqplot
import numpy as np
import ipywidgets
import matplotlib.pyplot as plt

In [2]:
# Location of the licence CSV (raw file hosted on GitHub) that pandas reads below.
data_url = (
    'https://raw.githubusercontent.com/UIUC-iSchool-DataViz/'
    'is445_bcubcg_fall2022/main/data/licenses_fall2022.csv'
)

In [3]:
# Load the licence records and ask pandas to parse every date-like column up front.
licenses = pd.read_csv(
    data_url,
    parse_dates=[
        'Original Issue Date',
        'Effective Date',
        'Expiration Date',
        'LastModifiedDate',
        'Discipline Start Date',
        'Discipline End Date',
    ],
)
licenses.columns

Index(['_id', 'License Type', 'Description', 'License Number',
       'License Status', 'Business', 'Title', 'First Name', 'Middle',
       'Last Name', 'Prefix', 'Suffix', 'Business Name', 'BusinessDBA',
       'Original Issue Date', 'Effective Date', 'Expiration Date', 'City',
       'State', 'Zip', 'County', 'Specialty/Qualifier',
       'Controlled Substance Schedule',
       'Delegated Controlled Substance Schedule', 'Ever Disciplined',
       'LastModifiedDate', 'Case Number', 'Action', 'Discipline Start Date',
       'Discipline End Date', 'Discipline Reason'],
      dtype='object')

In [4]:
# Parse the expiration dates as month/day/year; values that do not fit become NaT.
licenses['Expiration Date'] = pd.to_datetime(
    licenses['Expiration Date'],
    format='%m/%d/%Y',
    errors='coerce',
)

In [5]:
# Validity period of each licence as a timedelta: expiration date minus effective date.
licenses['Expiration-Effective'] = licenses['Expiration Date'].sub(licenses['Effective Date'])

In [6]:
# Validity period expressed as a (possibly fractional) number of days.
# Dividing the whole timedelta column by a one-day timedelta is vectorised,
# so no per-row Python lambda is needed; missing periods (NaT) become NaN.
licenses['day'] = licenses['Expiration-Effective'] / np.timedelta64(1, 'D')

**use the year of Original Issue Date as year**

In [7]:
# Parse the original issue dates as month/day/year; unparseable values become NaT.
licenses['Original Issue Date'] = pd.to_datetime(
    licenses['Original Issue Date'],
    format='%m/%d/%Y',
    errors='coerce',
)
# Keep only the calendar year; it is the grouping key for the bar chart.
licenses['Original_Issue_Year'] = licenses['Original Issue Date'].dt.year

**get pivot table from licenses dataset**

In [8]:
def generate_pivot_table_from_type_status(licenses, takeLog = True):
    """Build the status-by-type table of mean licence durations.

    Parameters
    ----------
    licenses : pandas.DataFrame
        Must contain the columns 'License Status', 'License Type' and 'day'.
    takeLog : bool, default True
        If True, every entry that is <= 0 (including the 0 used to fill
        status/type combinations without data) is replaced by NaN and the
        base-10 logarithm of the remaining entries is returned.

    Returns
    -------
    pitable : pandas.DataFrame
        Mean of 'day', indexed by 'License Status' with one column per
        'License Type' (log10-transformed when ``takeLog`` is True).
    license_status : numpy.ndarray of str
        Row labels of ``pitable``.
    license_type : numpy.ndarray of str
        Column labels of ``pitable``.
    """
    # The string 'mean' selects pandas' own aggregation; passing the numpy
    # callable np.mean triggers a FutureWarning in recent pandas versions.
    pitable = pd.pivot_table(licenses, values='day', index=['License Status'],
                             columns=['License Type'], aggfunc='mean', fill_value=0)
    license_status = pitable.index.values.astype('str')
    license_type = pitable.columns.values.astype('str')
    if takeLog:
        # log10 is undefined for values <= 0, so keep only positive entries
        # (everything else becomes NaN) before taking the logarithm.
        pitable = np.log10(pitable.where(pitable > 0))
    return pitable, license_status, license_type

In [9]:
# log10 mean-duration table plus its row (status) and column (type) labels for the heat map.
(pitable, license_status, license_type) = generate_pivot_table_from_type_status(
    licenses, takeLog=True
)

## histogram

In [10]:
# Scales and axes for the bar chart (`fig_right`).
# Years are treated as discrete categories, bar heights as continuous values.
x_scl2 = bqplot.OrdinalScale()
y_scl2 = bqplot.LinearScale()
x_axs2 = bqplot.Axis(label='Year', scale=x_scl2)
# The bars show the per-year median of the 'day' column, so the unit is days
# (the previous label claimed a total in seconds).
y_axs2 = bqplot.Axis(label='Median duration (days)', scale=y_scl2, orientation='vertical', side='left')

In [11]:
# Initial state of the bar chart: median duration (in days) per original-issue
# year, computed over all licences.
year_group = licenses.groupby("Original_Issue_Year")["day"].median()
years, values = year_group.index, year_group.values

hist_median = bqplot.Bars(
    x=years,
    y=values,
    scales={'x': x_scl2, 'y': y_scl2},
)
fig_right = bqplot.Figure(marks=[hist_median], axes=[x_axs2, y_axs2])

## heat map

In [12]:
# Text label that `on_selected` fills with the value of the selected heat-map cell.
myLabel = ipywidgets.Label()
def on_selected(change):
    """Update the label and the bar chart after a heat-map selection change.

    Parameters
    ----------
    change : dict
        Change notification whose ``'owner'`` exposes a ``selected``
        attribute holding the selected ``(row, column)`` index pairs.

    Only a selection of exactly one cell is handled. For that cell the
    label shows the log10 mean duration, and the bars show the per-year
    median of 'day' for the matching licence type/status. If the cell has
    no value (NaN), the bars fall back to the medians over all licences.
    """
    selected = change['owner'].selected
    # Nothing selected (None or empty) or several cells selected: leave the
    # dashboard untouched. NOTE(review): `selected` is presumably None after
    # a deselect; guarding avoids a TypeError from len(None) — confirm.
    if selected is None or len(selected) != 1:
        return

    i, j = selected[0]
    v = pitable.iloc[i, j]
    myLabel.value = 'mean day in log10 '+str(v)

    if pd.isna(v):
        # No usable data for this type/status pair: show the unfiltered chart.
        subset = licenses
    else:
        region_mask = ((licenses['License Type'] == license_type[j]) &
                       (licenses['License Status'] == license_status[i]))
        subset = licenses[region_mask]

    year_group = subset.groupby("Original_Issue_Year")["day"].median()
    hist_median.x = year_group.index
    hist_median.y = year_group.values


# --- scales: both heat-map axes are categorical; the colour scale spans the
# --- finite (non-NaN) values of the pivot table.
x_sc = bqplot.OrdinalScale()
y_sc = bqplot.OrdinalScale()
col_sc = bqplot.ColorScale(
    scheme="Blues",
    min=np.nanmin(pitable),
    max=np.nanmax(pitable),
)

# --- axes
x_ax = bqplot.Axis(scale=x_sc, label='License Type')
y_ax = bqplot.Axis(scale=y_sc, label='License Status', orientation='vertical')
c_ax = bqplot.ColorAxis(scale=col_sc, orientation='vertical', side='right')

# --- mark: clicking a cell selects it (drawn in red) and triggers `on_selected`.
heat_map = bqplot.GridHeatMap(
    color=pitable,
    row=license_status,
    column=license_type,
    scales={'color': col_sc, 'row': y_sc, 'column': x_sc},
    interactions={'click': 'select'},
    selected_style={'fill': 'red'},
)
heat_map.observe(on_selected, 'selected')

fig_left = bqplot.Figure(marks=[heat_map], axes=[c_ax, y_ax, x_ax])

# --- layout: heat map and bar chart side by side, with the label on top.
fig_left.layout.min_width = '500px'
fig_right.layout.min_width = '500px'
figures = ipywidgets.HBox([fig_left, fig_right])
myDashboard = ipywidgets.VBox([myLabel, figures])


In [13]:
# Render the dashboard: label on top, heat map and bar chart below it.
myDashboard

VBox(children=(Label(value=''), HBox(children=(Figure(axes=[ColorAxis(orientation='vertical', scale=ColorScale…

## Things to think about

We tried a lot of methods to fix the x and y ranges. We duplicated the heat map code and made a small change. Afterwards, the x and y ranges of the original bar plot stayed fixed, but the duplicated bar plot no longer changed when we selected a cell in the heat map. However, we could not find which changed element caused the x and y ranges to become static.

We can change the color of heat map by changing `scheme` in `col_sc = bqplot.ColorScale(scheme="Blues", min=np.nanmin(pitable), max=np.nanmax(pitable))`.  
The color of the bar chart can also be changed by adding the `colors` attribute, for example `hist_median = bqplot.Bars(x=years, y=values, scales={'x':x_scl2, 'y':y_scl2}, colors=['red'])`.

## write-up

We use `pivot_table` to draw a heat map because the x and y are categorical. We also take the log of the mean day; otherwise, it is hard to differentiate the color saturation because some numbers are too large and others are too small. Also, we used groupby to get the median day of each year.

We use an `if`/`else` to deal with NaN values. When there is no data for a certain type and status, the bar plot falls back to the original data (no type or status selected).

For aesthetics, we keep both the heat map and the bar plot blue. We also tried to keep the category names from overlapping but failed; we would try to solve the overlap problem if we had more time. We would also like to show the value when the mouse hovers over the graph.

## Test during coding

The following cells are only the tests we ran while working out what we wanted to show. They are not needed for the dashboard, and some of them raise errors.

In [None]:
# Spot check: the date and duration columns of all active dental licences.
example = licenses.loc[
    (licenses['License Type'] == 'DENTAL') & (licenses['License Status'] == 'ACTIVE'),
    ['Expiration Date', 'Effective Date', 'day', 'Original_Issue_Year'],
]
example

In [None]:
# Spot check: the date and duration columns of all expired appraisal licences.
licenses.loc[
    (licenses['License Type'] == 'APPRAISAL') & (licenses['License Status'] == 'EXPIRED'),
    ['Expiration Date', 'Effective Date', 'day'],
]

In [None]:
# Mean duration (in days) of the `example` subset. The duration column is
# named 'day'; `example` has no 'exp-eff days' column, so the old lookup
# raised a KeyError.
example['day'].mean()

In [None]:
# Distinct licence types, in order of first appearance.
pd.unique(licenses['License Type'])

In [None]:
# Distinct licence statuses, in order of first appearance.
pd.unique(licenses['License Status'])

In [None]:
# Scratch experiment: summary statistics of 'day' per original-issue year for a filtered subset.
# NOTE(review): `region_mask` is not assigned by any top-level statement in this notebook
# (it only appears as a local variable inside `on_selected`), so this cell presumably
# raises NameError on a fresh kernel — confirm before re-using.
year_group = licenses[region_mask].groupby("Original_Issue_Year")["day"].describe()
# NOTE(review): the whole describe() table is passed to np.histogram, so all summary
# statistics appear to be binned together — TODO confirm this was intended.
med, med_edges = np.histogram(year_group, bins=len(year_group)+2) 

In [None]:
def plot_bar(i=0,j=0,mask=False):
    """Build a bar chart of the median licence duration per original-issue year.

    Parameters
    ----------
    i : int, default 0
        Position into ``license_status``; only used when ``mask`` is True.
    j : int, default 0
        Position into ``license_type``; only used when ``mask`` is True.
    mask : bool, default False
        If True, restrict the data to licences whose status is
        ``license_status[i]`` and whose type is ``license_type[j]``;
        otherwise use every licence.

    Returns
    -------
    bqplot.Figure
        Figure holding a single ``Bars`` mark (x = year, y = median days).
    """
    x2_scl = bqplot.LinearScale()
    y2_scl = bqplot.LinearScale()
    x2_axs = bqplot.Axis(label='Year', scale=x2_scl)
    y2_axs = bqplot.Axis(label='median number of days', scale=y2_scl, orientation='vertical')

    if mask:
        # Build the filter from the requested cell; the function previously
        # relied on a global `region_mask` that is never defined at notebook level.
        region_mask = ((licenses['License Type'] == license_type[j]) &
                       (licenses['License Status'] == license_status[i]))
        subset = licenses[region_mask]
    else:
        subset = licenses

    median_by_year = subset.groupby("Original_Issue_Year")["day"].median()
    hist_median = bqplot.Bars(x=median_by_year.index, y=median_by_year,
                              scales={'x':x2_scl, 'y':y2_scl})
    fig_right = bqplot.Figure(marks=[hist_median], axes=[x2_axs,y2_axs])
    return fig_right

In [None]:
# Scratch experiment: stand-alone bar chart of the per-year median duration
# (all licences, linear x scale).
# NOTE: this cell rebinds the notebook-level names `hist_median` and `fig_right`,
# which the dashboard cells above also use.
year_group = licenses.groupby("Original_Issue_Year")["day"].describe()

x2_scl, y2_scl = bqplot.LinearScale(), bqplot.LinearScale()
x2_axs = bqplot.Axis(label='Year', scale=x2_scl)
y2_axs = bqplot.Axis(label='median number of days', scale=y2_scl, orientation='vertical')

hist_median = bqplot.Bars(
    x=year_group.index,
    y=year_group['50%'],  # the 50% quantile from describe()
    scales={'x': x2_scl, 'y': y2_scl},
)
fig_right = bqplot.Figure(marks=[hist_median], axes=[x2_axs, y2_axs])
fig_right

In [None]:
# Scratch experiment: bin the issue years into 10 bins, weighting each year by
# its median duration, and draw the binned totals as bars.
# NOTE: this cell rebinds the notebook-level names `hist_median` and `fig_right`,
# which the dashboard cells above also use.
year_group = licenses.groupby("Original_Issue_Year")["day"].describe()
day, day_edges = np.histogram(
    year_group.index,
    weights=year_group['50%'],
    bins=10,
)
# Bin centres: midpoint between each pair of neighbouring edges.
day_centers = (day_edges[1:] + day_edges[:-1]) / 2

x2_scl, y2_scl = bqplot.LinearScale(), bqplot.LinearScale()
x2_axs = bqplot.Axis(label='Year', scale=x2_scl)
y2_axs = bqplot.Axis(
    label='median number of days',
    scale=y2_scl,
    orientation='vertical',
    side='left',
)

hist_median = bqplot.Bars(x=day_centers, y=day, scales={'x': x2_scl, 'y': y2_scl})
fig_right = bqplot.Figure(marks=[hist_median], axes=[x2_axs, y2_axs])
fig_right