In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns


In [2]:
from molten.data_drift.kdq_tree import KdqTree

In [8]:
# Drift 1: change the mean & var of line item B in 2009, means will revert for 2010 on
# Drift 2: change the variance of line item c and d in 2012 by replacing some with the mean
# keep same mean as other years, revert by 2013
# Drift 3: change the correlation of line item e and f in 2015 (go from correlation of 0 to correlation of 0.5)
# Drift 4: change mean and var of H and persist it from 2018 on
# Drift 5: change mean and var just for a year of J in 2021

#Then, assuming the test batch becomes the new reference batch upon drift
#detection, accurate years for drift detection would be roughly:
drift_years = pd.DataFrame({'tax_yr':[2009, 2010, 2012, 2013, 2015, 2016, 2018, 2021]})
drift_years['drift_true'] = 'drift'

In [4]:
df_orig = pd.read_csv(os.path.join("src", "molten", "tools", "artifacts", "fake_wls_eligibility_FINAL.csv"),
                index_col = 'tin')

In [5]:
df = pd.concat([df_orig, pd.get_dummies(df_orig.fil_stat, prefix='fil_stat')], axis=1)

In [6]:
df.drop(columns=['fil_stat'], inplace=True)

In [23]:
plot_data = {}
np.random.seed(123)
status = pd.DataFrame(columns=['tax_yr', 
                                'kld_crit', 
                                'kld_curr',
                                'drift'])
det = KdqTree(stream=False)
for group, subdf in df.groupby('tax_yr'):
    det.update(subdf.drop(columns=['tax_yr']).values)
    status = status.append(pd.Series({'tax_yr':group, 
                                        'kld_crit':det._critical_dist, 
                                        'kld_curr':kld_curr,
                                        'drift':det.drift_state}), ignore_index=True)
    if det.drift_state is not None:
        #capture the visualization data
        plot_data[group] = det.to_plotly_dataframe()
        #update the reference window if drift is detected
        det.update(subdf.drop(columns=['tax_yr']).values)

In [24]:
#It's only in year 2018 where, for this test data, we don't detect drift
#immediately. It does get picked up in the following year.
status.merge(drift_years, how='left', on='tax_yr')

Unnamed: 0,tax_yr,kld_crit,kld_curr,drift,drift_true
0,2007.0,0.014708,0.514639,,
1,2008.0,0.014708,0.514639,,
2,2009.0,0.014708,0.514639,drift,drift
3,2010.0,0.015772,0.514639,drift,drift
4,2011.0,0.01484,0.514639,,
5,2012.0,0.01484,0.514639,drift,drift
6,2013.0,0.014766,0.514639,drift,drift
7,2014.0,0.01536,0.514639,,
8,2015.0,0.01536,0.514639,drift,drift
9,2016.0,0.015584,0.514639,drift,drift


In [26]:
#If we save off the dataframes at each drift detection, we can subsequently display the KSS:
#right now we're not gracefully including the column names, but in this case the map is simple:
#ax 0 - line_item_a
#ax 1 - line_item_b
#ax 2 - line_item_c
#ax 3 - line_item_d
#ax 4 - line_item_e
#ax 5 - line_item_f
#ax 6 - line_item_g
#ax 7 - line_item_h
#ax 8 - line_item_i
#ax 9 - line_item_j


#We can see that the regions of greatest drift do line up with at least one of
#the line items that were modified in a given year.
#For reference:
    # Drift 1: change the mean & var of line item B in 2009, means will revert for 2010 on
    # Drift 2: change the variance of line item c and d in 2012 by replacing some with the mean
    # keep same mean as other years, revert by 2013
    # Drift 3: change the correlation of line item e and f in 2015 (go from correlation of 0 to correlation of 0.5)
    # Drift 4: change mean and var of H and persist it from 2018 on
    # Drift 5: change mean and var just for a year of J in 2021
import plotly.express as px
for tax_yr, df_plot in plot_data.items():
    fig = px.treemap(data_frame=df_plot, names='name', ids='idx', parents='parent_idx', color='kss', 
                    color_continuous_scale='blues',
                    title=tax_yr)
    fig.update_traces(root_color='lightgrey')
    fig.show()