In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from molten.distribution.hdddm import HDDDM
import seaborn as sns


wls = pd.read_csv('/Users/ilindsay/The MITRE Corporation/iMOLTEN - General/Synthetic Data/fake_wls_eligibility.csv', index_col = 'tin')
drift = wls[wls.drift == True]
drift.tax_yr.value_counts()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/ilindsay/The MITRE Corporation/iMOLTEN - General/Synthetic Data/fake_wls_eligibility.csv'

Sensitivity analysis: batch size

1. Should reference window be an entire tax year?
2. Should we break first tax year into two reference windows, to allow us to detect drift on first "test" year?
3. Should we do above, and repeat it after every occurrence of drift?


what to look for:
- is epsilon 0 estimate reasonable? 
- based on where there is drift, is the last epsilon value compared with beta reasonable? 
- is there any way to "tweak" these two variables


In [60]:
#Question 1: reference window = entire tax year
results = pd.DataFrame(columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'], index= ['year'])

# Setup reference window
ref_2012 = wls[wls.tax_yr == 2012].iloc[:,1:11]
test_batches = {}
for year in wls.tax_yr.value_counts().index[1:]:
    data = wls[wls.tax_yr == year]
    test_batches[year] = data.iloc[:,1:11]

# Detect drift 
hd3m = HDDDM(ref_2012, subsets = 5)
ref = '2012'
for year in test_batches:
    hd3m.update(test_batches[year])
    new_data = pd.DataFrame([[year,ref, hd3m.samples_since_reset, hd3m.current_distance,str(hd3m.epsilon), hd3m.beta, hd3m.drift_state ]] , columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'])
    results = results.append(new_data)
    if hd3m.drift_state == 'drift':
        print("drift in ", year)
        ref = str(year)
    else:
        ref = ref + ' ' + str(year)

    
   
results_q1 = results 

drift in  2014
drift in  2017
drift in  2019


Beta threshold is small but there are large differences in distances

In [61]:
#Question 2: first reference tax year split into two halves
results = pd.DataFrame(columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'], index= ['year'])

# Setup reference window
ref_2012 = wls[wls.tax_yr == 2012].iloc[:,1:11]
ref_1 = ref_2012.iloc[0:int(len(ref_2012)/2),]
ref_2 = ref_2012.iloc[int(len(ref_2012)/2):,]
test_batches = {}
for year in wls.tax_yr.value_counts().index[1:]:
    data = wls[wls.tax_yr == year]
    test_batches[year] = data.iloc[:,1:11]

# Detect drift 
hd3m = HDDDM(ref_1, subsets = 5)
hd3m.update(ref_2)
for year in test_batches:
    hd3m.update(test_batches[year])
    new_data = pd.DataFrame([[year,ref, hd3m.samples_since_reset, hd3m.current_distance,str(hd3m.epsilon), hd3m.beta, hd3m.drift_state ]] , columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'])
    results = results.append(new_data)
    if hd3m.drift_state == 'drift':
        print("drift in ", year)
        ref = str(year)
    else:
        ref = ref + ' ' + str(year)

    
   
results_q2 = results

drift in  2014
drift in  2017
drift in  2019


In [66]:
#Question 3: every reference tax year split into two halves
results = pd.DataFrame(columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'], index= ['year'])

# Setup reference window
ref_2012 = wls[wls.tax_yr == 2012].iloc[:,1:11]
ref_1 = ref_2012.iloc[0:int(len(ref_2012)/2),]
ref_2 = ref_2012.iloc[int(len(ref_2012)/2):,]
test_batches = {}
for year in wls.tax_yr.value_counts().index[1:]:
    data = wls[wls.tax_yr == year]
    test_batches[year] = data.iloc[:,1:11]

# Detect drift 
hd3m = HDDDM(ref_1, subsets = 5)
hd3m.update(ref_2)
ref = '2012'
for year in test_batches:
    hd3m.update(test_batches[year])
    new_data = pd.DataFrame([[year,ref, hd3m.samples_since_reset, hd3m.current_distance,str(hd3m.epsilon), hd3m.beta, hd3m.drift_state ]] , columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'])
    results = results.append(new_data)
    if hd3m.drift_state == 'drift':
        print("drift in ", year)
        #reset hd3m 
        ref = wls[wls.tax_yr == year].iloc[:,1:11]
        ref_1 = ref.iloc[0:int(len(ref)/2),]
        ref_2 = ref.iloc[int(len(ref)/2):,]
        hd3m = HDDDM(ref_1, subsets = 5)
        hd3m.update(ref_2)
        ref = str(year)
    else:
        ref = ref + ' ' + str(year)

    

    
   
results_q3 = results

drift in  2014
drift in  2015
drift in  2016
drift in  2017
drift in  2018
drift in  2019


### Comparing Results

Q1: reference batch = entire 2012 tax year

Drift (almost) correctly identified. Delay in 2016 is because "difference in distance" is small, since mean fell back to normal in 2015, 2015 also had a high hellinger distance. 

Epsilon 0s are all reasonable, maybe a little on the small side = a smaller beta estimate = potential for more false alarms (but that doesn't happen here)  

In [58]:
results_q1

Unnamed: 0,year,reference,batch number,H distance,Epsilon,Beta threshold,Drift
year,,,,,,,
0,2013.0,2012,1.0,0.057856,[],0.0,
0,2014.0,2012 2013,2.0,0.218888,"[0.012080358293717832, 0.16103189006457924]",0.01208,drift
0,2015.0,2014,1.0,0.216154,[],0.01208,
0,2016.0,2014 2015,2.0,0.210207,"[0.009130730546085356, 0.005947386343807981]",0.009131,
0,2017.0,2014 2015 2016,3.0,0.152527,"[0.005947386343807981, 0.057680215061092666]",0.005888,drift
0,2018.0,2017,1.0,0.196526,[],0.005888,
0,2019.0,2017 2018,2.0,0.133081,"[0.015863509817461974, 0.06344513064684229]",0.015864,drift


Q2: Reference window = half of 2012. First test batch = half of 2012. Everything else normal

The same as above except first epsilon value is more reasonable. Think this is the better method

In [64]:
results_q2

Unnamed: 0,year,reference,batch number,H distance,Epsilon,Beta threshold,Drift
year,,,,,,,
0,2013.0,2019,2.0,0.057856,"[0.01639555760125963, 0.009218490557879197]",0.016396,
0,2014.0,2019 2013,3.0,0.218888,"[0.009218490557879197, 0.16103189006457924]",0.009126,drift
0,2015.0,2014,1.0,0.216154,[],0.009126,
0,2016.0,2014 2015,2.0,0.210207,"[0.009130730546085356, 0.005947386343807981]",0.009131,
0,2017.0,2014 2015 2016,3.0,0.152527,"[0.005947386343807981, 0.057680215061092666]",0.005888,drift
0,2018.0,2017,1.0,0.196526,[],0.005888,
0,2019.0,2017 2018,2.0,0.133081,"[0.015863509817461974, 0.06344513064684229]",0.015864,drift


changing number of subsets using above method. 

subsets = 8-10 results in epsilon 0 estimates being too high -> drift is detected too few times 

In [69]:
#10 subsets
results = pd.DataFrame(columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'], index= ['year'])

# Setup reference window
ref_2012 = wls[wls.tax_yr == 2012].iloc[:,1:11]
ref_1 = ref_2012.iloc[0:int(len(ref_2012)/2),]
ref_2 = ref_2012.iloc[int(len(ref_2012)/2):,]
test_batches = {}
for year in wls.tax_yr.value_counts().index[1:]:
    data = wls[wls.tax_yr == year]
    test_batches[year] = data.iloc[:,1:11]

# Detect drift 
hd3m = HDDDM(ref_1, subsets = 10)
hd3m.update(ref_2)
for year in test_batches:
    hd3m.update(test_batches[year])
    new_data = pd.DataFrame([[year,ref, hd3m.samples_since_reset, hd3m.current_distance,str(hd3m.epsilon), hd3m.beta, hd3m.drift_state ]] , columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'])
    results = results.append(new_data)
    if hd3m.drift_state == 'drift':
        print("drift in ", year)
        ref = str(year)
    else:
        ref = ref + ' ' + str(year)

    
   
results

drift in  2014
drift in  2017


Unnamed: 0,year,reference,batch number,H distance,Epsilon,Beta threshold,Drift
year,,,,,,,
0,2013.0,2017 2018 2019,2.0,0.057856,"[0.4439605471869877, 0.009218490557879197]",0.443961,
0,2014.0,2017 2018 2019 2013,3.0,0.218888,"[0.009218490557879197, 0.16103189006457924]",0.009126,drift
0,2015.0,2014,1.0,0.216154,[],0.009126,
0,2016.0,2014 2015,2.0,0.210207,"[0.253673827173865, 0.005947386343807981]",0.253674,
0,2017.0,2014 2015 2016,3.0,0.152527,"[0.005947386343807981, 0.057680215061092666]",0.005888,drift
0,2018.0,2017,1.0,0.196526,[],0.005888,
0,2019.0,2017 2018,2.0,0.133081,"[0.30555756456038197, 0.06344513064684229]",0.305558,


In [72]:
# 8 subsets
results = pd.DataFrame(columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'], index= ['year'])

# Detect drift 
hd3m = HDDDM(ref_1, subsets = 8)
hd3m.update(ref_2)
for year in test_batches:
    hd3m.update(test_batches[year])
    new_data = pd.DataFrame([[year,ref, hd3m.samples_since_reset, hd3m.current_distance,str(hd3m.epsilon), hd3m.beta, hd3m.drift_state ]] , columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'])
    results = results.append(new_data)
    if hd3m.drift_state == 'drift':
        print("drift in ", year)
        ref = str(year)
    else:
        ref = ref + ' ' + str(year)

    
   
results

drift in  2014
drift in  2017


Unnamed: 0,year,reference,batch number,H distance,Epsilon,Beta threshold,Drift
year,,,,,,,
0,2013.0,2017 2018 2019,2.0,0.057856,"[0.16739728058184056, 0.009218490557879197]",0.167397,
0,2014.0,2017 2018 2019 2013,3.0,0.218888,"[0.009218490557879197, 0.16103189006457924]",0.009126,drift
0,2015.0,2014,1.0,0.216154,[],0.009126,
0,2016.0,2014 2015,2.0,0.210207,"[0.11882749254137392, 0.005947386343807981]",0.118827,
0,2017.0,2014 2015 2016,3.0,0.152527,"[0.005947386343807981, 0.057680215061092666]",0.005888,drift
0,2018.0,2017,1.0,0.196526,[],0.005888,
0,2019.0,2017 2018,2.0,0.133081,"[0.10833257747002753, 0.06344513064684229]",0.108333,


In [73]:
# 7 subsets
results = pd.DataFrame(columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'], index= ['year'])

# Detect drift 
hd3m = HDDDM(ref_1, subsets = 7)
hd3m.update(ref_2)
for year in test_batches:
    hd3m.update(test_batches[year])
    new_data = pd.DataFrame([[year,ref, hd3m.samples_since_reset, hd3m.current_distance,str(hd3m.epsilon), hd3m.beta, hd3m.drift_state ]] , columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'])
    results = results.append(new_data)
    if hd3m.drift_state == 'drift':
        print("drift in ", year)
        ref = str(year)
    else:
        ref = ref + ' ' + str(year)

    
   
results

drift in  2014
drift in  2017


Unnamed: 0,year,reference,batch number,H distance,Epsilon,Beta threshold,Drift
year,,,,,,,
0,2013.0,2017 2018 2019,2.0,0.057856,"[0.09693025021049007, 0.009218490557879197]",0.09693,
0,2014.0,2017 2018 2019 2013,3.0,0.218888,"[0.009218490557879197, 0.16103189006457924]",0.009126,drift
0,2015.0,2014,1.0,0.216154,[],0.009126,
0,2016.0,2014 2015,2.0,0.210207,"[0.07463163611946855, 0.005947386343807981]",0.074632,
0,2017.0,2014 2015 2016,3.0,0.152527,"[0.005947386343807981, 0.057680215061092666]",0.005888,drift
0,2018.0,2017,1.0,0.196526,[],0.005888,
0,2019.0,2017 2018,2.0,0.133081,"[0.0674283534138869, 0.06344513064684229]",0.067428,


With 7 subsets, drift is missed in 1 year. IMO it is more accurate though

Epsilon 0 has an accurate estimate. 

If you look at Hellinger distnace for 2019, it isn't changing much. Suggesting HDDDM was not accurately detecting drift due to correlations before, but rather was picking up noise. It had too few subsets = false alarms. This decrease in accuracy is actually due to HDDDM not being well suited for drifts in correlations.

Q3: every reference batch (after init and after drift is split into two) 

Detects drift in every year.. 

Epsilon 0 estimate is too small, try increasing number of subsets? 

In [67]:
results_q3

Unnamed: 0,year,reference,batch number,H distance,Epsilon,Beta threshold,Drift
year,,,,,,,
0,2013.0,2012,2.0,0.057856,"[0.01639555760125963, 0.009218490557879197]",0.016396,
0,2014.0,2012 2013,3.0,0.218888,"[0.009218490557879197, 0.16103189006457924]",0.009126,drift
0,2015.0,2014,2.0,0.216154,"[0.028353100009345756, 0.14521800755185882]",0.028353,drift
0,2016.0,2015,2.0,0.122398,"[0.02822926530430605, 0.053373810969019234]",0.028229,drift
0,2017.0,2016,2.0,0.127047,"[0.02660605777625975, 0.07955439149882718]",0.026606,drift
0,2018.0,2017,2.0,0.196526,"[0.03223018645560342, 0.12325942290542971]",0.03223,drift
0,2019.0,2018,2.0,0.195002,"[0.02302316081811001, 0.12465329191793674]",0.023023,drift


10 subsets is too high epsilon 0 estimates

In [74]:
#Subsets = 10
subs = 10
results = pd.DataFrame(columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'], index= ['year'])

# Setup reference window
ref_2012 = wls[wls.tax_yr == 2012].iloc[:,1:11]
ref_1 = ref_2012.iloc[0:int(len(ref_2012)/2),]
ref_2 = ref_2012.iloc[int(len(ref_2012)/2):,]
test_batches = {}
for year in wls.tax_yr.value_counts().index[1:]:
    data = wls[wls.tax_yr == year]
    test_batches[year] = data.iloc[:,1:11]


# Detect drift 
hd3m = HDDDM(ref_1, subsets = subs)
hd3m.update(ref_2)
ref = '2012'
for year in test_batches:
    hd3m.update(test_batches[year])
    new_data = pd.DataFrame([[year,ref, hd3m.samples_since_reset, hd3m.current_distance,str(hd3m.epsilon), hd3m.beta, hd3m.drift_state ]] , columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'])
    results = results.append(new_data)
    if hd3m.drift_state == 'drift':
        print("drift in ", year)
        #reset hd3m 
        ref = wls[wls.tax_yr == year].iloc[:,1:11]
        ref_1 = ref.iloc[0:int(len(ref)/2),]
        ref_2 = ref.iloc[int(len(ref)/2):,]
        hd3m = HDDDM(ref_1, subsets = subs)
        hd3m.update(ref_2)
        ref = str(year)
    else:
        ref = ref + ' ' + str(year)

    

    
   
results

drift in  2014
drift in  2018


Unnamed: 0,year,reference,batch number,H distance,Epsilon,Beta threshold,Drift
year,,,,,,,
0,2013.0,2012,2.0,0.057856,"[0.4439605471869877, 0.009218490557879197]",0.443961,
0,2014.0,2012 2013,3.0,0.218888,"[0.009218490557879197, 0.16103189006457924]",0.009126,drift
0,2015.0,2014,2.0,0.216154,"[0.3474758274901894, 0.14521800755185882]",0.347476,
0,2016.0,2014 2015,3.0,0.210207,"[0.14521800755185882, 0.005947386343807981]",0.143766,
0,2017.0,2014 2015 2016,4.0,0.152527,"[0.14521800755185882, 0.005947386343807981, 0....",0.11881,
0,2018.0,2014 2015 2016 2017,5.0,0.271053,"[0.14521800755185882, 0.005947386343807981, 0....",0.103182,drift
0,2019.0,2018,2.0,0.195002,"[0.3991002288266432, 0.12465329191793674]",0.3991,


In [75]:
#Subsets = 10
subs = 7
results = pd.DataFrame(columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'], index= ['year'])

# Setup reference window
ref_2012 = wls[wls.tax_yr == 2012].iloc[:,1:11]
ref_1 = ref_2012.iloc[0:int(len(ref_2012)/2),]
ref_2 = ref_2012.iloc[int(len(ref_2012)/2):,]
test_batches = {}
for year in wls.tax_yr.value_counts().index[1:]:
    data = wls[wls.tax_yr == year]
    test_batches[year] = data.iloc[:,1:11]


# Detect drift 
hd3m = HDDDM(ref_1, subsets = subs)
hd3m.update(ref_2)
ref = '2012'
for year in test_batches:
    hd3m.update(test_batches[year])
    new_data = pd.DataFrame([[year,ref, hd3m.samples_since_reset, hd3m.current_distance,str(hd3m.epsilon), hd3m.beta, hd3m.drift_state ]] , columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'])
    results = results.append(new_data)
    if hd3m.drift_state == 'drift':
        print("drift in ", year)
        #reset hd3m 
        ref = wls[wls.tax_yr == year].iloc[:,1:11]
        ref_1 = ref.iloc[0:int(len(ref)/2),]
        ref_2 = ref.iloc[int(len(ref)/2):,]
        hd3m = HDDDM(ref_1, subsets = subs)
        hd3m.update(ref_2)
        ref = str(year)
    else:
        ref = ref + ' ' + str(year)

    

    
   
results

drift in  2014
drift in  2015
drift in  2018
drift in  2019


Unnamed: 0,year,reference,batch number,H distance,Epsilon,Beta threshold,Drift
year,,,,,,,
0,2013.0,2012,2.0,0.057856,"[0.09693025021049007, 0.009218490557879197]",0.09693,
0,2014.0,2012 2013,3.0,0.218888,"[0.009218490557879197, 0.16103189006457924]",0.009126,drift
0,2015.0,2014,2.0,0.216154,"[0.06979300059820849, 0.14521800755185882]",0.069793,drift
0,2016.0,2015,2.0,0.122398,"[0.0995543864398395, 0.053373810969019234]",0.099554,
0,2017.0,2015 2016,3.0,0.099372,"[0.053373810969019234, 0.023025901057056042]",0.05284,
0,2018.0,2015 2016 2017,4.0,0.224455,"[0.053373810969019234, 0.023025901057056042, 0...",0.043769,drift
0,2019.0,2018,2.0,0.195002,"[0.0999763669351358, 0.12465329191793674]",0.099976,drift


7 subsets with 3rd approach is most accurate. epsilon 0 estimates were reasonable. Drift was only identified when hellinger distances were high. 

Drift in 2014 -> shifting mean
Drift in 2015 -> mean shifts back to normal 

Drifts in 2016 with shifting stand-deviation didn't result in that much of a difference

Drifts in 2018 correlations was identified
Drifts in 2019 was a shift back from correlations to normal - was identified 

### Next steps.. try prolonging drift and increasing intensity of STDEV shift to see how the third approach does?

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from molten.distribution.hdddm import HDDDM
import seaborn as sns


wls = pd.read_csv('/Users/ilindsay/The MITRE Corporation/iMOLTEN - General/Synthetic Data/fake_wls_eligibility_v2.csv', index_col = 'tin')
drift = wls[wls.drift == True]
drift.tax_yr.value_counts()

2013    20000
2016    20000
2019    20000
Name: tax_yr, dtype: int64

1. First drift in TY2013: A and B change in both mean and variance. Revert to baseline for TY2014 onward
2. Second drift starts in TY2016 and is persistent until the end of the data: H changes mean and variance for those 5 batches
3. Third drift starts in TY2019 and reverts in TY2020: C and D change variance but not mean

In [3]:
#Subsets = 10
#5 subsets, too sensitive, 10 not sensitive enough 

subs = 7
results = pd.DataFrame(columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'], index= ['year'])

# Setup reference window
ref_2011 = wls[wls.tax_yr == 2011].iloc[:,1:11]
ref_1 = ref_2011.iloc[0:int(len(ref_2011)/2),]
ref_2 = ref_2011.iloc[int(len(ref_2011)/2):,]
test_batches = {}
for year in wls.tax_yr.value_counts().index[1:]:
    data = wls[wls.tax_yr == year]
    test_batches[year] = data.iloc[:,1:11]


# Detect drift 
hd3m = HDDDM(ref_1, subsets = subs)
hd3m.update(ref_2)
ref = '2011'
for year in test_batches:
    hd3m.update(test_batches[year])
    new_data = pd.DataFrame([[year,ref, hd3m.samples_since_reset, hd3m.current_distance,str(hd3m.epsilon), hd3m.beta, hd3m.drift_state ]] , columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'])
    results = results.append(new_data)
    if hd3m.drift_state == 'drift':
        print("drift in ", year)
        #reset hd3m 
        ref = wls[wls.tax_yr == year].iloc[:,1:11]
        ref_1 = ref.iloc[0:int(len(ref)/2),]
        ref_2 = ref.iloc[int(len(ref)/2):,]
        hd3m = HDDDM(ref_1, subsets = subs)
        hd3m.update(ref_2)
        ref = str(year)
    else:
        ref = ref + ' ' + str(year)

    

    
   
results

drift in  2013
drift in  2014
drift in  2016
drift in  2019


Unnamed: 0,year,reference,batch number,H distance,Epsilon,Beta threshold,Drift
year,,,,,,,
0,2012.0,2011,2.0,0.059034,"[0.11186704572084419, 0.013796532984821516]",0.111867,
0,2013.0,2011 2012,3.0,0.220859,"[0.013796532984821516, 0.16182474768667743]",0.013659,drift
0,2014.0,2013,2.0,0.221771,"[0.13852847024924175, 0.14963021128714282]",0.138528,drift
0,2015.0,2014,2.0,0.060002,"[0.07998705893632521, 0.011410432824218497]",0.079987,
0,2016.0,2014 2015,3.0,0.192703,"[0.011410432824218497, 0.132700480715028]",0.011296,drift
0,2017.0,2016,2.0,0.057279,"[0.09338329297566658, 0.01663906467724554]",0.093383,
0,2018.0,2016 2017,3.0,0.058211,"[0.01663906467724554, 0.0009325265462119675]",0.016473,
0,2019.0,2016 2017 2018,4.0,0.138951,"[0.01663906467724554, 0.0009325265462119675, 0...",0.013601,drift
0,2020.0,2019,2.0,0.1323,"[0.07848983331768442, 0.06004186740075007]",0.07849,


Best approach! 

Epsilon 0 is a little high in the first case but reasonable in the others. 

IN 2013, means/variance of A and B change. Drift detected
In 2014, means/variance of A and B revert back to normal. Drift detected

in 2016, mean/variance of H drifts. Detected

In 2019, the variance in C and D drift. Detected

In 2020, the variance in C and D shifts back to normal. The hellinger distance picks up on the shift, but epsilon 0 is 0.01 too high to detect that drift. May represent how change in variance is picked up on but harder to detect

### Is there a range of subset # that will work for us? 

In [13]:
#Subsets = 10
#5 subsets, too sensitive, 10 not sensitive enough 

subs = 6
results = pd.DataFrame(columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'], index= ['year'])

# Setup reference window
ref_2011 = wls[wls.tax_yr == 2011].iloc[:,1:11]
ref_1 = ref_2011.iloc[0:int(len(ref_2011)/2),]
ref_2 = ref_2011.iloc[int(len(ref_2011)/2):,]
test_batches = {}
for year in wls.tax_yr.value_counts().index[1:]:
    data = wls[wls.tax_yr == year]
    test_batches[year] = data.iloc[:,1:11]


# Detect drift 
hd3m = HDDDM(ref_1, subsets = subs)
hd3m.update(ref_2)
ref = '2011'
for year in test_batches:
    hd3m.update(test_batches[year])
    new_data = pd.DataFrame([[year,ref, hd3m.samples_since_reset, hd3m.current_distance,str(hd3m.epsilon), hd3m.beta, hd3m.drift_state ]] , columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'])
    results = results.append(new_data)
    if hd3m.drift_state == 'drift':
        print("drift in ", year)
        #reset hd3m 
        ref = wls[wls.tax_yr == year].iloc[:,1:11]
        ref_1 = ref.iloc[0:int(len(ref)/2),]
        ref_2 = ref.iloc[int(len(ref)/2):,]
        hd3m = HDDDM(ref_1, subsets = subs)
        hd3m.update(ref_2)
        ref = str(year)
    else:
        ref = ref + ' ' + str(year)

    

    
   
results

drift in  2013
drift in  2014
drift in  2016
drift in  2019
drift in  2020


Unnamed: 0,year,reference,batch number,H distance,Epsilon,Beta threshold,Drift
year,,,,,,,
0,2012.0,2011,2.0,0.059034,"[0.0615844456014212, 0.013796532984821516]",0.061584,
0,2013.0,2011 2012,3.0,0.220859,"[0.013796532984821516, 0.16182474768667743]",0.013659,drift
0,2014.0,2013,2.0,0.221771,"[0.05409391678727371, 0.14963021128714282]",0.054094,drift
0,2015.0,2014,2.0,0.060002,"[0.06121762265542433, 0.011410432824218497]",0.061218,
0,2016.0,2014 2015,3.0,0.192703,"[0.011410432824218497, 0.132700480715028]",0.011296,drift
0,2017.0,2016,2.0,0.057279,"[0.047973687586512125, 0.01663906467724554]",0.047974,
0,2018.0,2016 2017,3.0,0.058211,"[0.01663906467724554, 0.0009325265462119675]",0.016473,
0,2019.0,2016 2017 2018,4.0,0.138951,"[0.01663906467724554, 0.0009325265462119675, 0...",0.013601,drift
0,2020.0,2019,2.0,0.1323,"[0.05051424940369185, 0.06004186740075007]",0.050514,drift


### Final conclusion: 

- break every reference dataset into two datasets -> allows us to detect drift on 1st test batch
- epsilon 0 estimates are good enough and beta thresholds are realistic 
- use 6 subsets (5 works but 6 is more robust)
- best for detecting drifts in means (= larger hellinger distances) but still can catch drifts in standard deviation
- works with categorical variables


recommend general rule of thumb in documentation: if data is 20,000 10 subsets is good, if doing this split method where data is 10,000 then closer to 5 subsets is good. 

In [14]:
#Subsets = 10
#5 subsets, too sensitive, 10 not sensitive enough 

subs = 8
results = pd.DataFrame(columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'], index= ['year'])

# Setup reference window
ref_2011 = wls[wls.tax_yr == 2011].iloc[:,1:11]
ref_1 = ref_2011.iloc[0:int(len(ref_2011)/2),]
ref_2 = ref_2011.iloc[int(len(ref_2011)/2):,]
test_batches = {}
for year in wls.tax_yr.value_counts().index[1:]:
    data = wls[wls.tax_yr == year]
    test_batches[year] = data.iloc[:,1:11]


# Detect drift 
hd3m = HDDDM(ref_1, subsets = subs)
hd3m.update(ref_2)
ref = '2011'
for year in test_batches:
    hd3m.update(test_batches[year])
    new_data = pd.DataFrame([[year,ref, hd3m.samples_since_reset, hd3m.current_distance,str(hd3m.epsilon), hd3m.beta, hd3m.drift_state ]] , columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'])
    results = results.append(new_data)
    if hd3m.drift_state == 'drift':
        print("drift in ", year)
        #reset hd3m 
        ref = wls[wls.tax_yr == year].iloc[:,1:11]
        ref_1 = ref.iloc[0:int(len(ref)/2),]
        ref_2 = ref.iloc[int(len(ref)/2):,]
        hd3m = HDDDM(ref_1, subsets = subs)
        hd3m.update(ref_2)
        ref = str(year)
    else:
        ref = ref + ' ' + str(year)

    

    
   
results

drift in  2013


Unnamed: 0,year,reference,batch number,H distance,Epsilon,Beta threshold,Drift
year,,,,,,,
0,2012.0,2011,2.0,0.059034,"[0.15391547682548373, 0.013796532984821516]",0.153915,
0,2013.0,2011 2012,3.0,0.220859,"[0.013796532984821516, 0.16182474768667743]",0.013659,drift
0,2014.0,2013,2.0,0.221771,"[0.20207080922936257, 0.14963021128714282]",0.202071,
0,2015.0,2013 2014,3.0,0.144121,"[0.14963021128714282, 0.07764979949498851]",0.148134,
0,2016.0,2013 2014 2015,4.0,0.255738,"[0.14963021128714282, 0.07764979949498851, 0.1...",0.124038,
0,2017.0,2013 2014 2015 2016,5.0,0.205296,"[0.14963021128714282, 0.07764979949498851, 0.1...",0.119324,
0,2018.0,2013 2014 2015 2016 2017,6.0,0.187502,"[0.14963021128714282, 0.07764979949498851, 0.1...",0.110761,
0,2019.0,2013 2014 2015 2016 2017 2018,7.0,0.251835,"[0.14963021128714282, 0.07764979949498851, 0.1...",0.102898,
0,2020.0,2013 2014 2015 2016 2017 2018 2019,8.0,0.178532,"[0.14963021128714282, 0.07764979949498851, 0.1...",0.097491,


In [15]:
#Subsets = 10
#5 subsets, too sensitive, 10 not sensitive enough 

subs = 5
results = pd.DataFrame(columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'], index= ['year'])

# Setup reference window
ref_2011 = wls[wls.tax_yr == 2011].iloc[:,1:11]
ref_1 = ref_2011.iloc[0:int(len(ref_2011)/2),]
ref_2 = ref_2011.iloc[int(len(ref_2011)/2):,]
test_batches = {}
for year in wls.tax_yr.value_counts().index[1:]:
    data = wls[wls.tax_yr == year]
    test_batches[year] = data.iloc[:,1:11]


# Detect drift 
hd3m = HDDDM(ref_1, subsets = subs)
hd3m.update(ref_2)
ref = '2011'
for year in test_batches:
    hd3m.update(test_batches[year])
    new_data = pd.DataFrame([[year,ref, hd3m.samples_since_reset, hd3m.current_distance,str(hd3m.epsilon), hd3m.beta, hd3m.drift_state ]] , columns = ['year', 'reference','batch number','H distance', 'Epsilon','Beta threshold','Drift'])
    results = results.append(new_data)
    if hd3m.drift_state == 'drift':
        print("drift in ", year)
        #reset hd3m 
        ref = wls[wls.tax_yr == year].iloc[:,1:11]
        ref_1 = ref.iloc[0:int(len(ref)/2),]
        ref_2 = ref.iloc[int(len(ref)/2):,]
        hd3m = HDDDM(ref_1, subsets = subs)
        hd3m.update(ref_2)
        ref = str(year)
    else:
        ref = ref + ' ' + str(year)

    

    
   
results

drift in  2013
drift in  2014
drift in  2016
drift in  2019
drift in  2020


Unnamed: 0,year,reference,batch number,H distance,Epsilon,Beta threshold,Drift
year,,,,,,,
0,2012.0,2011,2.0,0.059034,"[0.03273574063274849, 0.013796532984821516]",0.032736,
0,2013.0,2011 2012,3.0,0.220859,"[0.013796532984821516, 0.16182474768667743]",0.013659,drift
0,2014.0,2013,2.0,0.221771,"[0.026332658455691594, 0.14963021128714282]",0.026333,drift
0,2015.0,2014,2.0,0.060002,"[0.030360673030460715, 0.011410432824218497]",0.030361,
0,2016.0,2014 2015,3.0,0.192703,"[0.011410432824218497, 0.132700480715028]",0.011296,drift
0,2017.0,2016,2.0,0.057279,"[0.022158607425822725, 0.01663906467724554]",0.022159,
0,2018.0,2016 2017,3.0,0.058211,"[0.01663906467724554, 0.0009325265462119675]",0.016473,
0,2019.0,2016 2017 2018,4.0,0.138951,"[0.01663906467724554, 0.0009325265462119675, 0...",0.013601,drift
0,2020.0,2019,2.0,0.1323,"[0.02505382939285577, 0.06004186740075007]",0.025054,drift
