In [75]:
import rpy2
import rpy2.robjects
from rpy2.robjects.packages import importr

In [76]:
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()

In [77]:
import numpy as np
import pandas as pd
import torch
from Models import LSTM
from sklearn.preprocessing import MinMaxScaler
import math
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import scipy.stats

import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [78]:
import sys
import os
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

# Importing local modules (statistical distance measures)
from CVM_Distance import CVM_Dist as Cramer_Von_Mises_Dist
from Anderson_Darling_Distance import Anderson_Darling_Dist
from Kolmogorov_Smirnov_Distance import Kolmogorov_Smirnov_Dist
from KuiperDistance import Kuiper_Dist
from WassersteinDistance import Wasserstein_Dist
from DTS_Distance import DTS_Dist # Combo of Anderson_Darling and CVM distance.

In [79]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(device)

cuda


In [80]:
filepath = "RELIANCE_Stock_2001-2022.csv"
data = pd.read_csv(filepath, usecols=[0,4], names=['date', 'close'], header=0)
data = data.sort_values('date')
data['date'] = pd.to_datetime(data['date'])
data.head()

Unnamed: 0,date,close
0,2001-01-01,341.75
1,2001-01-02,354.3
2,2001-01-03,360.05
3,2001-01-04,357.8
4,2001-01-05,364.3


In [81]:
data.tail()

Unnamed: 0,date,close
5243,2022-01-25,2373.25
5244,2022-01-27,2338.1
5245,2022-01-28,2335.85
5246,2022-01-31,2386.6
5247,2022-02-01,2378.7


In [82]:
data['close'].isnull().values.any()

False

In [85]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=data['date'], y=data['close'], mode='lines', name='closing price'))

# fig.update_xaxes(range=["2009-11-01", "2021-11-01"])
# fig.update_yaxes(range=[500, 2500])

In [86]:
TsAD = importr('otsad')
TSdist = importr('TSdist')

In [87]:
train_len = 0.8*len(data)

In [93]:
train_len, len(data)

(4198.400000000001, 5248)

In [88]:
res = TsAD.OcpTsSdEwma(data['close'].values, train_len, 0.01, 3, 50 )

In [89]:
df = pd.DataFrame(res)

In [90]:
df.head()

Unnamed: 0,is.anomaly,lcl,ucl
0,0.0,341.75,341.75
1,0.0,354.3,354.3
2,0.0,360.05,360.05
3,0.0,357.8,357.8
4,0.0,364.3,364.3


In [92]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=data.index, y=data['close'], mode='lines', name='close price'))
fig.add_trace(go.Scatter(x=df.index, y=df['lcl'], mode='lines', name='lower control limit'))
fig.add_trace(go.Scatter(x=df.index, y=df['ucl'], mode='lines', name='upper control limit'))
# fig.add_trace(go.Scatter(x=df.index[2316:2416], y=preds, mode='lines', name='predictions'))


# fig.update_xaxes(range=["2009-11-01", "2021-11-01"])
# fig.update_yaxes(range=[500, 2500])

In [94]:
anomaly_indices = df[df['is.anomaly'] == 1.0].index

In [95]:
rows = data.iloc[anomaly_indices, :]

In [96]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=data['date'], y=data['close'], mode='lines', name='lines'))
fig.add_trace(go.Scatter(x=rows['date'], y=rows['close'], mode='markers', name='markers'))

# fig.update_xaxes(range=["2009-11-01", "2021-11-01"])
# fig.update_yaxes(range=[500, 2500])

In [97]:
def splitData(ts):

    test_set_size = int(np.round(0.2*len(ts)))
    train_set = ts[:-test_set_size]    
    test_set = ts[-test_set_size:]

    return train_set, test_set

In [98]:
#Normalize data
scaler = MinMaxScaler(feature_range=(-1, 1))

In [99]:
# Normalize the training, validation and test set

def normalize_data(train_set, test_set):
    train_norm = scaler.fit_transform(train_set.reshape(-1, 1))
    test_norm = scaler.transform(test_set.reshape(-1, 1))
    
    return train_norm, test_norm

In [100]:
window_size = 20
def prepareDataForTraining(seq):

    x_data = []
    y_data = []
    L = len(seq)
    for i in range(L-window_size):        
        window = seq[i:i+window_size]
        label = seq[i+window_size:i+window_size+1]
        x_data.append(window)
        y_data.append(label)
    return x_data, y_data 

In [101]:
train_set, test_set = splitData(data['close'].values)
train_norm, test_norm = normalize_data(train_set, test_set)

x_train, y_train = prepareDataForTraining(train_norm)
x_test, y_test = prepareDataForTraining(test_norm)

x_train = np.asarray(x_train).reshape(-1, window_size, 1)
y_train = np.asarray(y_train).reshape(-1, 1)
x_test = np.asarray(x_test).reshape(-1, window_size, 1)
y_test = np.asarray(y_test).reshape(-1, 1)

print('x_train.shape = ',x_train.shape)
print('y_train.shape = ',y_train.shape)
print('x_test.shape = ',x_test.shape)
print('y_test.shape = ',y_test.shape)

x_train = torch.from_numpy(x_train).type(torch.Tensor)
x_test = torch.from_numpy(x_test).type(torch.Tensor)
y_train_lstm = torch.from_numpy(y_train).type(torch.Tensor)
y_test_lstm = torch.from_numpy(y_test).type(torch.Tensor)

x_train.shape =  (4178, 20, 1)
y_train.shape =  (4178, 1)
x_test.shape =  (1030, 20, 1)
y_test.shape =  (1030, 1)


In [102]:
input_dim = 1
hidden_dim = 32
num_layers = 2
output_dim = 1
num_epochs = 100

model = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers)
criterion = torch.nn.MSELoss(reduction='mean')
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)

hist = np.zeros(num_epochs)
lstm = []

for t in range(num_epochs):
    y_train_pred = model(x_train)

    loss = criterion(y_train_pred, y_train_lstm)
    hist[t] = loss.item()

    optimiser.zero_grad()
    loss.backward()
    optimiser.step() 

In [103]:
torch.save(model, 'lstm_rel_full.pt')

In [105]:
actual_test_len = len(test_set) - window_size

In [106]:
# make predictions
y_test_pred = model(x_test)

# invert predictions
y_test_pred = scaler.inverse_transform(y_test_pred.detach().numpy())
y_test = scaler.inverse_transform(y_test_lstm.detach().numpy())

# calculate root mean squared error
test_error = mean_absolute_percentage_error(y_test[:,0], y_test_pred[:,0])
accuracy_reg = r2_score(y_test[:, 0], y_test_pred[:, 0])
rmse = math.sqrt(mean_squared_error(y_test[:,0], y_test_pred[:,0]))
print('Test Score: %.2f MAPE' % (test_error))
print(f'R-squared value is {accuracy_reg}')
print('RMSE: ', rmse)

preds = y_test_pred[:, 0]
gt = y_test[:, 0]

Test Score: 0.02 MAPE
R-squared value is 0.9907317486338824
RMSE:  48.978239248893736


In [107]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=data.index[-actual_test_len:], y=y_test[:,0], mode='lines', name='actual close price'))
fig.add_trace(go.Scatter(x=df.index[-actual_test_len:], y=df['lcl'][-actual_test_len:], mode='lines', name='lower control limit'))
fig.add_trace(go.Scatter(x=df.index[-actual_test_len:], y=df['ucl'][-actual_test_len:], mode='lines', name='upper control limit'))
fig.add_trace(go.Scatter(x=df.index[-actual_test_len:], y= y_test_pred[:,0], mode='lines', name='predicted close price'))


#### Find distance between forecast distributions and actual test values using different distance measures from R

### Dynamic Time Warping

In [118]:
dtw_dist = []
test_set_range = []

for i in range(10):

  X1, X2 = gt[i*100 : (i+1)*100], preds[i*100 : (i+1)*100]

  ## convert to np.array
  X1, X2 = np.asarray(X1), np.asarray(X2)

  dtw = TSdist.DTWDistance(X1, X2)
  dtw_dist.append(dtw.item())
  test_set_range.append(f'{i*100} : {(i+1)*100}')

finalResult = { "range" : test_set_range,    
    "DTW distance" : dtw_dist

    }

df1 = pd.DataFrame(finalResult)

In [119]:
pdc_dist = []
test_set_range = []

for i in range(10):

  X1, X2 = gt[i*100 : (i+1)*100], preds[i*100 : (i+1)*100]

  ## convert to np.array
  X1, X2 = np.asarray(X1), np.asarray(X2)

  pdc = TSdist.PDCDistance(X1, X2)
  pdc_dist.append(pdc.item())
  test_set_range.append(f'{i*100} : {(i+1)*100}')

finalResult = { "range" : test_set_range,    
    "PDC distance" : pdc_dist

    }

df2 = pd.DataFrame(finalResult)

In [120]:
cort_dist = []
test_set_range = []

for i in range(10):

  X1, X2 = gt[i*100 : (i+1)*100], preds[i*100 : (i+1)*100]

  # print(X2.shape)

  ## convert to np.array
  X1 = rpy2.robjects.FloatVector(X1)
  X2 = rpy2.robjects.FloatVector(X2)

  # print(type(X1))

  cort = TSdist.CortDistance(X1, X2, deltamethod="DTW")
  cort_dist.append(cort.item())
  test_set_range.append(f'{i*100} : {(i+1)*100}')

finalResult = { "range" : test_set_range,    
    "Temporal Correlation distance" : cort_dist

    }

df3 = pd.DataFrame(finalResult)

In [121]:
df3

Unnamed: 0,range,Temporal Correlation distance
0,0 : 100,995.478483
1,100 : 200,941.990309
2,200 : 300,1650.581046
3,300 : 400,1229.947207
4,400 : 500,1399.956065
5,500 : 600,2474.946068
6,600 : 700,2892.451997
7,700 : 800,3176.225706
8,800 : 900,2207.940722
9,900 : 1000,3016.374596


In [122]:
fourier_dist = []
test_set_range = []

for i in range(10):

  X1, X2 = gt[i*100 : (i+1)*100], preds[i*100 : (i+1)*100]

  ## convert to np.array
  X1 = rpy2.robjects.FloatVector(X1)
  X2 = rpy2.robjects.FloatVector(X2)

  fourier = TSdist.FourierDistance(X1, X2)
  fourier_dist.append(fourier.item())
  test_set_range.append(f'{i*100} : {(i+1)*100}')

finalResult = { "range" : test_set_range,    
    "Fourier distance" : fourier_dist

    }

df4 = pd.DataFrame(finalResult)

In [123]:
df4

Unnamed: 0,range,Fourier distance
0,0 : 100,1265.573011
1,100 : 200,1980.383992
2,200 : 300,2684.347662
3,300 : 400,2112.306362
4,400 : 500,2606.763478
5,500 : 600,4590.424514
6,600 : 700,5282.490133
7,700 : 800,4151.87872
8,800 : 900,3861.416433
9,900 : 1000,4665.760022


In [124]:
result = pd.concat([df1, df2['PDC distance'], df3['Temporal Correlation distance'],
 df4['Fourier distance']], axis=1)
result

Unnamed: 0,range,DTW distance,PDC distance,Temporal Correlation distance,Fourier distance
0,0 : 100,868.336365,0.86718,995.478483,1265.573011
1,100 : 200,1052.931396,0.731083,941.990309,1980.383992
2,200 : 300,1587.450439,0.663065,1650.581046,2684.347662
3,300 : 400,1237.665649,0.56879,1229.947207,2112.306362
4,400 : 500,1450.137085,1.058688,1399.956065,2606.763478
5,500 : 600,2706.780823,0.909419,2474.946068,4590.424514
6,600 : 700,2928.523315,0.818757,2892.451997,5282.490133
7,700 : 800,2835.504272,0.429578,3176.225706,4151.87872
8,800 : 900,2255.562988,0.923185,2207.940722,3861.416433
9,900 : 1000,2999.189819,1.01728,3016.374596,4665.760022


In [125]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=data.index[-len(x_test):], y=data['close'][-len(x_test):], mode='lines', name='closing price'))


In [127]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index[:100], y=y_test[:100,0], mode='lines', name='actual close price'))
fig.add_trace(go.Scatter(x=df.index[:100], y= y_test_pred[:100,0], mode='lines', name='predicted close price'))


In [128]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index[100:200], y=y_test[100:200,0], mode='lines', name='actual close price'))
fig.add_trace(go.Scatter(x=df.index[100:200], y= y_test_pred[100:200,0], mode='lines', name='predicted close price'))


In [129]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index[200:300], y=y_test[200:300,0], mode='lines', name='actual close price'))
fig.add_trace(go.Scatter(x=df.index[200:300], y= y_test_pred[200:300,0], mode='lines', name='predicted close price'))


In [130]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index[600:700], y=y_test[600:700,0], mode='lines', name='actual close price'))
fig.add_trace(go.Scatter(x=df.index[600:700], y= y_test_pred[600:700,0], mode='lines', name='predicted close price'))

In [131]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index[900:1000], y=y_test[900:1000,0], mode='lines', name='actual close price'))
fig.add_trace(go.Scatter(x=df.index[900:1000], y= y_test_pred[900:1000,0], mode='lines', name='predicted close price'))

#### Measure how accuracy of model changes with shift in test set

In [132]:
r2_list = []
rmse_list = []
mape_list = []
test_set_range = []

for i in range(10):
    
    X1, X2 = gt[i*100 : (i+1)*100], preds[i*100 : (i+1)*100]

    rmse = math.sqrt(mean_squared_error(X1, X2))
    mape = mean_absolute_percentage_error(X1, X2)
    r2 = r2_score(X1, X2)
    rmse_list.append(rmse)
    mape_list.append(mape)
    r2_list.append(r2)
    
    test_set_range.append(f'{i*100} : {(i+1)*100}')
    

accuracy = { "range" : test_set_range,             
        "RMSE" : rmse_list,
        "MAPE" : mape_list,
        "R2-score": r2_list
}

acc_df = pd.DataFrame(accuracy)

In [133]:
acc_df

Unnamed: 0,range,RMSE,MAPE,R2-score
0,0 : 100,17.784209,0.014255,0.396737
1,100 : 200,27.608973,0.020303,0.95156
2,200 : 300,37.802614,0.024332,0.687281
3,300 : 400,29.867109,0.016228,0.616618
4,400 : 500,35.73154,0.020465,0.93367
5,500 : 600,64.844582,0.03887,0.879562
6,600 : 700,66.512312,0.025312,0.942499
7,700 : 800,58.119285,0.022645,0.629418
8,800 : 900,52.809664,0.018242,0.726191
9,900 : 1000,64.319135,0.022228,0.877488


In [134]:
r2_list = []
rmse_list = []
mape_list = []
test_set_range = []

for i in range(14):
    
    X1, X2 = gt[i*70 : (i+1)*70], preds[i*70 : (i+1)*70]

    rmse = math.sqrt(mean_squared_error(X1, X2))
    mape = mean_absolute_percentage_error(X1, X2)
    r2 = r2_score(X1, X2)
    rmse_list.append(rmse)
    mape_list.append(mape)
    r2_list.append(r2)
    
    test_set_range.append(f'{i*70} : {(i+1)*70}')
    

accuracy = { "range" : test_set_range,             
        "RMSE" : rmse_list,
        "MAPE" : mape_list,
        "R2-score": r2_list
}

acc_df2 = pd.DataFrame(accuracy)

In [135]:
acc_df2

Unnamed: 0,range,RMSE,MAPE,R2-score
0,0 : 70,18.724458,0.0147,0.028843
1,70 : 140,19.434605,0.016682,0.737512
2,140 : 210,39.555457,0.025011,0.830981
3,210 : 280,31.09533,0.020992,0.316897
4,280 : 350,29.902166,0.018112,0.765649
5,350 : 420,39.679748,0.022144,0.471832
6,420 : 490,30.489852,0.017958,0.93552
7,490 : 560,48.429664,0.023759,0.799334
8,560 : 630,70.083258,0.04431,0.908422
9,630 : 700,71.358429,0.024566,0.797592


In [136]:
r2_list = []
rmse_list = []
mape_list = []
test_set_range = []

for i in range(20):
    
    X1, X2 = gt[i*50 : (i+1)*50], preds[i*50 : (i+1)*50]

    rmse = math.sqrt(mean_squared_error(X1, X2))
    mape = mean_absolute_percentage_error(X1, X2)
    r2 = r2_score(X1, X2)
    rmse_list.append(rmse)
    mape_list.append(mape)
    r2_list.append(r2)
    
    test_set_range.append(f'{i*50} : {(i+1)*50}')
    

accuracy = { "range" : test_set_range,             
        "RMSE" : rmse_list,
        "MAPE" : mape_list,
        "R2-score": r2_list
}

acc_df3 = pd.DataFrame(accuracy)

In [137]:
acc_df3

Unnamed: 0,range,RMSE,MAPE,R2-score
0,0 : 50,19.027527,0.014639,0.099936
1,50 : 100,16.447171,0.01387,0.57991
2,100 : 150,23.761685,0.020226,0.500135
3,150 : 200,30.982139,0.02038,0.755089
4,200 : 250,43.163168,0.028305,0.350066
5,250 : 300,31.543877,0.020359,0.793095
6,300 : 350,28.292427,0.015916,0.740366
7,350 : 400,31.362831,0.016541,-0.022662
8,400 : 450,41.855512,0.02574,0.31722
9,450 : 500,28.312575,0.015189,0.879429


#### ECDF distance measures between actual values and forecasts

In [138]:
wasserstein_distance = []
test_set_range = []

for i in range(10):
    
    X1, X2 = gt[i*100 : (i+1)*100], preds[i*100 : (i+1)*100]

    dist = Wasserstein_Dist(X1, X2)

    wasserstein_distance.append(dist)
    test_set_range.append(f'{i*100} : {(i+1)*100}')



finalResult = { "range" : test_set_range,             
        "Wasserstein distance" : wasserstein_distance

}

df7 = pd.DataFrame(finalResult)

In [141]:
KS_distance = []
test_set_range = []

for i in range(10):
    
    X1, X2 = gt[i*100 : (i+1)*100], preds[i*100 : (i+1)*100]

    dist = Kolmogorov_Smirnov_Dist(X1, X2)

    KS_distance.append(dist)
    test_set_range.append(f'{i*100} : {(i+1)*100}')



finalResult = { "range" : test_set_range,             
        "Kolmogorov Smirnov distance" : KS_distance

}

df8 = pd.DataFrame(finalResult)

In [142]:
DTS_distance = []
test_set_range = []

for i in range(10):
    
    X1, X2 = gt[i*100 : (i+1)*100], preds[i*100 : (i+1)*100]

    dist = DTS_Dist(X1, X2)

    DTS_distance.append(dist)
    test_set_range.append(f'{i*100} : {(i+1)*100}')



finalResult = { "range" : test_set_range,             
        "DTS distance" : DTS_distance

}

df9 = pd.DataFrame(finalResult)

In [143]:
result = pd.concat([df7, df8['Kolmogorov Smirnov distance'], df9['DTS distance']], axis=1)
result

Unnamed: 0,range,Wasserstein distance,Kolmogorov Smirnov distance,DTS distance
0,0 : 100,4.81579,0.18,1.230875
1,100 : 200,9.807397,0.07,1.815544
2,200 : 300,6.790255,0.17,1.662961
3,300 : 400,3.782405,0.07,1.00425
4,400 : 500,14.424774,0.1,3.146175
5,500 : 600,8.185049,0.05,2.654528
6,600 : 700,34.562483,0.18,6.492102
7,700 : 800,18.340879,0.18,4.265903
8,800 : 900,13.981193,0.1,2.873651
9,900 : 1000,18.309072,0.08,3.784563


#### ECDF distance measures between train set and test set

In [144]:
def get_statistical_dist_measures(X_train, X_test):   

    CVM_distance = Cramer_Von_Mises_Dist(X_train, X_test)
    Anderson_Darling_distance = Anderson_Darling_Dist(X_train, X_test)
    Kolmogorov_Smirnov_distance = Kolmogorov_Smirnov_Dist(X_train, X_test)
    Kuiper_distance = Kuiper_Dist(X_train, X_test)
    Wasserstein_distance = Wasserstein_Dist(X_train, X_test)
    DTS_distance = DTS_Dist(X_train, X_test)   
    
    # Returning dictionary, for efficient and fast DataFrame creation. Returns mean for each distance.
    # See https://stackoverflow.com/a/17496530. Fast way to 'append' to dataframe for results table.
    # PRESERVE THE ORDERING
    return {'Anderson_Darling_dist': Anderson_Darling_distance,
            'CVM_dist': CVM_distance,
            'DTS_dist':DTS_distance,
            'Kolmogorov_Smirnov_dist':Kolmogorov_Smirnov_distance,
            'Kuiper_dist': Kuiper_distance,
            'Wasserstein_dist': Wasserstein_distance}

In [145]:
test_set_range = []
finalResult = []

for i in range(10):
    
    X1, X2 = train_set, gt[i*100 : (i+1)*100]

    distances = get_statistical_dist_measures(X1, X2)

    finalResult.append(distances)
    test_set_range.append(f'{i*100} : {(i+1)*100}')



In [146]:
test_set_range = []
finalResult2 = []

for i in range(14):
    
    X1, X2 = train_set, gt[i*70 : (i+1)*70]

    distances = get_statistical_dist_measures(X1, X2)

    finalResult2.append(distances)
    test_set_range.append(f'{i*70} : {(i+1)*70}')


In [148]:
mape = 100 - acc_df['MAPE'] * 100

In [149]:
df = pd.DataFrame(finalResult)
frames = [acc_df[['RMSE',  'R2-score']], mape, df]
df = pd.concat(frames, axis=1)
df

Unnamed: 0,RMSE,R2-score,MAPE,Anderson_Darling_dist,CVM_dist,DTS_dist,Kolmogorov_Smirnov_dist,Kuiper_dist,Wasserstein_dist
0,17.784209,0.396737,98.574539,31.815929,898.00848,15.681987,0.492144,0.878528,371.7498
1,27.608973,0.95156,97.969688,31.386775,891.627232,15.253051,0.526679,0.69819,363.042065
2,37.802614,0.687281,97.566833,41.465206,1200.032473,17.274949,0.699624,0.873278,425.088033
3,29.867109,0.616618,98.377151,46.42369,1335.756479,21.682741,0.798237,0.94283,546.79226
4,35.73154,0.93367,97.953545,46.585485,1338.170119,21.422672,0.775131,0.903525,539.343603
5,64.844582,0.879562,96.113052,45.96898,1310.935788,22.273972,0.652968,0.778028,555.98827
6,66.512312,0.942499,97.468834,58.91058,1614.72293,40.898851,0.855407,0.892806,978.408773
7,58.119285,0.629418,97.735489,60.171425,1648.284697,44.361693,0.899,0.94021,1069.971412
8,52.809664,0.726191,98.175774,62.221368,1684.484197,47.536905,0.904478,0.950453,1128.341589
9,64.319135,0.877488,97.777161,71.49454,1827.821682,67.05748,0.924488,0.935684,1435.62357


In [150]:
df1 = pd.DataFrame(finalResult2)
df1.insert(0, 'Test set range', test_set_range)
df1

Unnamed: 0,Test set range,Anderson_Darling_dist,CVM_dist,DTS_dist,Kolmogorov_Smirnov_dist,Kuiper_dist,Wasserstein_dist
0,0 : 70,32.116957,906.032369,15.802629,0.502144,0.897577,375.153725
1,70 : 140,29.424392,820.024903,15.299387,0.490472,0.802055,359.090474
2,140 : 210,39.931419,1146.27588,17.823889,0.593214,0.764725,437.364378
3,210 : 280,41.195229,1187.144212,17.517667,0.695338,0.89448,429.802356
4,280 : 350,45.884766,1315.392146,21.500385,0.794188,0.93878,538.954513
5,350 : 420,45.245308,1298.602117,20.835832,0.775131,0.934493,521.218342
6,420 : 490,47.098357,1345.159266,21.995113,0.792758,0.921153,552.173592
7,490 : 560,50.588358,1430.423841,26.241335,0.769203,0.894263,655.312015
8,560 : 630,44.275769,1259.122521,21.231218,0.62297,0.730879,527.597535
9,630 : 700,62.658747,1678.100898,48.843278,0.883992,0.921391,1132.916145


In [151]:
mape2 = 100 - acc_df2['MAPE'] * 100

In [154]:
frames = [acc_df2[['RMSE']], df1]
df_new = pd.concat(frames, axis=1)
df_new


Unnamed: 0,RMSE,Test set range,Anderson_Darling_dist,CVM_dist,DTS_dist,Kolmogorov_Smirnov_dist,Kuiper_dist,Wasserstein_dist
0,18.724458,0 : 70,32.116957,906.032369,15.802629,0.502144,0.897577,375.153725
1,19.434605,70 : 140,29.424392,820.024903,15.299387,0.490472,0.802055,359.090474
2,39.555457,140 : 210,39.931419,1146.27588,17.823889,0.593214,0.764725,437.364378
3,31.09533,210 : 280,41.195229,1187.144212,17.517667,0.695338,0.89448,429.802356
4,29.902166,280 : 350,45.884766,1315.392146,21.500385,0.794188,0.93878,538.954513
5,39.679748,350 : 420,45.245308,1298.602117,20.835832,0.775131,0.934493,521.218342
6,30.489852,420 : 490,47.098357,1345.159266,21.995113,0.792758,0.921153,552.173592
7,48.429664,490 : 560,50.588358,1430.423841,26.241335,0.769203,0.894263,655.312015
8,70.083258,560 : 630,44.275769,1259.122521,21.231218,0.62297,0.730879,527.597535
9,71.358429,630 : 700,62.658747,1678.100898,48.843278,0.883992,0.921391,1132.916145


In [155]:
test_set_range = []
finalResult3 = []

for i in range(20):
    
    X1, X2 = train_set, gt[i*50 : (i+1)*50]

    distances = get_statistical_dist_measures(X1, X2)

    finalResult3.append(distances)
    test_set_range.append(f'{i*50} : {(i+1)*50}')



In [156]:
df3 = pd.DataFrame(finalResult3)
df3.insert(0, 'Test set range', test_set_range)
df3

Unnamed: 0,Test set range,Anderson_Darling_dist,CVM_dist,DTS_dist,Kolmogorov_Smirnov_dist,Kuiper_dist,Wasserstein_dist
0,0 : 50,31.907206,898.068871,15.79104,0.505717,0.897332,374.432844
1,50 : 100,31.249263,876.721858,15.632934,0.490472,0.876856,369.302683
2,100 : 150,30.106693,842.230043,15.581492,0.526679,0.823249,368.104804
3,150 : 200,43.137467,1238.591691,18.99653,0.750357,0.921868,470.053644
4,200 : 250,40.502004,1164.735607,17.219594,0.692711,0.886851,421.037341
5,250 : 300,42.354828,1217.491882,18.11381,0.762982,0.936636,446.191438
6,300 : 350,46.923564,1337.469119,22.431257,0.798237,0.94283,560.951689
7,350 : 400,46.011403,1314.607813,21.729717,0.799905,0.959266,543.308635
8,400 : 450,44.158774,1266.223135,20.023477,0.775131,0.942354,497.904184
9,450 : 500,50.924765,1434.581734,26.445883,0.826346,0.95474,658.370136


In [157]:
mape3 = 100 - acc_df3['MAPE']*100

In [159]:
fig = px.scatter(x=df_new['Wasserstein_dist'], y=mape2, labels=dict(x="Dissimilarity (Wasserstein distance)", y="Accuracy (1-MAPE)"))
fig.update_traces(marker = dict(size=10, color='red'))
fig.update_yaxes(range=[90, 100])
fig.show()

In [161]:
fig = px.scatter(x=df_new['Wasserstein_dist'], y=df_new['RMSE'], labels=dict(x="Dissimilarity (Wasserstein distance)", y="Accuracy (RMSE)"))
fig.update_traces(marker = dict(size=10, color='red'))
fig.update_yaxes(range=[0, 100])
fig.show()

#### Curve fitting using Scipy's curve_fit() function

In [69]:
from scipy.optimize import curve_fit
import math