In [None]:
import numpy as np

# options
#########################
lengths = np.arange(start=1670, stop=13361, step=1670)

global_path = "error/"

output_mse = global_path + "mse/MSE_"
output_rmse = global_path + "rmse/RMSE_"
output_mae = global_path + "mae/MAE_"
output_cor = global_path + "correlation.dat"
input_missingmat = "recovery/values/recovered_matrices/recoveredMat"

list_algos = ["stmvl", "cdrec_k3", "cdrec_k2", "tkcm", "spirit", "tenmf", "grouse", "svt", "softimp", "rosl", "dynammo", "svdimp"]

algos_str = "\t".join(list_algos)

TITLEBAR = ("=====================================================\n"
            " #  \t||  ref\t\t" + algos_str + "\n"
            "=====================================================\n")

SEPARATE = "=====================================================\n"


def msqe():
    for length in lengths:
        dftest = np.loadtxt(input_missingmat + str(length) + ".txt", dtype=float)
        print(dftest.shape)

        for i in range(1, dftest.shape[1]):
            fileName = output_mse + list_algos[i - 1] + ".dat"
            with open(fileName, "w") as file:
                file.write("#" + list_algos[i - 1] + "\n")

        for length in lengths:
            df = np.loadtxt(input_missingmat + str(length) + ".txt", dtype=float)
            df = df[~np.isnan(df).any(axis=1)]
            print(input_missingmat + str(length) + ".txt")
            ref = df[:, 0]        
            #print(ref)

            for i in range(1, df.shape[1]):
                fileName = output_mse + list_algos[i - 1] + ".dat"
                comp = df[:, i]
                #print(comp)
                comp -= ref
                msqe_val = np.mean(comp ** 2)
                # if msqe_val > 1E10:
                #     msqe_val = 30.0
                # elif msqe_val > 25.0:
                #     msqe_val = 25.0
                lin = str(length) + " " + str(msqe_val) + "\n"
                with open(fileName, "a") as file:
                    file.write(lin)


msqe()


In [1]:
import numpy as np
import os
import re
from collections import defaultdict

In [2]:
workdir = !pwd
workdir = workdir[0]
workdir

'/mnt/c/Arbeid/Github_Repo/NLDL_report/bench-vldb20/TestingFramework/bin/Debug/Results'

In [3]:
scenarios = ['blackout', 'mcar', 'miss_disj', 'miss_over', 'miss_perc', 'ts_length', 'ts_nbr']
dataset = 'oceantidepressure'

In [7]:
def config_error_calc(steps_arrange, list_algs, sce_dir):
    lengths = np.arange(start=steps_arrange[0], stop=steps_arrange[1], step=steps_arrange[2])
    global_path = sce_dir + '/error/'

    output_mse = global_path + "mse/MSE_"
    output_mae = global_path + "mae/MAE_"
    output_cor = global_path + "correlation.dat"
    input_missingmat = sce_dir + '/' + "recovery/values/recovered_matrices/recoveredMat"

    list_algos = list_algs
    
    return lengths, output_mse, output_rmse, output_mae, output_cor, input_missingmat, list_algos


def mse(lengths, output_mse, output_rmse, output_mae, output_cor, input_missingmat, list_algos):
    err = defaultdict(list)
    for length in lengths:
        dftest = np.loadtxt(input_missingmat + str(length) + ".txt", dtype=float)

        for length in lengths:
            df = np.loadtxt(input_missingmat + str(length) + ".txt", dtype=float)
            df = df[~np.isnan(df).any(axis=1)]
            ref = df[:, 0]        

            for i in range(1, df.shape[1]):
                #fileName = output_mse + list_algos[i - 1] + ".dat"
                comp = df[:, i]
                comp -= ref
                msqe_val = np.mean(comp ** 2)
                err[list_algos[i-1]].append(msqe_val)
    
    for alg in err.keys():
        errors = err[alg]
        print('MSE:', alg, ':', sum(errors) / len(errors))
        
def mae(lengths, output_mse, output_rmse, output_mae, output_cor, input_missingmat, list_algos):
    err = defaultdict(list)
    for length in lengths:
        dftest = np.loadtxt(input_missingmat + str(length) + ".txt", dtype=float)

        for length in lengths:
            df = np.loadtxt(input_missingmat + str(length) + ".txt", dtype=float)
            df = df[~np.isnan(df).any(axis=1)]
            ref = df[:, 0]        

            for i in range(1, df.shape[1]):
                #fileName = output_mse + list_algos[i - 1] + ".dat"
                comp = abs(df[:, i] - ref)
                mae_val = np.mean(comp)
                err[list_algos[i-1]].append(mae_val)
    
    for alg in err.keys():
        errors = err[alg]
        print('MAE:', alg, ':', sum(errors) / len(errors))



In [8]:
for scenario in scenarios:
    try:
        #print('------------------------------')
        #print(scenario)
        error_calculation_r_filepath = workdir + '/' + scenario + '/' + dataset + '/scripts/precision/error_calculation.r'
        with open(error_calculation_r_filepath, 'r') as r_file:
            for _, line in enumerate(r_file, 1):
                if 'lengths <- seq.int' in line:
                    steps_arange = re.findall(r'\d+', line)
                    steps_arange = [int(step_arange) for step_arange in steps_arange]
                elif 'list_algos <- c' in line:
                    list_algs = re.findall(r'"([^"]*)"', line)

        lengths, output_mse, output_rmse, output_mae, output_cor, input_missingmat, list_algos = \
            config_error_calc(steps_arange, list_algs, workdir + '/' + scenario + '/' + dataset )
        mse(lengths, output_mse, output_rmse, output_mae, output_cor, input_missingmat, list_algos)
        mae(lengths, output_mse, output_rmse, output_mae, output_cor, input_missingmat, list_algos)
    except:
        pass
    #break

MSE: brits : 0.26543467231341183
MSE: m-rnn : 0.30186371840828063
MSE: csdipy : 0.6916478820820502
MAE: brits : 0.2601759538859269
MAE: m-rnn : 0.4090353522252748
MAE: csdipy : 0.7870813573851123


In [20]:
import re

# Define the line
line = 'list_algos <- c("stmvl","cdrec_k3","cdrec_k2","tkcm","spirit","tenmf","grouse","svt","softimp","rosl","dynammo","svdimp");'

# Use regular expression to find strings within double quotes
matches = re.findall(r'"([^"]*)"', line)

# Print the extracted elements
print(matches)

['stmvl', 'cdrec_k3', 'cdrec_k2', 'tkcm', 'spirit', 'tenmf', 'grouse', 'svt', 'softimp', 'rosl', 'dynammo', 'svdimp']


In [6]:
for scenario in scenarios:
    print('------------------------------')
    print(scenario)
    if scenario == 'ts_length':
        recovery_values_path = workdir + '/' + scenario + '/' + dataset + '/recovery/values/'
        test_case = [int(case) for case in os.listdir(recovery_values_path) if case.isdigit()]
        test_case = [max(test_case)]
        print(test_case)

------------------------------
blackout
------------------------------
mcar
------------------------------
miss_disj
------------------------------
miss_over
------------------------------
miss_perc
------------------------------
ts_length
[16700]
------------------------------
ts_nbr


In [1]:
import pandas as pd
import os
import copy
import numpy as np
import math

workdir = !pwd
workdir = workdir[0]
scenarios = ['blackout', 'mcar', 'miss_disj', 'miss_over', 'miss_perc', 'ts_length', 'ts_nbr']
dataset = 'oceantidepressuremissing'

for scenario in scenarios:
    print('------------------------------')
    print(scenario)
    recovery_values_path = workdir + '/' + scenario + '/' + dataset + '/recovery/values/'

    # read reference data
    reference_file = workdir + '/' + scenario + '/' + dataset + '/recovery/values/' + 'reference.txt'
    with open(reference_file, 'r') as file:
        content = file.read()
        content = content.replace('\t', ' ')

    with open(reference_file, 'w') as file:
        file.write(content)

    reference = pd.read_csv(reference_file, delimiter=' ', header=None, index_col=0)
    mean_reference = reference.mean(axis=1)
    mean_reference = mean_reference.values.tolist()
    
    recovery_values_path = workdir + '/' + scenario + '/' + dataset + '/recovery/values/'
    test_case = [int(case) for case in os.listdir(recovery_values_path) if case.isdigit()]
    
    if scenario == 'ts_length':
        test_case = [max(test_case)]
        
    # traverse through test case results
    for test_case in os.listdir(recovery_values_path):
        if not test_case.isdigit():
            continue

        test_case_path = recovery_values_path + str(test_case)
        for alg in os.listdir(test_case_path):
            if not alg.endswith('.txt'):
                continue

            result_file = test_case_path + '/' + alg
            with open(result_file, 'r') as file:
                content = file.read()
                content = content.replace('\t', ' ')

            with open(result_file, 'w') as file:
                file.write(content)

            result = pd.read_csv(result_file, delimiter=' ', header=None, index_col=0)
            mean_imputed = result.mean(axis=1)
            mean_imputed = mean_imputed.values.tolist()

            mean_reference_ = copy.deepcopy(mean_reference)
            mean_reference_ = mean_reference_[:len(mean_imputed)]

            squared_errors = [(mean_imputed[i] - mean_reference_[i]) ** 2 for i in range(len(mean_imputed)) \
                              if not math.isnan(mean_imputed[i]) and not math.isnan(mean_reference_[i])]
            mse = sum(squared_errors) / len(squared_errors)

            print(scenario, '\t', test_case,'\t', mse, '\t',  alg,'\t'  )#, sum(squared_errors), len(squared_errors))
                     

------------------------------
blackout
blackout 	 10 	 0.0003096667856458691 	 dynammo10.txt 	
blackout 	 100 	 0.0014541309050718164 	 dynammo100.txt 	
blackout 	 20 	 0.0005150879211705993 	 dynammo20.txt 	
blackout 	 30 	 0.0006681200904464471 	 dynammo30.txt 	
blackout 	 40 	 0.0009670445696967735 	 dynammo40.txt 	
blackout 	 50 	 0.0010350643724698352 	 dynammo50.txt 	
blackout 	 60 	 0.0013290504877981704 	 dynammo60.txt 	
blackout 	 70 	 0.0014679286136887943 	 dynammo70.txt 	
blackout 	 80 	 0.0014541309050718164 	 dynammo80.txt 	
blackout 	 90 	 0.0014541309050718164 	 dynammo90.txt 	
------------------------------
mcar
mcar 	 10 	 0.3745424887967409 	 cdrec10_k2.txt 	
mcar 	 10 	 1.970942538811726e-05 	 cdrec10_k3.txt 	
mcar 	 10 	 5.706647689178418e-33 	 dynammo10.txt 	
mcar 	 10 	 6.801476840833738e+300 	 grouse10.txt 	
mcar 	 10 	 0.3882711278805839 	 rosl10.txt 	
mcar 	 10 	 0.3692231772030024 	 softimp10.txt 	
mcar 	 10 	 1760.502510013707 	 stmvl10.txt 	
mcar 	 10 	 1.

In [8]:
#miss_over 	 3 	 5.706647689178418e-33 	 dynammo3.txt 	
scenario = 'miss_over'
test_case = 3
alg = 'dynammo3.txt'

result_file = workdir + '/' + scenario + '/' + dataset + '/recovery/values/' + str(test_case) + '/' + alg
result = pd.read_csv(result_file, delimiter=' ', header=None, index_col=0)
result = result.mean(axis=1)
result.rename(columns={'old_column_name': 'new_column_name'}, inplace=True)

#result.to_csv('Tide_pressure_missing_imputed.csv')

0
1        0.965
2        0.914
3        0.891
4        0.874
5        0.819
         ...  
16704   -1.539
16705   -1.500
16706   -1.450
16707   -1.403
16708   -1.338
Length: 16708, dtype: float64

In [None]:
import pandas as pd
test_case = 4
alg = 'tkcm'
result_file = workdir + '/' + scenario + '/' + dataset + '/recovery/values/' + str(test_case) + '/' + alg + str(test_case) + '.txt'
with open(result_file, 'r') as file:
    content = file.read()
    content = content.replace('\t', ' ')

with open(result_file, 'w') as file:
    file.write(content)
    
result = pd.read_csv(result_file, delimiter=' ', header=None, index_col=0)
mean_imputed = result.mean(axis=1)
mean_imputed = mean_imputed.values.tolist()
mean_imputed

In [27]:
reference_file = workdir + '/' + scenario + '/' + dataset + '/recovery/values/' + 'reference.txt'
with open(reference_file, 'r') as file:
    content = file.read()
    content = content.replace('\t', ' ')

with open(reference_file, 'w') as file:
    file.write(content)
    
reference = pd.read_csv(reference_file, delimiter=' ', header=None, index_col=0)
mean_reference = reference.mean(axis=1)
mean_reference = mean_reference.values.tolist()[:len(mean_imputed)]

(16708,)