## Convert THEx Data to HDF5
1. load CSVs

In [3]:
import pandas as pd
# 
#  Data Note
#  The following data comes from an initialized THEx MultiModel. 
#  So, it represents all rows that have valid values for the mags and colors
#  
dpath = "/Users/marina/Documents/PhD/research/astro_research/data/testing/"
# Read the feature table (X) and the label table (y) from the same directory.
X, y = (pd.read_csv(dpath + csv_name) for csv_name in ("X.csv", "y.csv"))

## Save to HDF5 with same format as WINE dataset
Hierarchy:
- all
- folds
    - 1
        - tests
            - 1
        - training
            - 1
            - 2
            - ...
            - 8
            - 9
    - 2
    - ...
    - 9


Usage:

- Use folds/1/training/ 1 - 8 as training
- Use folds/1/training/9 as validation
- Use folds/1/tests/1 as testing


In [4]:
# Select the rows of X that belong to one transient class (saved to HDF5 further below).
class_name = "Unspecified Ia"
# Row-wise boolean mask over y:
#   regex=False -> the class name is matched literally, not as a regular expression
#   na=False    -> rows with a missing transient_type count as "no match" instead of
#                  putting NaN into the mask
class_mask = y['transient_type'].str.contains(class_name, regex=False, na=False).to_numpy()
class_indices = y.index[class_mask]
# X and y are separate tables aligned row-by-row, so the selection is applied by
# position (boolean array) rather than by feeding y's index labels to iloc.
class_X = X.loc[class_mask]

Split into folds, randomly

In [5]:
from sklearn.model_selection import KFold
# Fixed seed: with shuffle=True and no random_state the fold membership changes on
# every run, so the saved HDF5 file could not be reproduced.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Each split yields (remaining indices, held-out indices); only the held-out part
# defines a fold. The indices are positional, hence .iloc below.
fold_sets = [fold_indices for _, fold_indices in kf.split(class_X)]

# Folds 0-7 -> training, fold 8 -> validation, fold 9 -> testing.
training_folds = [class_X.iloc[fold_sets[i]] for i in range(8)]
val_fold = class_X.iloc[fold_sets[8]]
test_fold = class_X.iloc[fold_sets[9]]


In [9]:
import numpy as np
# Preview the test fold converted to a float32 array.
# (The stray bare `np.float32` expression was a no-op and has been dropped.)
test_fold.to_numpy(dtype=np.float32)

array([[ 1.6366318e+01,  1.6295883e+01,  1.6851646e+01, ...,
        -3.8579464e-01, -2.8179741e-01, -1.3949776e-01],
       [ 1.2880881e+01,  1.2980280e+01,  1.3428139e+01, ...,
        -3.9757919e-01, -1.8321991e-01, -2.0078182e-01],
       [ 1.6922680e+01,  1.7184122e+01,  1.7317171e+01, ...,
        -3.0387878e-01, -1.0267487e+00, -2.3467064e-01],
       ...,
       [ 1.3286318e+01,  1.3557161e+01,  1.3875146e+01, ...,
        -4.1408443e-01, -1.9860840e-01, -2.0015907e-01],
       [ 1.9503904e+01,  1.9722898e+01,  1.9776192e+01, ...,
        -1.9147301e-01,  1.1680794e-01,  2.4200630e-01],
       [ 1.8643970e+01,  1.8365751e+01,  1.9170395e+01, ...,
         2.1871376e-01, -7.4825287e-03, -4.9394608e-02]], dtype=float32)

### HDF5 Background
Groups are the container mechanism by which HDF5 files are organized. From a Python perspective, they operate somewhat like dictionaries. In this case the “keys” are the names of group members, and the “values” are the members themselves (Group and Dataset) objects.

From here https://docs.h5py.org/en/stable/high/group.html

Pandas has an HDFStore class, but I found it did not work well here: it was creating 4 members for each DataFrame, so I used h5py directly.

The following works; at least, it produces the same members and structure as the wine dataset.

In [40]:
import h5py
thex_data_path = dpath + class_name.replace(" ", "_") + 'X.hdf5'

# Write every fold as a float32 "data" dataset under the wine-style hierarchy:
#   folds/1/training/1..8 -> training folds
#   folds/1/training/9    -> validation fold
#   folds/1/tests/1       -> test fold
#
# The file is opened in a `with` block so the handle is closed even if one of the
# writes raises. A handle left open by a failed run is what makes the next 'w' open
# fail with "unable to truncate a file which is already open"; a handle that is
# already stuck open from an earlier run still has to be closed (or the kernel
# restarted) once.
with h5py.File(thex_data_path, 'w') as hfile:
    for fold_number, fold in enumerate(training_folds, 1):
        training = hfile.create_group("folds/1/training/" + str(fold_number))
        training.create_dataset("data", data=fold.to_numpy(dtype=np.float32))

    val = hfile.create_group("folds/1/training/9")
    val.create_dataset("data", data=val_fold.to_numpy(dtype=np.float32))

    test = hfile.create_group("folds/1/tests/1")
    test.create_dataset("data", data=test_fold.to_numpy(dtype=np.float32))

IOError: Unable to create file (unable to truncate a file which is already open)

## Init BigDataset with THEx HDF5 data

In [12]:
import sys
sys.path
sys.path.append('buml') 
import os
import Data.utils

dpath = "/Users/marina/Documents/PhD/research/astro_research/data/testing/"
thex_data_path = dpath + "Unspecified_IaX.hdf5"

# Wine-dataset alternative, kept for reference:
# os.environ["DATASETSPATH"]=""
# data_source = "red_wine.hdf5"
# dataset_file = os.path.join(os.environ["DATASETSPATH"], data_source)
dataset_file = thex_data_path

# One BigDataset per split, built in the order training, testing, validation.
# Each is selected by a regexp over the HDF5 group path and reads the "data" member.
training_dataset, testing_dataset, validation_dataset = [
    Data.BigDataset(dataset_file, route, "data")
    for route in (
        "/folds/1/training/(1|2|3|4|5|6|7|8)",
        "/folds/1/tests/.*",
        "/folds/1/training/9",
    )
]



 BigDataset Working on /Users/marina/Documents/PhD/research/astro_research/data/testing/Unspecified_IaX.hdf5
Entries /folds/1/training/(1|2|3|4|5|6|7|8)


 BigDataset Working on /Users/marina/Documents/PhD/research/astro_research/data/testing/Unspecified_IaX.hdf5
Entries /folds/1/tests/.*


 BigDataset Working on /Users/marina/Documents/PhD/research/astro_research/data/testing/Unspecified_IaX.hdf5
Entries /folds/1/training/9


In [13]:
# Sanity check that the THEx HDF5 file is readable through BigDataset: ask the
# validation split for 3 samples and display them.
validation_dataset.sample_data(3)

(array([[16.976038  , 17.019712  , 17.436167  , 17.2412    , 17.660822  ,
         16.198233  , 18.533974  , 16.52771   , 16.403433  , 16.841549  ,
          0.7778053 , -0.22148705,  0.46012878, -0.41962242, -0.8731518 ,
         -0.2052002 , -0.31383896, -0.12427711, -0.17816353],
        [13.7143755 , 14.08379   , 14.070934  , 14.294066  , 14.621517  ,
         13.278459  , 15.238683  , 13.500901  , 13.16716   , 13.851856  ,
          0.4359169 , -0.2102766 ,  0.3565588 , -0.32745075, -0.61716557,
          0.11129856, -0.350955  , -0.3337412 , -0.2319336 ],
        [17.156197  , 17.688894  , 17.606287  , 17.89682   , 18.546644  ,
         16.648182  , 19.960075  , 17.252419  , 16.703648  , 17.818108  ,
          0.5080147 , -0.2079258 ,  0.4500904 , -0.64982414, -1.4134312 ,
         -0.0554657 , -0.5656891 , -0.5487709 ,  0.12921333]],
       dtype=float32),)

## Wine Dataset Study

Outcomes

1. 159 samples in each training fold
2. 168 in validation 
3. 159 in testing

There is OVERLAP among all 3 splits, so testing data also appears inside the training and validation sets:
46/159 testing samples in the whole training set
9/159 testing samples in validation 

And overlap in training/validation.

wine dataset HDF5 data has the following hierarchy:
- all
- folds
    - 1
        - tests
            - 1
        - training
            - 1
            - 2
            - ...
            - 8
            - 9
    - 2
    - ...
    - 9

### Examine if there is any data overlap in training/validation/testing

In [5]:
import sys
sys.path
sys.path.append('buml') 
import os
import Data.utils
# The wine file sits next to the notebook, so the datasets root is left empty.
os.environ["DATASETSPATH"] = ""
data_source = "red_wine.hdf5"
dataset_file = os.path.join(os.environ["DATASETSPATH"], data_source)

# Training uses folds 1-8; fold 9 is held out as validation; tests/* is the test set.
training = "/folds/1/training/(1|2|3|4|5|6|7|8)"
training_dataset, testing_dataset, validation_dataset = [
    Data.BigDataset(dataset_file, route, "data")
    for route in (training, "/folds/1/tests/.*", "/folds/1/training/9")
]



 BigDataset Working on red_wine.hdf5
Entries /folds/1/training/(1|2|3|4|5|6|7|8)


 BigDataset Working on red_wine.hdf5
Entries /folds/1/tests/.*


 BigDataset Working on red_wine.hdf5
Entries /folds/1/training/9


In [34]:
# Take the entry at position 1 of what get_file(0, 0) returns on the testing dataset.
a=testing_dataset.get_file(0, 0)[1]

In [35]:
# Check the dimensionality of the single entry pulled out above.
a.shape

(19,)

In [37]:
# Transposing does not turn `a` into a column: the shape displayed for `b` is the
# same as the one displayed for `a`.
b=a.T
b.shape

(19,)

In [39]:
# Promote `a` to 2-D first, then transpose, to get an actual column vector.
np.atleast_2d(a).T

array([[12.880881  ],
       [12.98028   ],
       [13.428139  ],
       [13.180984  ],
       [13.500116  ],
       [12.357191  ],
       [14.06948   ],
       [12.381919  ],
       [12.198699  ],
       [12.779498  ],
       [ 0.5236902 ],
       [-0.20070362],
       [ 0.5472574 ],
       [-0.3191328 ],
       [-0.5693636 ],
       [ 0.15849209],
       [-0.3975792 ],
       [-0.18321991],
       [-0.20078182]], dtype=float32)

In [None]:
# Make sure no testing/training/validation overlaps
training_indices = [0, 1, 2, 3, 4, 5, 6, 7]
validation_index = [8]

# `testing_fold` was used below without ever being assigned, which raises a NameError
# on a fresh kernel.
# NOTE(review): loaded the same way validation_fold is loaded (element 0, file index 0);
# confirm that this is the intended test fold.
testing_fold = testing_dataset.get_file(element=0, index=0)

# Read every training fold once instead of once per test sample.
# NOTE(review): assumes get_file returns the same rows on every call - confirm.
training_fold_arrays = [
    training_dataset.get_file(element=0, index=ti) for ti in training_indices
]

# Total number of training rows, reported by a later cell. Computed once here rather
# than being reset and recounted inside the comparison loop.
total_training = sum(fold.shape[0] for fold in training_fold_arrays)

# Count every (test row, training row) pair whose values are all identical.
test_overlap_count = 0
for test_sample in testing_fold:
    for training_fold in training_fold_arrays:
        for row in training_fold:
            if (row == test_sample).all():
                test_overlap_count += 1
print("Overlap in testing and training " + str(test_overlap_count))

In [None]:
# Count (test row, validation row) pairs that are identical in every column.
validation_fold = validation_dataset.get_file(element=0, index=0)
val_overlap_count = 0
for test_position in range(testing_fold.shape[0]):
    candidate = testing_fold[test_position]
    val_overlap_count += sum(
        1 for validation_row in validation_fold
        if (validation_row == candidate).all()
    )
print("Overlap in validation and testing " + str(val_overlap_count))

In [None]:
# No overlap with training/validation
# Read every training fold once up front; previously each fold was re-read from the
# dataset for every single validation row.
# NOTE(review): assumes get_file returns the same rows on every call - confirm.
training_fold_arrays = [
    training_dataset.get_file(element=0, index=ti) for ti in training_indices
]

# Count every (validation row, training row) pair whose values are all identical.
matching_rows = 0
for validation_sample in validation_fold:
    for training_fold in training_fold_arrays:
        for row in training_fold:
            if (row == validation_sample).all():
                matching_rows += 1
print("Number of matching rows in validation + training " + str(matching_rows))


In [None]:
# Report how many rows each split holds, one line per split.
row_totals = [
    ("Total rows in training ", total_training),
    ("Total rows in validation ", validation_fold.shape[0]),
    ("Total rows in testing ", testing_fold.shape[0]),
]
for label, row_count in row_totals:
    print(label + str(row_count))

### Manually pulling down wine-dataset and manually examining entries 

In [None]:
import h5py
# Open the wine file read-only. The handle is left open on purpose: the next cell
# walks the groups through `wine_f`.
wine_f = h5py.File("red_wine.hdf5", "r")
# wine_f["/"]

print(wine_f.name)
# `keys` is a method; without the call, print shows the bound-method repr instead of
# the names of the top-level members.
print(list(wine_f.keys()))

In [None]:

import re

# Normalise the element name(s) to a tuple.
element_names = "data"
if not isinstance(element_names, tuple):
    element_names = (element_names,)

# entries_regexp = "/folds/1/training/(1|2|3|4|5|6|7|8)" # Training 
# entries_regexp="/folds/1/tests/.*"  # Testing 
entries_regexp = "/folds/1/training/9"  # Validation 

# One pattern per level of the HDF5 path; the leading "/" leaves an empty first piece.
pats = entries_regexp.split("/")
pats.remove("")

"""
Coming up with entries:

We pass in "/folds/1/training/(1|2|3|4|5|6|7|8)" as the 'entries_regexp' 
so we are saying to use folds 1-8 for training

So, it selects the parts of the HDF5 dataset that are located at the part of the hierarhcy we
are selecting, using the regexp. 

- Use folds/1/training/ 1 - 8 as training
- Use folds/1/training/9 as validation
- Use folds/1/testing/1 as testing

"""

# Walk down from the root one level per pattern, keeping only the members whose
# name matches that level's pattern in full.
entries = [wine_f["/"]]
for level_pattern in pats:
    anchored = re.compile("^%s$" % level_pattern)
    entries = [
        member
        for group in entries
        for member_name, member in group.items()
        if anchored.match(str(member_name))
    ]
entries

#### The above does the exact same thing as:
# entries = [wine_f["/"]]
# for p in pats:
#     entries = [v for r in entries for k,
#                v in r.items() if re.match("^%s$" % p, str(k))]

## Match HDF5 Data to Wine dataset on Kaggle site
This is to ensure these are the same dataset, and they do appear to be. 

OUTCOME:

Data comes from wine dataset from Kaggle
Input variables (based on physicochemical tests):
1. fixed acidity (tartaric acid - g / dm^3)
2. volatile acidity (acetic acid - g / dm^3)
3. citric acid (g / dm^3)
4. residual sugar (g / dm^3)
5. chlorides (sodium chloride - g / dm^3)
6. free sulfur dioxide (mg / dm^3)
7. total sulfur dioxide (mg / dm^3)
8. density (g / cm^3)
9. pH
10. sulphates (potassium sulphate - g / dm^3)
11. alcohol (% by volume)

Output variable (based on sensory data): 
1. quality (score between 0 and 10)

In [None]:
import sys
sys.path
sys.path.append('buml') 
import os
import Data.utils
# Training folds 1-8 of the wine file; the datasets root is left empty because the
# file is addressed relative to the notebook.
training = "/folds/1/training/(1|2|3|4|5|6|7|8)"
data_source = "red_wine.hdf5"
os.environ["DATASETSPATH"] = ""
dataset_file = os.path.join(os.environ["DATASETSPATH"], data_source)
training_dataset = Data.BigDataset(dataset_file, training, "data")

In [None]:
# Get sample row and see if it is in other dataset
training_fold_0 = training_dataset.get_file(element = 0, index= 0)
sample_row = training_fold_0[0]
sample_row.tolist()

In [None]:
import pandas as pd
wine_test_path = "~/Documents/PhD/research/astro_research/data/testing/wineQualityReds.csv"
wine_dataset = pd.read_csv(wine_test_path)

# Keep only the input variables: drop the quality score and the 'Unnamed: 0' column
# (presumably a saved row index - verify against the CSV).
non_feature_columns = ['Unnamed: 0', 'quality']
train_wine_dataset = wine_dataset.drop(columns=non_feature_columns)

In [None]:
import numpy as np
# Tolerances for the comparison; identical for every row, so set once up front.
rtol = 1e-05
atol = 1e-08

# Scan the CSV and report every row that equals the HDF5 sample row within tolerance.
for index, row in train_wine_dataset.iterrows():
    res = np.allclose(sample_row, row, rtol=rtol, atol=atol)
    if not res:
        continue
    print(row)
    print("Row index match: " + str(index))
        
        

In [None]:
# options, args =parser.parse_args(["--theano", 
#                                   "--form", "MoG", 
#                                   "--dataset", "red_wine.hdf5", 
#                                   "--training_route", "/folds/1/training/(1|2|3|4|5|6|7|8)",
#                                  "--validation_route", "/folds/1/training/9",
#                                  "--test_route", "/folds/1/tests/.*",
#                                  "--samples_name", "data",
#                                  "--hlayers", "2", # 2 hidden layers
#                                   "--layerwise",
#                                   "--lr", "0.02",
#                                   "--wd", "0.02",
#                                   "--n_components", "10",
#                                   "--epoch_size", "100",
#                                   "--momentum", "0.9",
#                                   "--units", "100",
#                                   "--pretraining_epochs", "5",
#                                   "--validation_loops", "20",
#                                   "--epochs", "20",
#                                   "--normalize",
#                                   "--batch_size", "100",
#                                   "--show_training_stop", "red_wine"])
 