In [18]:
import os
import matplotlib.pyplot as plt
import numpy as np
import re
from glob import glob
import pandas as pd
import mrcfile
import json
from sklearn import preprocessing
import h5py
from tqdm import tqdm
from warnings import warn
from sklearn.model_selection import train_test_split

In [2]:
# Shell escape: list the working directory. The cells below read samples from
# './<class name>/...', so one folder per class is expected here.
!ls

1bxn  1f1b  1yg6  2byu	2h12  2ldb  3gl1  3hhb	4d4r  6t3e  Untitled.ipynb


In [56]:
# Tag used as the prefix of the output HDF5 file names
# ('<snr>_train.h5' and '<snr>_test.h5').
# NOTE(review): presumably the signal-to-noise level of this dataset - confirm.
snr = 'SNR003'

In [3]:
# Class names. Each one is also the name of the sub-directory of the working
# directory from which that class's sample paths are built further down.
tomo_class = ['1bxn', '1f1b', '1yg6', '2byu', '2h12', '2ldb', '3gl1', '3hhb', '4d4r', '6t3e']

In [6]:
# Number of classes.
totClass = len(tomo_class)

In [7]:
# Give every class name an integer label equal to its position in tomo_class.
labels = [i for i in range(totClass)]
label_dict = {name: label for name, label in zip(tomo_class, labels)}

In [8]:
# Display the class-name -> integer-label mapping for a visual check.
label_dict

{'1bxn': 0,
 '1f1b': 1,
 '1yg6': 2,
 '2byu': 3,
 '2h12': 4,
 '2ldb': 5,
 '3gl1': 6,
 '3hhb': 7,
 '4d4r': 8,
 '6t3e': 9}

In [9]:
def read(path):
    """Load an MRC file and return its header together with its voxel data.

    The file is opened read-only with ``mrcfile.open``. The data must be
    three-dimensional (checked with an ``assert``); its axes are reversed
    (axis order 2, 1, 0) before it is returned.

    Parameters
    ----------
    path
        Location of the MRC file, passed straight to ``mrcfile.open``.

    Returns
    -------
    dict
        ``{'header': <file header>, 'data': <axis-reversed 3-D array>}``.

    Raises
    ------
    AssertionError
        If the data in the file is not three-dimensional.
    """
    with mrcfile.open(path, 'r') as mrc:
        hdr = mrc.header
        volume = mrc.data
        assert volume.ndim == 3
        # Reverse the axis order of the volume.
        volume = volume.transpose(2, 1, 0)

    return dict(header=hdr, data=volume)

def read_mrcdata(path):
    """Return only the data array of the MRC file at ``path``.

    Thin wrapper around ``read`` that discards the header.
    """
    loaded = read(path)
    return loaded['data']

In [44]:
def read_jsondata(x):
    """Return the class name stored in a JSON label file.

    Parameters
    ----------
    x : str or path-like
        Path of a JSON file whose top-level object has a ``'name'`` entry.

    Returns
    -------
    The value stored under ``'name'``.

    Raises
    ------
    KeyError
        If the file has no ``'name'`` entry.
    """
    with open(x) as f:
        data = json.load(f)

    # Only the class name is used. The 'loc' and 'rotate' entries were
    # previously looked up and thrown away, which made them mandatory for no
    # benefit; they are no longer required to be present.
    return data['name']

In [45]:
# Build, for every class, a table of sample paths and split it 90 % / 10 %
# into train and test. The per-class pieces are collected in two lists.
train_data = []
test_data = []

# One seeded generator shared by all classes: the split is reproducible from
# run to run, while each class still draws its own shuffle from the stream.
split_rng = np.random.RandomState(0)

for class_name in tomo_class:
    # Samples of a class are numbered 0..499.
    idx = np.arange(500)
    df = pd.DataFrame(idx, columns=['idx'])

    # Format the whole path in one step instead of relying on '%' binding
    # tighter than '+'.
    df['subtomogram_path'] = df['idx'].map(
        lambda x: './%s/subtomogram_mrc/tomotarget%s.mrc' % (class_name, x))
    df['label_path'] = df['idx'].map(
        lambda x: './%s/json_label/target%s.json' % (class_name, x))

    class_train, class_test = train_test_split(df, test_size=0.1, random_state=split_rng)
    train_data.append(class_train)
    test_data.append(class_test)
    

In [46]:
# Merge the per-class training pieces into one frame with a fresh 0..n-1 index.
# Note: this rebinds train_data from a list of frames to a single DataFrame.
train_data = pd.concat(train_data).reset_index(drop=True)

In [47]:
# Merge the per-class test pieces into one frame with a fresh 0..n-1 index.
# Note: this rebinds test_data from a list of frames to a single DataFrame.
test_data = pd.concat(test_data).reset_index(drop=True)

In [48]:
# Load every training sub-volume into memory and read each sample's class name
# from its JSON label file.
train_data['subtom'] = train_data['subtomogram_path'].map(read_mrcdata)
train_data['class'] = train_data['label_path'].map(read_jsondata)

# Translate class names to integer labels via label_dict. A name that is not a
# key of label_dict maps to NaN, which makes the int cast below raise.
train_data['label'] = train_data['class'].map(label_dict)
train_data['label'] = train_data.label.astype(int)

In [51]:
# Load every test sub-volume into memory and read each sample's class name
# from its JSON label file.
test_data['subtom'] = test_data['subtomogram_path'].map(read_mrcdata)
test_data['class'] = test_data['label_path'].map(read_jsondata)

# Translate class names to integer labels via label_dict. A name that is not a
# key of label_dict maps to NaN, which makes the int cast below raise.
test_data['label'] = test_data['class'].map(label_dict)
test_data['label'] = test_data.label.astype(int)

In [52]:
# Sanity check: number of training samples per integer label.
train_data['label'].value_counts()

7    450
3    450
6    450
2    450
9    450
5    450
1    450
8    450
4    450
0    450
Name: label, dtype: int64

In [53]:
# Sanity check: number of test samples per integer label.
test_data['label'].value_counts()

9    50
8    50
7    50
6    50
5    50
4    50
3    50
2    50
1    50
0    50
Name: label, dtype: int64

In [54]:
def write_df_as_hdf(out_path, out_df):
    """Write every column of a DataFrame to an HDF5 file, one dataset per column.

    The values of each column are stacked along a new leading axis and stored
    (gzip-compressed) under the column's name. If h5py rejects the stacked
    array with a ``TypeError``, the column is retried as byte strings. Columns
    that cannot be stacked or stored are reported on stdout and skipped; the
    remaining columns are still written.

    Parameters
    ----------
    out_path
        Path of the HDF5 file to create; an existing file is overwritten
        (the file is opened in ``'w'`` mode).
    out_df : pandas.DataFrame
        Frame whose columns are written.

    Returns
    -------
    None
    """
    with h5py.File(out_path, 'w') as h:
        for k, arr_dict in tqdm(out_df.to_dict().items()):
            # np.stack needs a real sequence: a dict view triggers a
            # deprecation warning on older NumPy and a TypeError on newer.
            values = list(arr_dict.values())
            try:
                s_data = np.stack(values, 0)

                try:
                    h.create_dataset(k, data=s_data, compression='gzip')
                except TypeError as native_err:
                    # Retry as byte strings. np.bytes_ is the spelling that
                    # survives NumPy 2.0 (np.string_ was removed there).
                    try:
                        h.create_dataset(k, data=s_data.astype(np.bytes_))
                    except TypeError as bytes_err:
                        # One placeholder per argument; the former two-placeholder
                        # format made this handler itself raise TypeError.
                        print('%s could not be added to hdf5, %s, %s'
                              % (k, repr(native_err), repr(bytes_err)))

            except ValueError as stack_err:
                print('%s could not be created, %s' % (k, repr(stack_err)))
                # Report the distinct element shapes; mismatched shapes are the
                # usual reason stacking fails.
                all_shape = sorted(set(np.shape(x) for x in values))
                print('%s has element shapes %s' % (k, all_shape))

In [58]:
# Persist the training frame as '<snr>_train.h5'.
train_h5_path = snr + '_train.h5'
write_df_as_hdf(train_h5_path, train_data)

  """
100%|██████████| 6/6 [00:54<00:00,  9.09s/it]


In [59]:
# Persist the test frame as '<snr>_test.h5'.
test_h5_path = snr + '_test.h5'
write_df_as_hdf(test_h5_path, test_data)

  """
100%|██████████| 6/6 [00:02<00:00,  2.40it/s]
