In [1]:
import json
import os
import re
from glob import glob
from warnings import warn

import h5py
import matplotlib.pyplot as plt
import mrcfile
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
!cat readme.md

# training data - part 1

#### subtomogram, density map, and labels (.json)
#### All training data: 10 types, 500 each, 5000 total

part 1: 4 types of macromolecules, 500 each (2000 total)
 
part 2: 6 types of macromolecules, 500 each (3000 total)


#### 1. subtomogram_mrc 
filenames: tomotarget*.mrc, * = 0,1,2,...,4999

This is our input data!
The subtomogram of a single macromolecule and some part of its neighbors. 
The size is 32 * 32 * 32.

.mrc files can be opened with the Python package mrcfile. 
They can be visualized with the software Chimera (https://www.cgl.ucsf.edu/chimera/). This software is very easy to use.

#### 2. subtomogram_png 
filenames: tomotarget*.png, * = 0,1,2,...,4999

The slices of a subtomogram. Since the size of a subtomogram is 32^3, there are 32 subfigures in each image.

This is to help understand the content in the subtomogram.

#### 3. json:

##### there are two types of json files in the json folder:

filenames: target*.json: this is the

In [48]:
# Class-name -> integer-label lookup. Each name's label is its position
# in the list below, so the order of the list is significant.
label_dict = {
    name: code
    for code, name in enumerate([
        '1bxn',
        '1f1b',
        '1yg6',
        '2byu',
        '3gl1',
        '4d4r',
        '6t3e',
        '2ldb',
        '2h12',
        '3hhb',
    ])
}

In [49]:
def read(path):
    """Load an MRC file and return its header and axis-reversed volume.

    The file at ``path`` is opened with ``mrcfile`` in ``'r'`` mode. Its
    data must be three-dimensional (checked with ``assert``); the axes are
    then reordered from ``(0, 1, 2)`` to ``(2, 1, 0)``.

    Returns a dict with the keys ``'header'`` and ``'data'``.
    """
    with mrcfile.open(path, 'r') as mrc:
        volume = mrc.data
        assert volume.ndim == 3
        return {'header': mrc.header, 'data': volume.transpose([2, 1, 0])}

def read_mrcdata(path):
    """Return only the ``'data'`` entry of ``read(path)``."""
    contents = read(path)
    return contents['data']

In [50]:
def read_jsondata(x):
    """Return the class name stored in a label JSON file.

    Parameters
    ----------
    x : str or path-like
        Path of a JSON file whose top-level object has a ``'name'`` entry.

    Returns
    -------
    The value stored under ``'name'``.

    Raises
    ------
    KeyError
        If the JSON object has no ``'name'`` entry.
    """
    with open(x) as f:
        data = json.load(f)

    # Only 'name' is needed for the label. Other entries such as 'loc' and
    # 'rotate' are deliberately not required, so files lacking them still load.
    return data['name']

In [51]:
# Sample indices 0..1999 (part 1 of the data set holds 2000 samples).
idx = np.arange(2000)

In [52]:
# One row per sample index, with the relative paths of the subtomogram,
# density map and label file that carry that index in their file name.
df = pd.DataFrame({'idx': idx}).assign(
    subtomogram_path=lambda frame: frame['idx'].map(
        lambda i: './subtomogram_mrc/tomotarget%s.mrc' % i),
    densitymap_path=lambda frame: frame['idx'].map(
        lambda i: './densitymap_mrc/packtarget%s.mrc' % i),
    label_path=lambda frame: frame['idx'].map(
        lambda i: './json/target%s.json' % i),
)

In [53]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,idx,subtomogram_path,densitymap_path,label_path
0,0,./subtomogram_mrc/tomotarget0.mrc,./densitymap_mrc/packtarget0.mrc,./json/target0.json
1,1,./subtomogram_mrc/tomotarget1.mrc,./densitymap_mrc/packtarget1.mrc,./json/target1.json
2,2,./subtomogram_mrc/tomotarget2.mrc,./densitymap_mrc/packtarget2.mrc,./json/target2.json
3,3,./subtomogram_mrc/tomotarget3.mrc,./densitymap_mrc/packtarget3.mrc,./json/target3.json
4,4,./subtomogram_mrc/tomotarget4.mrc,./densitymap_mrc/packtarget4.mrc,./json/target4.json


In [54]:
# Read both volumes and the class name for every row from the path columns.
df['subtom'] = df.subtomogram_path.map(read_mrcdata)
df['density'] = df.densitymap_path.map(read_mrcdata)
df['class'] = df.label_path.map(read_jsondata)

# Translate class names to integer labels in a single step.
df['label'] = df['class'].map(label_dict).astype(int)

In [55]:
# Number of samples per integer label.
df.label.value_counts()

3    500
2    500
1    500
0    500
Name: label, dtype: int64

In [56]:
# One sub-frame per class label. Grouping on the label itself, instead of
# cutting the rows into four equal positional chunks, keeps every sub-frame
# single-class even when the rows are not stored as contiguous, equally
# sized class blocks. sort=False keeps the groups in order of first
# appearance, so contiguous input yields the same chunks as a positional split.
all_class = [class_df for _, class_df in df.groupby('label', sort=False)]


In [57]:
# Fixed seed so that Restart & Run All reproduces the same split.
SPLIT_SEED = 42

source_train = []
source_test = []
target_train = []
target_test = []

# Iterate over whatever per-class frames exist instead of assuming four.
for class_df in all_class:
    # Hold out 10% of the class for testing (rows are shuffled by the split).
    class_train, class_test = train_test_split(
        class_df, test_size=0.1, random_state=SPLIT_SEED)

    # Halve each part by position: the first half goes to the source domain,
    # the second to the target domain. With an odd row count the first half
    # receives the extra row, matching np.array_split(..., 2).
    train_cut = (len(class_train) + 1) // 2
    test_cut = (len(class_test) + 1) // 2

    source_train.append(class_train.iloc[:train_cut])
    source_test.append(class_test.iloc[:test_cut])
    target_train.append(class_train.iloc[train_cut:])
    target_test.append(class_test.iloc[test_cut:])


In [58]:
# Merge the per-class pieces into one frame with a fresh 0..n-1 index.
source_train = pd.concat(source_train, ignore_index=True)

In [59]:
# Merge the per-class pieces into one frame with a fresh 0..n-1 index.
source_test = pd.concat(source_test, ignore_index=True)

In [60]:
# Merge the per-class pieces into one frame with a fresh 0..n-1 index.
target_train = pd.concat(target_train, ignore_index=True)

In [61]:
# Merge the per-class pieces into one frame with a fresh 0..n-1 index.
target_test = pd.concat(target_test, ignore_index=True)

In [62]:
def write_df_as_hdf(out_path, out_df):
    """Write every column of ``out_df`` to its own dataset in an HDF5 file.

    The values of each column are stacked along a new leading axis and
    stored under the column name with gzip compression. If h5py rejects the
    stacked array with a ``TypeError``, the column is retried as byte
    strings (without compression). Columns that cannot be stacked or stored
    are reported with ``print`` and skipped, so one bad column does not
    abort the export.

    Parameters
    ----------
    out_path : str or path-like
        Destination file; opened in ``'w'`` mode, so an existing file is
        replaced.
    out_df : pandas.DataFrame
        Frame whose columns are written.
    """
    with h5py.File(out_path, 'w') as h:
        for k, arr_dict in tqdm(out_df.to_dict().items()):
            # np.stack requires a real sequence; a dict view is rejected by
            # current NumPy releases.
            values = list(arr_dict.values())
            try:
                s_data = np.stack(values, 0)

                try:
                    h.create_dataset(k, data=s_data, compression='gzip')
                except TypeError as e:
                    try:
                        # np.bytes_ is the name that exists in both NumPy 1.x
                        # and 2.x (np.string_ was removed in 2.0).
                        h.create_dataset(k, data=s_data.astype(np.bytes_))
                    except TypeError as e2:
                        # Three values need three placeholders.
                        print('%s could not be added to hdf5, %s, %s'
                              % (k, repr(e), repr(e2)))

            except ValueError as e:
                print('%s could not be created, %s' % (k, repr(e)))
                # Report the distinct element shapes so a stacking mismatch
                # is easy to diagnose.
                all_shape = sorted({np.shape(x) for x in values})
                print('%s element shapes: %s' % (k, all_shape))

In [63]:
# Persist the source-domain training frame.
write_df_as_hdf(out_path='source_train.h5', out_df=source_train)

  """
100%|██████████| 8/8 [00:08<00:00,  1.09s/it]


In [64]:
# Persist the source-domain test frame.
write_df_as_hdf(out_path='source_test.h5', out_df=source_test)

  """
100%|██████████| 8/8 [00:00<00:00, 12.86it/s]


In [65]:
# Persist the target-domain training frame.
write_df_as_hdf(out_path='target_train.h5', out_df=target_train)

  """
100%|██████████| 8/8 [00:05<00:00,  1.39it/s]


In [66]:
# Persist the target-domain test frame.
write_df_as_hdf(out_path='target_test.h5', out_df=target_test)

  """
100%|██████████| 8/8 [00:00<00:00, 13.21it/s]


In [68]:
# Read the stored labels and subtomograms back in full as a sanity check.
with h5py.File('source_train.h5', mode='r') as scan_h5:
    source_train_label = scan_h5['label'][...]
    source_train_subtom = scan_h5['subtom'][...]

In [69]:
# Number of stored samples (size of the leading axis).
source_train_subtom.shape[0]

900

In [70]:
class opt:
    """Plain option holder with the HDF5 file names of the source split."""

    # File names are relative to the current working directory.
    test_path = 'source_test.h5'
    train_path = 'source_train.h5'

In [72]:
# `utils` is imported here rather than in the notebook's top import cell.
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'torch'", so it did not run to
# completion in the recorded environment -- confirm torch is installed.
import utils

# Build the loader from the options class; presumably it reads
# opt.train_path / opt.test_path -- verify against utils.DATA_LOADER.
data = utils.DATA_LOADER(opt)
print("# of training samples: ", data.ntrain)

ModuleNotFoundError: No module named 'torch'