# Single-Molecule Localization Microscopy (SMLM) 2D Digits 123 and TOL letters datasets

## Download data

1. Downloaded dataset from https://data.4tu.nl/articles/dataset/Single-Molecule_Localization_Microscopy_SMLM_2D_Digits_123_and_TOL_letters_datasets/14074091/1
2. Unzipped dataset

## Preprocess data

In [1]:
from scipy.io import loadmat
import os
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import pyarrow as pa
import json
import shutil

## Convert .mat to .parquet and add in class information

In [2]:
# Class names in label-index order: position k in this tuple is class index k.
_class_names = ("one", "two", "three", "T", "O", "L")

# Forward map: class index -> class name.
gt_label_map = dict(enumerate(_class_names))

# Reverse map: class name -> class index, derived from the forward map so
# the two can never disagree.
gt_label_map_lookup = {name: idx for idx, name in gt_label_map.items()}

# From here on gt_label_map is rebound to the UTF-8 encoded JSON form of the
# forward map (json.dumps turns the integer keys into strings).
gt_label_map = json.dumps(gt_label_map).encode("utf-8")

In [None]:
####  testing
# Scratch cell for visual inspection: every .mat file is loaded and printed,
# and each particle of the class whose label index is 1 ("two") is shown as a
# scatter plot. Nothing is written to parquet here.

import matplotlib.pyplot as plt

## Create folder for preprocessed files
output_file_path = "./data/"
os.makedirs(output_file_path, exist_ok=True)

## Load in separately imaged letters and digits
# folders[k] is the .mat file for the class named labels[k].
folders = [
    "Letters/TOL_Imaged_Separately/Particles_L",
    "Letters/TOL_Imaged_Separately/Particles_O",
    "Letters/TOL_Imaged_Separately/Particles_T",
    "Digits/Imaged_Separately/Particles_1",
    "Digits/Imaged_Separately/Particles_2",
    "Digits/Imaged_Separately/Particles_3",
]
labels = [
    "L",
    "O",
    "T",
    "one",
    "two",
    "three",
]
# Validate the pairing once, up front, instead of on every loop iteration.
if len(labels) != len(folders):
    raise ValueError("folders and labels must have the same length")

for f, label in zip(folders, labels):
    mat = loadmat(f)
    print(mat)
    # Only the class with label index 1 is plotted; skip the others.
    if gt_label_map_lookup[label] != 1:
        continue
    for particle in mat["Particles"][0, :]:
        # First field of the particle record is used as the localisation
        # positions; columns 0 and 1 are plotted as x and y.
        mat_pos = particle[0][0][0]
        plt.scatter(mat_pos[:, 0], mat_pos[:, 1])
        plt.show()
        # Block until the user presses Enter so plots can be stepped through.
        input("stop")


In [3]:
## Create folder for preprocessed files
output_file_path = "./data/"
os.makedirs(output_file_path, exist_ok=True)

## Load in separately imaged letters and digits
# folders[k] is the .mat file for the class named labels[k].
folders = [
    "Letters/TOL_Imaged_Separately/Particles_L",
    "Letters/TOL_Imaged_Separately/Particles_O",
    "Letters/TOL_Imaged_Separately/Particles_T",
    "Digits/Imaged_Separately/Particles_1",
    "Digits/Imaged_Separately/Particles_2",
    "Digits/Imaged_Separately/Particles_3",
]
labels = ["L", "O", "T", "one", "two", "three"]
# Validate the pairing once, up front, instead of on every loop iteration.
if len(labels) != len(folders):
    raise ValueError("folders and labels must have the same length")

for f, label in zip(folders, labels):
    mat = loadmat(f)
    # Row 0 of mat["Particles"] holds one record per particle.
    particles = mat["Particles"][0, :]
    for i, particle in enumerate(particles):
        # Unwrap the nested record once; field 0 is used as the positions
        # and field 1 as the per-localisation sigma.
        record = particle[0][0]
        mat_pos = record[0]
        mat_sigma = record[1]

        x = pa.array(mat_pos[:, 0])
        y = pa.array(mat_pos[:, 1])
        sigma = pa.array(mat_sigma[:, 0])
        # Every localisation is assigned channel 0.
        channel = pa.array([0] * len(mat_pos[:, 0]))
        table = pa.table(
            [x, y, sigma, channel], names=["x", "y", "sigma", "channel"]
        )

        # Per-file metadata: the integer class index (as a string) plus the
        # encoded index -> name map.
        meta_data = {
            "gt_label": str(gt_label_map_lookup[label]),
            "gt_label_map": gt_label_map,
        }
        # merge existing with new meta data and save (existing entries win)
        old_metadata = table.schema.metadata
        merged_metadata = {**meta_data, **(old_metadata or {})}
        table = table.replace_schema_metadata(merged_metadata)

        # One parquet file per particle, named <class>_<particle index>.
        file = f"{label}_{i}.parquet"
        save_loc = os.path.join(output_file_path, file)
        pq.write_table(table, save_loc)

## Split data into training (90%) and test set (10%)

In [19]:
# Destination folders for the train and test splits.
output_train_path = "./data/train"
output_test_path = "./data/test"

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of testing os.path.exists first.
for split_dir in (output_train_path, output_test_path):
    os.makedirs(split_dir, exist_ok=True)

In [None]:
data_dir = "./data/"

# Count only the parquet files: "./data/" also contains the train/ and test/
# sub-folders, which must not be counted as samples. Sorting gives the
# shuffle below a listing-order-independent starting point.
files = sorted(f for f in os.listdir(data_dir) if f.endswith(".parquet"))
print("Length of files", len(files))

# One filename prefix per class (files are named <class>_<index>.parquet).
class_prefixes = ["L_", "O_", "T_", "one_", "two_", "three_"]
num_classes = len(class_prefixes)

# split 90%/10% whole dataset
train_length = int(0.9 * len(files)) - 4  # the -4 is custom added to ensure divisible by num classes
test_length = len(files) - train_length
if test_length % num_classes != 0:
    raise ValueError(
        f"test_length ({test_length}) is not divisible by the number of "
        f"classes ({num_classes}); adjust the train_length offset"
    )

# get per class split
class_test_length = test_length // num_classes


def split(file_id):
    """Randomly split the files of one class into train and test subsets.

    Args:
        file_id: Filename prefix identifying the class, e.g. "L_".

    Returns:
        Tuple ``(train, test)`` of numpy string arrays. ``test`` holds
        ``class_test_length`` randomly chosen filenames, ``train`` the rest.

    Raises:
        ValueError: If the class has fewer than ``class_test_length`` files,
            which would otherwise silently yield an undersized test set.
    """
    sub_files = np.array([f for f in files if f.startswith(file_id)])
    if len(sub_files) < class_test_length:
        raise ValueError(
            f"class {file_id!r} has only {len(sub_files)} files, "
            f"need at least {class_test_length} for the test set"
        )
    # NOTE: uses the global numpy RNG; seed it beforehand for a
    # reproducible split.
    np.random.shuffle(sub_files)
    test = sub_files[:class_test_length]
    train = sub_files[class_test_length:]
    return train, test


# split files
L_train, L_test = split("L_")
O_train, O_test = split("O_")
T_train, T_test = split("T_")
one_train, one_test = split("one_")
two_train, two_test = split("two_")
three_train, three_test = split("three_")

# combine files
train = np.concatenate((L_train, O_train, T_train, one_train, two_train, three_train))
test = np.concatenate((L_test, O_test, T_test, one_test, two_test, three_test))

print(len(train))
print(len(test))

# move files into their split folders
for dest_dir, split_files in (("./data/train/", train), ("./data/test/", test)):
    os.makedirs(dest_dir, exist_ok=True)
    for file in split_files:
        src_path = os.path.join(data_dir, file)
        dest_path = os.path.join(dest_dir, file)
        shutil.move(src_path, dest_path)

print("Train files ", len(os.listdir("./data/train")))
print("Test files ", len(os.listdir("./data/test")))

## Dataset breakdown

Total dataset: 14,351

Train: 12,911 - [921 x L, 751 x T, 320 x O, 3915 x one, 4703 x two, 2301 x three]

Test: 1,440 - [240 of each]

## Add in Grid class 

Initially we missed that the dataset also contains a class called particle grid, so we need to add it in.

We want to keep the existing test set intact and only add to it, to avoid data leakage. We will do the same for the train set.
- We need to go through all the existing train/test files and change the gt_label_map to include a label for 6: "grid"

Then we need to process the grid images and
1. Take 240 images and move to the test set
2. Put the remaining in train

In [28]:
# Class names in label-index order; "grid" is appended as class index 6.
_class_names_with_grid = ("one", "two", "three", "T", "O", "L", "grid")

# Build the index -> name map and immediately rebind gt_label_map to its
# UTF-8 encoded JSON form (json.dumps turns the integer keys into strings).
gt_label_map = dict(enumerate(_class_names_with_grid))
gt_label_map = json.dumps(gt_label_map).encode("utf-8")

### Change gt label map for all files

In [35]:
train_folder = "./data/train"
test_folder = "./data/test"

train_files = os.listdir(train_folder)
test_files = os.listdir(test_folder)

# Rewrite every already-split parquet file in place so its schema metadata
# carries the current gt_label_map. All other metadata entries (including
# each file's gt_label) are left untouched.
for folder, file_names in ((train_folder, train_files), (test_folder, test_files)):
    for file in file_names:
        # Skip anything that is not a parquet file (e.g. hidden OS files).
        if not file.endswith(".parquet"):
            continue
        path = os.path.join(folder, file)
        table = pq.read_table(path)

        # schema.metadata is None when a file has no metadata; fall back to
        # an empty dict so the assignment below cannot fail.
        metadata = dict(table.schema.metadata or {})
        # Keys read back from the schema are bytes, hence the bytes key.
        metadata[b"gt_label_map"] = gt_label_map
        table = table.replace_schema_metadata(metadata)
        pq.write_table(table, path)

### Process grid images
RUN FROM HERE

In [None]:
## Create folder for the preprocessed grid files
output_file_path = "./data_grid/"

if not os.path.exists(output_file_path):
    os.makedirs(output_file_path)

## Load in the separately imaged grid particles
f = "Digits/Imaged_Separately/Particles_Grid"

mat = loadmat(f)

# Row 0 of mat["Particles"] holds one record per particle.
grid_particles = mat["Particles"][0, :]
num_particles = len(grid_particles)
print(num_particles)
for i, particle in enumerate(grid_particles):
    # Unwrap the nested record; field 0 is used as the positions and
    # field 1 as the per-localisation sigma.
    record = particle[0][0]
    mat_pos, mat_sigma = record[0], record[1]

    x, y = (pa.array(mat_pos[:, col]) for col in (0, 1))
    sigma = pa.array(mat_sigma[:, 0])
    # Every localisation is assigned channel 0.
    channel = pa.array([0] * mat_pos.shape[0])
    table = pa.table({"x": x, "y": y, "sigma": sigma, "channel": channel})

    # Start from the new entries (label index 6 is written as the string
    # "6"), then let any metadata already on the table take precedence.
    merged_metadata = {
        "gt_label": "6",
        "gt_label_map": gt_label_map,
    }
    merged_metadata.update(table.schema.metadata or {})
    table = table.replace_schema_metadata(merged_metadata)

    # One parquet file per particle, named grid_<particle index>.
    file = f"grid_{i}.parquet"
    save_loc =  f'{output_file_path}/{file}'
    pq.write_table(table, save_loc)

### Split grid images into train/test

In [None]:
grid_dir = "./data_grid/"

# Consider only parquet files; sorting gives the shuffle below a
# listing-order-independent starting point.
files = sorted(f for f in os.listdir(grid_dir) if f.endswith(".parquet"))
print("Length of files", len(files))

# Fixed-size split: 240 grid files go to the test set (the same per-class
# count as the existing test set), everything else goes to train.
test_length = 240
if len(files) < test_length:
    raise ValueError(
        f"need at least {test_length} grid files for the test set, "
        f"found {len(files)}"
    )
train_length = len(files) - test_length

# split files
# NOTE: uses the global numpy RNG; seed it beforehand for a reproducible split.
np.random.shuffle(files)
test_files = files[:test_length]
print(len(test_files))
train_files = files[test_length:]
print(len(train_files))

# move files into the existing train/test folders
for dest_dir, split_files in (("./data/train/", train_files), ("./data/test/", test_files)):
    for file in split_files:
        src_path = os.path.join(grid_dir, file)
        dest_path = os.path.join(dest_dir, file)
        shutil.move(src_path, dest_path)

print("Train files ", len(os.listdir("./data/train")))
print("Test files ", len(os.listdir("./data/test")))

## Dataset breakdown

Total dataset: 22,047

Train: 20,367 - [921 x L, 751 x T, 320 x O, 3915 x one, 4703 x two, 2301 x three, 7456 x grid]

Test: 1,680 - [240 of each]