In [1]:
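# Plan for this notebook:
# 1. Normalize the coordinate system of each fish.
# 2. Map neurons to a limited number of 3D areas.
# 3. Map each 3D area to the component(s) it is loaded on.
#    TODO: decide whether these per-area loadings should be normalized somehow.
# 4. Compare the resulting arrays.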

In [2]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Root data folder; the two sub-folders below are derived from it.
data_dir = '../data/'
# Folder scanned for '*_loading_map.pickle' input files.
loading_component_dir = f'{data_dir}data_synthesized_pickled/'
# Folder the resulting dictionary is pickled into.
meta_dir = f'{data_dir}data_meta_pickled/'

In [4]:
def load_fish_datasets():
    """Load every loading-map pickle found in ``loading_component_dir``.

    A directory entry is picked up when its name contains
    '_loading_map.pickle'. Entries are processed in sorted name order because
    ``os.listdir`` returns them in arbitrary order, which would otherwise make
    the order of the result (and of anything built from it) vary between runs.

    Returns:
        list of ``(loaded_object, prefix)`` tuples, where ``prefix`` is the
        part of the file name that precedes 'loading_map'.
    """
    file_path_list = sorted(
        path
        for path in os.listdir(path=loading_component_dir)
        if '_loading_map.pickle' in path
    )
    return [
        (open_file(file_path), file_path.split('loading_map')[0])
        for file_path in file_path_list
    ]

In [5]:
def open_file(file_path):
    """Unpickle and return the object stored in ``loading_component_dir + file_path``.

    NOTE: ``pickle.load`` can execute arbitrary code while loading, so this
    should only be pointed at files from a trusted source.
    """
    full_path = loading_component_dir + file_path
    with open(full_path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded

In [6]:
def pickle_file(file):
    """Pickle ``file`` to 'component_geographical_dist_dict.pickle' inside ``meta_dir``.

    An existing file at that location is overwritten (opened in 'wb' mode).
    """
    target_path = meta_dir + 'component_geographical_dist_dict.pickle'
    with open(target_path, 'wb') as out_handle:
        pickle.dump(file, out_handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# Number of unit-wide bins each normalized axis is divided into.
# Read by normalize_fish_coordinates (axis scaling) and by
# calculate_coordinate_loadings (size of the result grid).
num_x_steps = 40
num_y_steps = 20
num_z_steps = 2

def normalize_coordinate_axis(data, step_num):
    """Center one coordinate axis on zero and rescale it to ``step_num`` unit steps.

    After normalization the axis minimum maps to ``-step_num / 2`` and the
    axis maximum to ``+step_num / 2``, so one unit corresponds to one step.

    Args:
        data: pandas Series holding the raw coordinates of one axis.
        step_num: number of steps the observed range is divided into.

    Returns:
        A new Series with the normalized coordinates; ``data`` is not modified.
        If all values are identical (zero range) the centered values, i.e.
        zeros, are returned instead of dividing by zero.
    """
    data_min = data.min()
    data_max = data.max()
    # Midpoint of the observed range; the sum must be parenthesized before
    # halving, otherwise only data_min is halved and the axis is shifted.
    data_center = (data_max + data_min) / 2
    data_step_size = (data_max - data_min) / step_num
    centered = data.sub(data_center)
    if data_step_size == 0:
        # Degenerate axis: every point already sits on the center.
        return centered
    return centered.div(data_step_size)

def normalize_fish_coordinates(input_df):
    """Rescale the 'x', 'y' and 'z' columns of ``input_df`` onto the bin grid.

    Each column is overwritten in place with the result of
    ``normalize_coordinate_axis`` using ``num_x_steps``, ``num_y_steps`` and
    ``num_z_steps`` respectively. The same (mutated) frame is returned.
    """
    steps_per_axis = (
        ('x', num_x_steps),
        ('y', num_y_steps),
        ('z', num_z_steps),
    )
    for axis_name, axis_steps in steps_per_axis:
        input_df[axis_name] = normalize_coordinate_axis(input_df[axis_name], axis_steps)
    return input_df

In [8]:
# Minimum number of neurons a grid cell must contain to get non-zero loadings.
min_neurons_to_construct_area = 75


def _bin_mask(values, lower, is_last_bin):
    """Boolean mask of ``values`` inside the unit-wide bin starting at ``lower``."""
    if is_last_bin:
        # Close the outermost bin on the right so that points lying exactly on
        # the axis maximum are not dropped from every bin.
        return (values >= lower) & (values <= lower + 1)
    return (values >= lower) & (values < lower + 1)


def calculate_coordinate_loadings(input_df, num_components=10, min_neurons=None, grid_steps=None):
    """Compute, per grid cell, the share of neurons assigned to each component.

    The normalized space is cut into unit-wide bins along x, y and z. For every
    cell holding at least ``min_neurons`` rows, the rows are counted per value
    of the 'loadings' column and the counts are divided by their sum. Cells
    with fewer rows keep the value 0.

    Args:
        input_df: DataFrame with columns 'x', 'y', 'z' (normalized coordinates)
            and 'loadings'. Counts are placed by sorted label position, so
            'loadings' is expected to hold labels in
            ``0 .. num_components - 1`` -- TODO confirm against the producer
            of the loading maps.
        num_components: number of components, i.e. number of returned frames.
        min_neurons: minimum rows per cell; defaults to
            ``min_neurons_to_construct_area``.
        grid_steps: ``(x_steps, y_steps, z_steps)``; defaults to
            ``(num_x_steps, num_y_steps, num_z_steps)``. Even step counts are
            expected so the bins are symmetric around zero.

    Returns:
        list of ``num_components`` DataFrames, one per component, indexed by
        ('y', 'z') bin start with one column per x bin start.
    """
    if min_neurons is None:
        min_neurons = min_neurons_to_construct_area
    if grid_steps is None:
        grid_steps = (num_x_steps, num_y_steps, num_z_steps)
    x_steps, y_steps, z_steps = grid_steps

    # Lower edges of the bins along every axis.
    x_bins = range(int(-x_steps / 2), int(x_steps / 2))
    y_bins = range(int(-y_steps / 2), int(y_steps / 2))
    z_bins = range(int(-z_steps / 2), int(z_steps / 2))

    # Zero-filled template: rows are (y, z) pairs with y cycling fastest.
    coord_df = pd.DataFrame(
        np.zeros((len(y_bins) * len(z_bins), len(x_bins))),
        columns=x_bins
    )
    coord_df['y'] = list(y_bins) * len(z_bins)
    coord_df['z'] = np.repeat(z_bins, len(y_bins))
    coord_df = coord_df.set_index(['y', 'z'])
    coord_df_component_array = [coord_df.copy() for _ in range(num_components)]

    # Added to the per-cell counts so that absent components show up as 0.
    empty_component_series = pd.Series(np.zeros(num_components), index=range(num_components))

    # Narrow the frame axis by axis so each inner filter scans fewer rows.
    for z in z_bins:
        z_view = input_df[_bin_mask(input_df['z'], z, z == z_bins[-1])]
        for y in y_bins:
            y_view = z_view[_bin_mask(z_view['y'], y, y == y_bins[-1])]
            for x in x_bins:
                x_view = y_view[_bin_mask(y_view['x'], x, x == x_bins[-1])]
                if x_view.shape[0] < min_neurons:
                    # Too few neurons: the cell keeps its initial zeros.
                    continue
                loading_counts = (
                    x_view.groupby(['loadings']).count()['x']
                    .add(empty_component_series, fill_value=0)
                    .to_numpy()
                )
                norm_component_loadings = np.divide(loading_counts, np.sum(loading_counts))
                for component, share in enumerate(norm_component_loadings):
                    coord_df_component_array[component].at[(y, z), x] = share
    return coord_df_component_array

In [9]:
# Build one grid per (file prefix, component) pair and persist the collection.
datasets = load_fish_datasets()
dataset_dict = {}
for file, path in datasets:
    # Rescales the x/y/z columns of `file` in place.
    normalize_fish_coordinates(file)
    coord_df_component_array = calculate_coordinate_loadings(file)
    for component, component_df in enumerate(coord_df_component_array):
        dataset_dict[f'{path}component_{component}'] = component_df
pickle_file(dataset_dict)

In [10]:
# Display the keys of the result dictionary ('<file prefix>component_<index>').
dataset_dict.keys()

dict_keys(['subject_10_component_0', 'subject_10_component_1', 'subject_10_component_2', 'subject_10_component_3', 'subject_10_component_4', 'subject_10_component_5', 'subject_10_component_6', 'subject_10_component_7', 'subject_10_component_8', 'subject_10_component_9', 'subject_10_stimulus_0_component_0', 'subject_10_stimulus_0_component_1', 'subject_10_stimulus_0_component_2', 'subject_10_stimulus_0_component_3', 'subject_10_stimulus_0_component_4', 'subject_10_stimulus_0_component_5', 'subject_10_stimulus_0_component_6', 'subject_10_stimulus_0_component_7', 'subject_10_stimulus_0_component_8', 'subject_10_stimulus_0_component_9', 'subject_10_stimulus_10_component_0', 'subject_10_stimulus_10_component_1', 'subject_10_stimulus_10_component_2', 'subject_10_stimulus_10_component_3', 'subject_10_stimulus_10_component_4', 'subject_10_stimulus_10_component_5', 'subject_10_stimulus_10_component_6', 'subject_10_stimulus_10_component_7', 'subject_10_stimulus_10_component_8', 'subject_10_stimu