In [1]:
import numpy as np
import pandas as pd
import h5py
import time 

In [37]:
#combining all the different filters

def get_2D_data_from_h5_filtered(h5_path, part_name, Slice_name, mode):
    #Step 1: getting the data from the h5
    start_time = time.time()
    with h5py.File(h5_path,'r') as h5:
        X_Axis = h5[part_name][Slice_name]['X-Axis']
        Y_Axis = h5[part_name][Slice_name]['Y-Axis']
        Area = h5[part_name][Slice_name]['Area']
        Intensity = h5[part_name][Slice_name]['Intensity']

        X_Axis_size = X_Axis.size
        Y_Axis_size = Y_Axis.size
        Area_size = Area.size
        Intensity_size = Intensity.size

        #if dimensions aren't equal the following code block is entered
        if not X_Axis_size == Y_Axis_size == Area_size == Intensity_size:

            #determine the lowest value among the different sizes
            size_arr = np.array([X_Axis_size, Y_Axis_size, Area_size, Intensity_size])
            min_size = size_arr.min()

            if X_Axis_size != min_size:
                diff_size_x = X_Axis_size - min_size #calculating the difference between the actual value and the minimum and substracting it from the array
                X_Axis_new = np.delete(X_Axis, -diff_size_x)
                X_Axis = X_Axis_new
                X_Axis_size = X_Axis.size

            if Y_Axis_size != min_size:
                diff_size_y = Y_Axis_size - min_size
                Y_Axis_new = np.delete(Y_Axis, -diff_size_y)
                Y_Axis = Y_Axis_new
                Y_Axis_size = Y_Axis.size

            if Area_size != min_size:
                diff_size_area = Area_size - min_size
                Area_new = np.delete(Area, -diff_size_area)
                Area = Area_new
                Area_size = Area.size

            if Intensity_size != min_size:
                diff_size_intensity = Intensity_size - min_size
                Intensity_new = np.delete(Intensity, -diff_size_intensity)
                Intensity = Intensity_new
                Intensity_size = Intensity.size


        #by reducing all the dimensions to the minimum equal dimensions are guaranteed
        #there is a risk of deleting more than just one datapoint without noticing -> maybe add an alert after more than 5(?) while iterations
        help_arr = np.column_stack((X_Axis, Y_Axis, Area, Intensity))
        df_raw = pd.DataFrame(help_arr, columns=['x','y','area','intensity'])
        
    #Step 2: change floats to ints and remove duplicates
    df_int = df_raw.astype(int).drop_duplicates()
    
    #remove all rows with 0 for area and intensity
    df_int = df_int.loc[(df_int['area'] != 0) & (df_int['intensity'] != 0)]
    
    
    #Step 3: Get a df with all the rows where a certain x,y combination occurs multiple times 
    df_multi_xy = df_int[df_int.duplicated(['x','y'], keep = False)]
    
    #Step 4: get a new df out of df_multi_xy with x,y and mean/max of area and intensity for all x,y occurences
    df_compact = pd.DataFrame(columns=['x','y','area','intensity']) #initialize df_compact
    
    print("vor iterieren %s seconds ---" % (time.time() - start_time))
    for ind in range (df_multi_xy.shape[0]):
        if mode == 'mean':
            area_mean = df_multi_xy.loc[(df_multi_xy['x']== df_multi_xy.iloc[ind]['x']) & (df_multi_xy['y'] == df_multi_xy.iloc[ind]['y'])]['area'].mean().astype(int)
            intensity_mean = df_multi_xy.loc[(df_multi_xy['x']== df_multi_xy.iloc[ind]['x']) & (df_multi_xy['y'] == df_multi_xy.iloc[ind]['y'])]['intensity'].mean().astype(int)
            df_compact = df_compact.append({'x': df_multi_xy.iloc[ind]['x'], 'y':df_multi_xy.iloc[ind]['y'], 'area':area_mean , 'intensity':intensity_mean}, ignore_index=True)
        if mode == 'max':
            area_max = df_multi_xy.loc[(df_multi_xy['x']== df_multi_xy.iloc[ind]['x']) & (df_multi_xy['y'] == df_multi_xy.iloc[ind]['y'])]['area'].max().astype(int)
            intensity_max = df_multi_xy.loc[(df_multi_xy['x']== df_multi_xy.iloc[ind]['x']) & (df_multi_xy['y'] == df_multi_xy.iloc[ind]['y'])]['intensity'].max().astype(int)
            df_compact = df_compact.append({'x': df_multi_xy.iloc[ind]['x'], 'y':df_multi_xy.iloc[ind]['y'], 'area':area_max , 'intensity':intensity_max}, ignore_index=True)
    df_compact = df_compact.drop_duplicates()
    
    #Step 5: remove df_multi_xy from df_int and append df_compact
    df_multi_xy_removed = df_int.drop(df_int[df_int.duplicated(['x','y'], keep = False)].index)
    
    df_final = df_multi_xy_removed.append(df_compact)
    print("--- %s seconds ---" % (time.time() - start_time))
    return (df_final)
    

In [38]:
get_2D_data_from_h5_filtered('/home/jan/Documents/CodeTDMStoHDF/Ausgangsdaten/examplerRun.h5', '0_00003_Canti3_cls', 'Slice00001','mean')

vor iterieren 0.05354189872741699 seconds ---
--- 80.88160824775696 seconds ---


Unnamed: 0,x,y,area,intensity
980,9564,-24983,349,619
983,9590,-24983,329,704
986,9612,-24983,294,666
987,9621,-24983,314,645
989,9637,-24983,287,619
...,...,...,...,...
16975,9768,-19584,373,540
16981,9567,-19744,248,518
16987,9688,-19584,407,527
16994,9567,-19664,369,595


In [17]:
#little tryout
test_df = pd.DataFrame(columns=['x','y','c','d'])
test_df = test_df.append({'x': 1, 'y':1, 'c':123 , 'd':123}, ignore_index=True)
test_df = test_df.append({'x': 1, 'y':1, 'c':123 , 'd':123}, ignore_index=True)
test_df = test_df.append({'x': 1, 'y':1, 'c':123 , 'd':123}, ignore_index=True)
test_df = test_df.append({'x': 2, 'y':2, 'c':123 , 'd':123}, ignore_index=True)
test_df = test_df.append({'x': 1, 'y':1, 'c':123 , 'd':123}, ignore_index=True)

test_df

Unnamed: 0,x,y,c,d
0,1,1,123,123
1,1,1,123,123
2,1,1,123,123
3,2,2,123,123
4,1,1,123,123


In [18]:
test_df[test_df.duplicated(['x','y'], keep = False)].index

Int64Index([0, 1, 2, 4], dtype='int64')

In [19]:
test_df.drop(test_df[test_df.duplicated(['x','y'], keep = False)].index)

Unnamed: 0,x,y,c,d
3,2,2,123,123


In [None]:
#https://stackoverflow.com/questions/20190668/multiprocessing-a-for-loop