In [1]:
import numpy as np
import pandas as pd
import h5py

In [2]:
def get_2D_data_from_h5_with_dimension_check(h5_path, part_name, Slice_name):
    with h5py.File(h5_path,'r') as h5:
        X_Axis = h5[part_name][Slice_name]['X-Axis']
        Y_Axis = h5[part_name][Slice_name]['Y-Axis']
        Area = h5[part_name][Slice_name]['Area']
        Intensity = h5[part_name][Slice_name]['Intensity']

        X_Axis_size = X_Axis.size
        Y_Axis_size = Y_Axis.size
        Area_size = Area.size
        Intensity_size = Intensity.size

        #if dimensions aren't equal the following code block is entered
        if not X_Axis_size == Y_Axis_size == Area_size == Intensity_size:

            #determine the lowest value among the different sizes
            size_arr = np.array([X_Axis_size, Y_Axis_size, Area_size, Intensity_size])
            min_size = size_arr.min()

            if X_Axis_size != min_size:
                diff_size_x = X_Axis_size - min_size #calculating the difference between the actual value and the minimum and substracting it from the array
                X_Axis_new = np.delete(X_Axis, -diff_size_x)
                X_Axis = X_Axis_new
                X_Axis_size = X_Axis.size

            if Y_Axis_size != min_size:
                diff_size_y = Y_Axis_size - min_size
                Y_Axis_new = np.delete(Y_Axis, -diff_size_y)
                Y_Axis = Y_Axis_new
                Y_Axis_size = Y_Axis.size

            if Area_size != min_size:
                diff_size_area = Area_size - min_size
                Area_new = np.delete(Area, -diff_size_area)
                Area = Area_new
                Area_size = Area.size

            if Intensity_size != min_size:
                diff_size_intensity = Intensity_size - min_size
                Intensity_new = np.delete(Intensity, -diff_size_intensity)
                Intensity = Intensity_new
                Intensity_size = Intensity.size


        #by reducing all the dimensions to the minimum equal dimensions are guaranteed
        #there is a risk of deleting more than just one datapoint without noticing -> maybe add an alert after more than 5(?) while iterations
        help_arr = np.column_stack((X_Axis, Y_Axis, Area, Intensity))
        df = pd.DataFrame(help_arr, columns=['x','y','area','intensity'])
        return df

In [3]:
df_undocked = get_2D_data_from_h5_with_dimension_check('/home/jan/Documents/CodeTDMStoHDF/Ausgangsdaten/examplerRun.h5', '0_00003_Canti3_cls', 'Slice00001')

In [4]:
def get_true_min_maxX (h5_path, part_name, max_slice_number):

    minX = []
    maxX = []
    for num_slice in range(max_slice_number):
        with h5py.File(h5_path,'r') as h5:
            X_Axis = h5[part_name]['Slice'+str("{:05d}".format(num_slice+1))]['X-Axis']
            x_axis_array = np.array(X_Axis)
            minX.append(x_axis_array.min())
            maxX.append(x_axis_array.max())
    minX_array = np.asarray(minX)
    maxX_array = np.asarray(maxX)
    return minX_array.min(), maxX_array.max()

In [5]:
def get_true_min_maxY (h5_path, part_name, max_slice_number):

    minY = []
    maxY = []
    for num_slice in range(max_slice_number):
        with h5py.File(h5_path,'r') as h5:
            Y_Axis = h5[part_name]['Slice'+str("{:05d}".format(num_slice+1))]['Y-Axis']
            y_axis_array = np.array(Y_Axis)
            minY.append(y_axis_array.min())
            maxY.append(y_axis_array.max())
    minY_array = np.asarray(minY)
    maxY_array = np.asarray(maxY)
    return minY_array.min(), maxY_array.max()

In [6]:
minX = get_true_min_maxX('/home/jan/Documents/CodeTDMStoHDF/Ausgangsdaten/examplerRun.h5', '0_00003_Canti3_cls', 142)[0]
minY = get_true_min_maxY('/home/jan/Documents/CodeTDMStoHDF/Ausgangsdaten/examplerRun.h5', '0_00003_Canti3_cls', 142)[0]

In [7]:
def dock_df_to_zero(df, minX, minY):
    if minX >= 0 and minY >=0:
        df['x'] = df['x'] - minX
        df['y'] = df['y'] - minY
    elif minX < 0 and minY <0:
        df['x'] = df['x'] + abs(minX)
        df['y'] = df['y'] + abs(minY)
    elif minX >= 0 and minY <0:
        df['x'] = df['x'] - minX
        df['y'] = df['y'] + abs(minY)
    elif minX < 0 and min >= 0:
        df['x'] = df['x'] + abs(minX)
        df['y'] = df['y'] - minY
    return df

In [8]:
df_docked = dock_df_to_zero(df_undocked, minX, minY)
df_docked.describe()

Unnamed: 0,x,y,area,intensity
count,143201.0,143201.0,143201.0,143201.0
mean,4232.300781,4432.887207,144.456787,299.727081
std,544.545471,2567.269775,145.760574,287.940796
min,0.0,0.0,0.0,0.0
25%,3854.0,2287.0,0.0,0.0
50%,4254.0,4303.0,181.0,493.0
75%,4653.0,6583.0,237.0,553.0
max,4946.0,8996.0,1080.0,3023.0


In [15]:
def fill_2D_voxel_area (voxel_size, num_voxels_x, num_voxels_y, df, filling_method):
    counter = 0
    array_area = np.zeros([voxel_size,voxel_size]) #creating an empty array of dimensions voxel_size*voxel_size
    for i in range(voxel_size*num_voxels_x, voxel_size*(num_voxels_x+1)): #iterating over x
        for j in range(voxel_size*num_voxels_y,voxel_size*(num_voxels_y+1)): #iterating over y
            
            print('i: ' + str(i))
            print('j: ' + str(j))

            if df[(df['x'] == i) & (df['y'] == j)].shape[0] == 1: #here subset of the original dataframe is created an filtrered --> shape[0] returns the number of rows of this df subset
                #finding the area value for a certain point in the part-data-dataframe and allocating it to a position in the array
                area_i = df.loc[(df['x'] == i) & (df['y'] == j)]
                array_area[i-num_voxels_x*voxel_size][j-num_voxels_y*voxel_size] = area_i['Area']

            elif df[(df['x'] == i) & (df['y'] == j)].shape[0] > 1:
                #if there are more values than just one the maximum value is used for the voxel-datapoint; other methods of dealing with multiple values need to be considered
                array_area[i-num_voxels_x*voxel_size][j-num_voxels_y*voxel_size] = df[(df['x'] == i) & (df['y'] == j)]['area'].max()


            elif df[(df['x'] == i) & (df['y'] == j)].shape[0] == 0 and filling_method == 'Zeros':
                array_area[i-num_voxels_x*voxel_size][j-num_voxels_y*voxel_size] = 0

            #elif filling_method == 'Mean': #with this method all the missing datapoints are getting filled with the mean of the non-missing datapoints
             #   array_area[i][j] =

    return array_area
            #theoretisch nach 1145 Schluss, praktisch läuft er immer weiter

In [16]:
fill_2D_voxel_area(20, 2, 2, df_docked, 'Zeros')

i: 40
j: 40
i: 40
j: 41
i: 40
j: 42
i: 40
j: 43
i: 40
j: 44
i: 40
j: 45
i: 40
j: 46
i: 40
j: 47
i: 40
j: 48
i: 40
j: 49
i: 40
j: 50
i: 40
j: 51
i: 40
j: 52
i: 40
j: 53
i: 40
j: 54
i: 40
j: 55
i: 40
j: 56
i: 40
j: 57
i: 40
j: 58
i: 40
j: 59
i: 41
j: 40
i: 41
j: 41
i: 41
j: 42
i: 41
j: 43
i: 41
j: 44
i: 41
j: 45
i: 41
j: 46
i: 41
j: 47
i: 41
j: 48
i: 41
j: 49
i: 41
j: 50
i: 41
j: 51
i: 41
j: 52
i: 41
j: 53
i: 41
j: 54
i: 41
j: 55
i: 41
j: 56
i: 41
j: 57
i: 41
j: 58
i: 41
j: 59
i: 42
j: 40
i: 42
j: 41
i: 42
j: 42
i: 42
j: 43
i: 42
j: 44
i: 42
j: 45
i: 42
j: 46
i: 42
j: 47
i: 42
j: 48
i: 42
j: 49
i: 42
j: 50
i: 42
j: 51
i: 42
j: 52
i: 42
j: 53
i: 42
j: 54
i: 42
j: 55
i: 42
j: 56
i: 42
j: 57
i: 42
j: 58
i: 42
j: 59
i: 43
j: 40
i: 43
j: 41
i: 43
j: 42
i: 43
j: 43
i: 43
j: 44
i: 43
j: 45
i: 43
j: 46
i: 43
j: 47
i: 43
j: 48
i: 43
j: 49
i: 43
j: 50
i: 43
j: 51
i: 43
j: 52
i: 43
j: 53
i: 43
j: 54
i: 43
j: 55
i: 43
j: 56
i: 43
j: 57
i: 43
j: 58
i: 43
j: 59
i: 44
j: 40
i: 44
j: 41
i: 44
j: 42
i: 4

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.

In [9]:
df_docked

Unnamed: 0,x,y,area,intensity
0,0.0,8977.0,0.0,0.0
1,0.0,8977.0,0.0,0.0
2,0.0,8977.0,0.0,0.0
3,0.0,8977.0,0.0,0.0
4,0.0,8977.0,0.0,0.0
...,...,...,...,...
143196,3528.0,8977.0,0.0,0.0
143197,3528.0,8977.0,0.0,0.0
143198,3528.0,8977.0,0.0,0.0
143199,3528.0,8977.0,0.0,0.0


In [17]:
 #Syntax: DataFrame.drop_duplicates(subset=None, keep=’first’, inplace=False)
df_1 = df_docked.drop_duplicates()    
df_1

Unnamed: 0,x,y,area,intensity
0,0.0,8977.0,0.0,0.0
655,18.0,8950.0,0.0,0.0
656,29.0,8932.0,0.0,0.0
657,46.0,8905.0,0.0,0.0
658,58.0,8888.0,0.0,0.0
...,...,...,...,...
142958,3528.0,8977.0,162.0,0.0
142959,3528.0,8977.0,324.0,0.0
142960,3528.0,8977.0,266.0,0.0
142961,3528.0,8977.0,208.0,0.0


In [19]:
#Alle Werte, wo area und intensity gleich 0 sind auch noch rauskicken
df_2 = df_1.drop(df_1[(df_1.area == 0) and (df_1.intensity ==0)].index)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [21]:
#df = df[df.score > 50]

df_1 = df_1[(df_1.area != 0) and (df_1.intensity !=0)]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [31]:
# delete all rows for which column 'Age' has value greater than 30 and Country is India 
#indexNames = dfObj[ (dfObj['Age'] >= 30) & (dfObj['Country'] == 'India') ].index
#dfObj.drop(indexNames , inplace=True)

indextodell_area = df_1[(df_1['area']==0)].index

In [32]:
indextodell_area

Int64Index([     0,    655,    656,    657,    658,    659,    660,    661,
               662,    663,
            ...
            142893, 142894, 142895, 142913, 142926, 142927, 142952, 142955,
            142956, 142957],
           dtype='int64', length=13168)

In [33]:
indextodell_intensity =  df_1[(df_1['intensity']==0)].index

In [34]:
indextodell_intensity

Int64Index([     0,    655,    656,    657,    658,    659,    660,    661,
               662,    663,
            ...
            142912, 142913, 142926, 142927, 142952, 142958, 142959, 142960,
            142961, 142963],
           dtype='int64', length=16422)

In [39]:
df_1.loc[(df_1['area'] != 0) & (df_1['intensity'] != 0) ]

Unnamed: 0,x,y,area,intensity
978,3510.0,3578.0,157.0,677.0
979,3519.0,3578.0,315.0,606.0
980,3525.0,3578.0,349.0,619.0
981,3535.0,3578.0,384.0,618.0
982,3541.0,3578.0,384.0,669.0
...,...,...,...,...
142903,3569.0,8977.0,206.0,508.0
142904,3569.0,8977.0,288.0,518.0
142905,3569.0,8977.0,370.0,501.0
142906,3569.0,8977.0,370.0,519.0


In [37]:
rslt_df = df_1.loc[(df_1['area'] != 0) & (df_1['intensity'] != 0)] 

In [38]:
rslt_df

Unnamed: 0,x,y,area,intensity
978,3510.0,3578.0,157.0,677.0
979,3519.0,3578.0,315.0,606.0
980,3525.0,3578.0,349.0,619.0
981,3535.0,3578.0,384.0,618.0
982,3541.0,3578.0,384.0,669.0
...,...,...,...,...
142903,3569.0,8977.0,206.0,508.0
142904,3569.0,8977.0,288.0,518.0
142905,3569.0,8977.0,370.0,501.0
142906,3569.0,8977.0,370.0,519.0


In [43]:
final_df = rslt_df.astype(int)
final_df

Unnamed: 0,x,y,area,intensity
978,3510,3578,157,677
979,3519,3578,315,606
980,3525,3578,349,619
981,3535,3578,384,618
982,3541,3578,384,669
...,...,...,...,...
142903,3569,8977,206,508
142904,3569,8977,288,518
142905,3569,8977,370,501
142906,3569,8977,370,519


In [46]:
((final_df['x'] == 3510) & (final_df['y'] == 3578)).any()

True

In [47]:
final_df.loc[(final_df['x']== 3569) & (final_df['y'] == 8977)]

#df_1.loc[(df_1['area'] != 0) & (df_1['intensity'] != 0)]

Unnamed: 0,x,y,area,intensity
142902,3569,8977,238,555
142903,3569,8977,206,508
142904,3569,8977,288,518
142905,3569,8977,370,501
142906,3569,8977,370,519
142907,3569,8977,341,525


In [50]:
#build a dataframe containing the maximum values of both columns in one row and dropping all the other rows 
# erst checken, ob mehrere Datenpunkte vorhanden sind
# dann max area herausfiltern
# dann max intensity herausfiltern 
# neuen Eintrag machen und alte Einträge löschen 

final_df.loc[(final_df['x']== 3569) & (final_df['y'] == 8977)].shape[0] # <- so kriegt man die Anzahl an Reihen raus


6

In [57]:
area = final_df.loc[(final_df['x']== 3569) & (final_df['y'] == 8977)]['area'].max()

In [58]:
intensity = final_df.loc[(final_df['x']== 3569) & (final_df['y'] == 8977)]['intensity'].max()

In [55]:
#zuerst die alten Zeilen rauskicken, dann eine neue einfügen 
final_df_2 = final_df.loc[(final_df['x']!= 3569) & (final_df['y'] != 8977)]

In [60]:
final_df_2.append({'area':area , 'intensity':intensity}, ignore_index=True)

#modDfObj = dfObj.append({'Name' : 'Sahil' , 'Age' : 22} , ignore_index=True)

Unnamed: 0,x,y,area,intensity
0,3510.0,3578.0,157.0,677.0
1,3519.0,3578.0,315.0,606.0
2,3525.0,3578.0,349.0,619.0
3,3535.0,3578.0,384.0,618.0
4,3541.0,3578.0,384.0,669.0
...,...,...,...,...
73917,3556.0,8965.0,313.0,533.0
73918,3561.0,8969.0,291.0,503.0
73919,3563.0,8972.0,270.0,503.0
73920,3568.0,8976.0,270.0,502.0
