In [5]:
import numpy as np
import h5py
import pandas as pd 
import time 

In [64]:
def get_2D_data_from_h5_filtered(h5_path, part_name, Slice_name, mode):
    #Step 1: getting the data from the h5
    start_time = time.time()
    with h5py.File(h5_path,'r') as h5:
        X_Axis = h5[part_name][Slice_name]['X-Axis']
        Y_Axis = h5[part_name][Slice_name]['Y-Axis']
        Area = h5[part_name][Slice_name]['Area']
        Intensity = h5[part_name][Slice_name]['Intensity']

        X_Axis_size = X_Axis.size
        Y_Axis_size = Y_Axis.size
        Area_size = Area.size
        Intensity_size = Intensity.size

        #if dimensions aren't equal the following code block is entered
        if not X_Axis_size == Y_Axis_size == Area_size == Intensity_size:

            #determine the lowest value among the different sizes
            size_arr = np.array([X_Axis_size, Y_Axis_size, Area_size, Intensity_size])
            min_size = size_arr.min()

            if X_Axis_size != min_size:
                diff_size_x = X_Axis_size - min_size #calculating the difference between the actual value and the minimum and substracting it from the array
                X_Axis_new = np.delete(X_Axis, -diff_size_x)
                X_Axis = X_Axis_new
                X_Axis_size = X_Axis.size

            if Y_Axis_size != min_size:
                diff_size_y = Y_Axis_size - min_size
                Y_Axis_new = np.delete(Y_Axis, -diff_size_y)
                Y_Axis = Y_Axis_new
                Y_Axis_size = Y_Axis.size

            if Area_size != min_size:
                diff_size_area = Area_size - min_size
                Area_new = np.delete(Area, -diff_size_area)
                Area = Area_new
                Area_size = Area.size

            if Intensity_size != min_size:
                diff_size_intensity = Intensity_size - min_size
                Intensity_new = np.delete(Intensity, -diff_size_intensity)
                Intensity = Intensity_new
                Intensity_size = Intensity.size


        #by reducing all the dimensions to the minimum equal dimensions are guaranteed
        #there is a risk of deleting more than just one datapoint without noticing -> maybe add an alert after more than 5(?) while iterations
        help_arr = np.column_stack((X_Axis, Y_Axis, Area, Intensity))
        df_raw = pd.DataFrame(help_arr, columns=['x','y','area','intensity'])

    #Step 2: change floats to ints and remove duplicates
    df_int = df_raw.astype(int).drop_duplicates()

    #remove all rows with 0 for area and intensity
    df_int = df_int.loc[(df_int['area'] != 0) & (df_int['intensity'] != 0)]


    #Step 3: Get a df with all the rows where a certain x,y combination occurs multiple times
    df_multi_xy = df_int[df_int.duplicated(['x','y'], keep = False)].reset_index()
    print(df_multi_xy)

    #Step 4: get a new df out of df_multi_xy with x,y and mean/max of area and intensity for all x,y occurences
    df_compact = pd.DataFrame(columns=['x','y','area','intensity']) #initialize df_compact

    print("vor iterieren %s seconds ---" % (time.time() - start_time))
    for ind in range (df_multi_xy.shape[0]):
        #print(ind)
        if mode == 'mean':
            try:
                area_mean = df_multi_xy.loc[(df_multi_xy['x']== df_multi_xy.iloc[ind]['x']) & (df_multi_xy['y'] == df_multi_xy.iloc[ind]['y'])]['area'].mean().astype(int)
                intensity_mean = df_multi_xy.loc[(df_multi_xy['x']== df_multi_xy.iloc[ind]['x']) & (df_multi_xy['y'] == df_multi_xy.iloc[ind]['y'])]['intensity'].mean().astype(int)
                df_compact = df_compact.append({'x': df_multi_xy.iloc[ind]['x'], 'y':df_multi_xy.iloc[ind]['y'], 'area':area_mean,'intensity':intensity_mean }, ignore_index=True) #, 'intensity':intensity_mean
                new_data_line = pd.DataFrame({'x': df_multi_xy.iloc[ind]['x'], 'y':df_multi_xy.iloc[ind]['y'], 'area':area_mean, 'intensity':intensity_mean}, index=[0])
                #print('df_compact')
                #print(df_compact)
                df_multi_xy = df_multi_xy[(df_multi_xy['x']!= df_multi_xy.iloc[ind]['x']) & (df_multi_xy['y'] != df_multi_xy.iloc[ind]['y'])]
                #print('after step 1')
                #print(test_df)
                df_multi_xy = pd.concat([new_data_line, df_multi_xy.iloc[:]]).reset_index(drop=True)
                #print('after step 2')
                #print(test_df)
            except:
                pass
        if mode == 'max':
            area_max = df_multi_xy.loc[(df_multi_xy['x']== df_multi_xy.iloc[ind]['x']) & (df_multi_xy['y'] == df_multi_xy.iloc[ind]['y'])]['area'].max().astype(int)
            intensity_max = df_multi_xy.loc[(df_multi_xy['x']== df_multi_xy.iloc[ind]['x']) & (df_multi_xy['y'] == df_multi_xy.iloc[ind]['y'])]['intensity'].max().astype(int)
            df_compact = df_compact.append({'x': df_multi_xy.iloc[ind]['x'], 'y':df_multi_xy.iloc[ind]['y'], 'area':area_max , 'intensity':intensity_max}, ignore_index=True)
    df_compact = df_compact.drop_duplicates()

    #Step 5: remove df_multi_xy from df_int and append df_compact
    df_multi_xy_removed = df_int.drop(df_int[df_int.duplicated(['x','y'], keep = False)].index)

    df_final = df_multi_xy_removed.append(df_compact)
    print("df creation took %s seconds ---" % (time.time() - start_time))
    return (df_final)


In [65]:
get_2D_data_from_h5_filtered( '/home/jan/Documents/CodeTDMStoHDF/Ausgangsdaten/examplerRun.h5', '0_00003_Canti3_cls', 'Slice00065', 'mean')

        index      x      y  area  intensity
0        1205  10985 -25024    70        444
1        1206  10985 -25033   141        471
2        1207  10985 -25033   171        536
3        1208  10979 -25033   201        579
4        1211  10957 -25033   232        533
...       ...    ...    ...   ...        ...
18243  142504   9608 -19584   411        641
18244  142505   9608 -19584   362        668
18245  142506   9608 -19584   314        621
18246  142555   9567 -19584   175        555
18247  142556   9567 -19584   351        629

[18248 rows x 5 columns]
vor iterieren 0.04676628112792969 seconds ---


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




df creation took 1.2474944591522217 seconds ---


Unnamed: 0,x,y,area,intensity
1209,10972,-25033,201,564
1210,10963,-25033,216,557
1212,10947,-25033,232,595
1213,10941,-25033,249,527
1214,10931,-25033,266,610
...,...,...,...,...
130,10877,-21234,217,487
131,10889,-20982,358,676
132,10861,-20970,451,804
133,10901,-20730,258,531


In [59]:
#basic idea: reduce the number of iterations 
#in the dataframe with the duplicates - remove a certain x,y occurences directly after the first one has been processed 
data_dict = {'x': [0,0,0,1,1,1,2,2,2,4,4,4,4,5,5,5,5,6,6,6], 'y':[0,0,0,1,1,1,2,2,2,4,4,4,4,5,5,5,5,6,6,6], 'area': [23,34,21,34,56,45,212,432,543,232,332,21,34,345,54,76,87,34,23,87]}
test_df = pd.DataFrame(data = data_dict)

In [46]:
test_df

Unnamed: 0,x,y,area
0,0,0,23
1,0,0,34
2,0,0,21
3,1,1,34
4,1,1,56
5,1,1,45
6,2,2,212
7,2,2,432
8,2,2,543
9,4,4,232


In [47]:
test_df[(test_df['x']!= test_df.iloc[1]['x']) & (test_df['y'] != test_df.iloc[1]['y'])].reset_index(drop=True)

Unnamed: 0,x,y,area
0,1,1,34
1,1,1,56
2,1,1,45
3,2,2,212
4,2,2,432
5,2,2,543
6,4,4,232
7,4,4,332
8,4,4,21
9,4,4,34


In [26]:
df_compact = pd.DataFrame(columns=['x','y','area'])

In [27]:
for ind in range (test_df.shape[0]):
    #print(ind)
    area_mean = test_df.loc[(test_df['x']== test_df.iloc[ind]['x']) & (test_df['y'] == test_df.iloc[ind]['y'])]['area'].mean().astype(int)
    #intensity_mean = test_df.loc[(test_df['x']== test_df.iloc[ind]['x']) & (test_df['y'] == test_df.iloc[ind]['y'])]['intensity'].mean().astype(int)
    df_compact = df_compact.append({'x': test_df.iloc[ind]['x'], 'y':test_df.iloc[ind]['y'], 'area':area_mean }, ignore_index=True) #, 'intensity':intensity_mean

In [None]:
#getting the number of unique x and y combinations 

In [36]:
test_df.drop_duplicates(subset = ['x','y']).shape[0]

6

In [63]:
data_dict = {'x': [0,0,0,1,1,1,2,2,2,4,4,4,4,5,5,5,5,6,6,6], 'y':[0,0,0,1,1,1,2,2,2,4,4,4,4,5,5,5,5,6,6,6], 'area': [23,34,21,34,56,45,212,432,543,232,332,21,34,345,54,76,87,34,23,87]}
#test_df = pd.DataFrame(data = data_dict)
#test_df
df_compact = pd.DataFrame(columns=['x','y','area'])
try:
    for ind in range (test_df.shape[0]): #.drop_duplicates(subset = ['x','y'])
        print('ind ' + str(ind))
        area_mean = test_df.loc[(test_df['x']== test_df.iloc[ind]['x']) & (test_df['y'] == test_df.iloc[ind]['y'])]['area'].mean().astype(int)
        df_compact = df_compact.append({'x': test_df.iloc[ind]['x'], 'y':test_df.iloc[ind]['y'], 'area':area_mean }, ignore_index=True) #, 'intensity':intensity_mean
        new_data_line = pd.DataFrame({'x': test_df.iloc[ind]['x'], 'y':test_df.iloc[ind]['y'], 'area':area_mean }, index=[0])
        print('df_compact')
        print(df_compact)
        test_df = test_df[(test_df['x']!= test_df.iloc[ind]['x']) & (test_df['y'] != test_df.iloc[ind]['y'])]
        print('after step 1')
        print(test_df)
        test_df = pd.concat([new_data_line, test_df.iloc[:]]).reset_index(drop=True)
        print('after step 2')
        print(test_df)
except:
    pass

ind 0
df_compact
   x  y area
0  0  0   26
after step 1
    x  y  area
3   1  1    34
4   1  1    56
5   1  1    45
6   2  2   212
7   2  2   432
8   2  2   543
9   4  4   232
10  4  4   332
11  4  4    21
12  4  4    34
13  5  5   345
14  5  5    54
15  5  5    76
16  5  5    87
17  6  6    34
18  6  6    23
19  6  6    87
after step 2
    x  y  area
0   0  0    26
1   1  1    34
2   1  1    56
3   1  1    45
4   2  2   212
5   2  2   432
6   2  2   543
7   4  4   232
8   4  4   332
9   4  4    21
10  4  4    34
11  5  5   345
12  5  5    54
13  5  5    76
14  5  5    87
15  6  6    34
16  6  6    23
17  6  6    87
ind 1
df_compact
   x  y area
0  0  0   26
1  1  1   45
after step 1
    x  y  area
0   0  0    26
4   2  2   212
5   2  2   432
6   2  2   543
7   4  4   232
8   4  4   332
9   4  4    21
10  4  4    34
11  5  5   345
12  5  5    54
13  5  5    76
14  5  5    87
15  6  6    34
16  6  6    23
17  6  6    87
after step 2
    x  y  area
0   1  1    45
1   0  0    26
2   2  2 

In [56]:
df_compact.drop_duplicates()

Unnamed: 0,x,y,area
0,0,0,26
3,1,1,45
6,2,2,395
9,4,4,154
13,5,5,140
17,6,6,48
22,1,1,40
23,2,2,306
24,4,4,185
25,5,5,149


In [37]:
df = pd.DataFrame({'name': ['jon','sam','jane','bob'],
           'age': [30,25,18,26],
           'sex':['male','male','female','male']})
# new line
line = pd.DataFrame({'name': 'dean', 'age': 45, 'sex': 'male'}, index=[0])
# concatenate two dataframe
df = pd.concat([line,df.ix[:]]).reset_index(drop=True)
print (df)

   name  age     sex
0  dean   45    male
1   jon   30    male
2   sam   25    male
3  jane   18  female
4   bob   26    male


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  import sys
