In [24]:
from joblib import load, dump
import pandas as pd
import numpy as np
import os
import itertools

In [2]:
def extract_data_from_file(output_file_path):
  
  output = load(output_file_path)
  
  combinations = [tuple[0] for tuple in output[0]]
  affinity_runtime_min = [output[1]/60 for tuple in output[0]]
  tsne_runtime_min = [tuple[2]/60 for tuple in output[0]]
  KL_divergences = [tuple[3] for tuple in output[0]]
  Pipeline_indices = [tuple[4] for tuple in output[0]]

  modified_tsne_dfs = []
  for tuple in output[0]:
    combination = tuple[0]
    df_tsne = tuple[1]
    df_tsne.columns = [f'tSNE1_{combination}', f'tSNE2_{combination}']
    modified_tsne_dfs.append(df_tsne)

  merged_tsnes_from_file = pd.concat(modified_tsne_dfs, axis=1)

  perplexity = [comb[0] for comb in combinations] # 5
  early_exaggeration = [comb[1] for comb in combinations]
  initial_momentum = [comb[2] for comb in combinations]
  final_momentum = [comb[3] for comb in combinations]
  theta = [comb[3] for comb in combinations]

  pipeline_metrics = pd.DataFrame({'Pipeline_index': Pipeline_indices,
                                 'Combination' : combinations,
                                 'Perplexity': perplexity,
                                 'Early_exaggeration': early_exaggeration,
                                 'Initial_momentum': initial_momentum,
                                 'Final_momentum': final_momentum,
                                 'Theta': theta,
                                 'tSNE_runtime_min': tsne_runtime_min,
                                 'Affinity_runtime_min': affinity_runtime_min,
                                 'KL_divergence': KL_divergences
                                 })

  return(merged_tsnes_from_file, pipeline_metrics)


In [3]:
# testing
out1 = extract_data_from_file('/home/luana/workspace/output/thesis/perp5/pipeline_multiples_perp5_0-60.joblib')
print(out1[0].shape)
print(out1[0].columns[:5])
print(out1[1][:5])
#output = load('/home/luana/workspace/output/thesis/perp5/pipeline_multiples_perp5_0-60.joblib')
#print(len(output[0]))

(39970, 120)
Index(['tSNE1_(5, 4, 0.1, 0.8, 0.25)', 'tSNE2_(5, 4, 0.1, 0.8, 0.25)',
       'tSNE1_(5, 4, 0.1, 0.9, 0.25)', 'tSNE2_(5, 4, 0.1, 0.9, 0.25)',
       'tSNE1_(5, 4, 0.1, 1.0, 0.25)'],
      dtype='object')
  Pipeline_index             Combination  Perplexity  Early_exaggeration  \
0     pipeline_0  (5, 4, 0.1, 0.8, 0.25)           5                   4   
1     pipeline_1  (5, 4, 0.1, 0.9, 0.25)           5                   4   
2     pipeline_2  (5, 4, 0.1, 1.0, 0.25)           5                   4   
3     pipeline_3  (5, 4, 0.3, 0.8, 0.25)           5                   4   
4     pipeline_4  (5, 4, 0.3, 0.9, 0.25)           5                   4   

   Initial_momentum  Final_momentum  Theta  tSNE_runtime_min  \
0               0.1             0.8    0.8         50.428161   
1               0.1             0.9    0.9         50.565599   
2               0.1             1.0    1.0         88.257916   
3               0.3             0.8    0.8         52.064660   
4     

In [4]:
def merge_output_from_folder(output_base_path, folder_relative_path = 'perp5'):
  output_files = os.listdir(os.path.join(output_base_path, folder_relative_path))
  tsne_maps_list = []
  pipeline_metrics_list = []
  for file in output_files:
    if file != 'README':
      full_file_path = os.path.join(output_base_path, folder_relative_path, file)
      data_from_file = extract_data_from_file(full_file_path)
      tsne_maps_list.append(data_from_file[0])
      pipeline_metrics_list.append(data_from_file[1])
     
  return(tsne_maps_list, pipeline_metrics_list)

In [5]:
# testing
output_path = '/home/luana/workspace/output/thesis'
out = merge_output_from_folder(output_base_path=output_path)
print(len(out) == 2) # 2 expected objects
print(len(out[0]) == 4) # 4 concatenated tsne maps, one for each file in folder perp5
print(len(out[1]) == 4) # 4 metrics dataframes, one for each file in folder perp5

True
True
True


In [6]:
# Extract data for all folders
folders = os.listdir(output_path)

all_tsne_maps = []
all_pipeline_metrics = []

for folder in folders:
   output_folder = merge_output_from_folder(output_base_path=output_path, folder_relative_path=folder)
   all_tsne_maps.append(output_folder[0])
   all_pipeline_metrics.append(output_folder[1])
   
all_tsne_maps = [tsne_df for tsne_df in itertools.chain(*all_tsne_maps)]
all_pipeline_metrics = [metrics_df for metrics_df in itertools.chain(*all_pipeline_metrics)]

In [7]:
nbr_files_folder = []
for folder in folders:
    nbr_files_folder.append(len(os.listdir(os.path.join(output_path, folder))))
    print(folder)
    
        
print(nbr_files_folder)
print(len(all_tsne_maps) == sum(nbr_files_folder) -1) # exclude 'README' in perp25
print(len(all_pipeline_metrics) == sum(nbr_files_folder) - 1)

perp65
perp45
perp25
perp5
[2, 3, 5, 4]
True
True


In [8]:
df_tsne_all = pd.concat(all_tsne_maps, axis = 1)
df_metrics_all = pd.concat(all_pipeline_metrics, axis = 0)

In [9]:
# checking dimensions
print(2*df_metrics_all.shape[0] == df_tsne_all.shape[1])

True


In [10]:
df_tsne_all

Unnamed: 0,"tSNE1_(65, 4, 0.1, 0.8, 0.25)","tSNE2_(65, 4, 0.1, 0.8, 0.25)","tSNE1_(65, 4, 0.1, 0.9, 0.25)","tSNE2_(65, 4, 0.1, 0.9, 0.25)","tSNE1_(65, 4, 0.1, 1.0, 0.25)","tSNE2_(65, 4, 0.1, 1.0, 0.25)","tSNE1_(65, 4, 0.3, 0.8, 0.25)","tSNE2_(65, 4, 0.3, 0.8, 0.25)","tSNE1_(65, 4, 0.3, 0.9, 0.25)","tSNE2_(65, 4, 0.3, 0.9, 0.25)",...,"tSNE1_(5, 25, 0.3, 0.8, 0.75)","tSNE2_(5, 25, 0.3, 0.8, 0.75)","tSNE1_(5, 25, 0.3, 0.9, 0.75)","tSNE2_(5, 25, 0.3, 0.9, 0.75)","tSNE1_(5, 25, 0.3, 1.0, 0.75)","tSNE2_(5, 25, 0.3, 1.0, 0.75)","tSNE1_(5, 25, 0.5, 0.8, 0.75)","tSNE2_(5, 25, 0.5, 0.8, 0.75)","tSNE1_(5, 25, 0.5, 0.9, 0.75)","tSNE2_(5, 25, 0.5, 0.9, 0.75)"
0,-17.852317,-42.415404,-17.852317,-42.415404,-17.852317,-42.415404,-17.852317,-42.415404,-17.852317,-42.415404,...,-9.517069,35.354478,-9.517069,35.354478,-9.517069,35.354478,-9.517069,35.354478,-9.517069,35.354478
1,-17.208992,-40.326899,-17.208992,-40.326899,-17.208992,-40.326899,-17.208992,-40.326899,-17.208992,-40.326899,...,-14.305251,32.414450,-14.305251,32.414450,-14.305251,32.414450,-14.305251,32.414450,-14.305251,32.414450
2,-26.494787,-36.328206,-26.494787,-36.328206,-26.494787,-36.328206,-26.494787,-36.328206,-26.494787,-36.328206,...,-20.083942,24.229093,-20.083942,24.229093,-20.083942,24.229093,-20.083942,24.229093,-20.083942,24.229093
3,-20.272979,-43.814987,-20.272979,-43.814987,-20.272979,-43.814987,-20.272979,-43.814987,-20.272979,-43.814987,...,-9.465496,32.511879,-9.465496,32.511879,-9.465496,32.511879,-9.465496,32.511879,-9.465496,32.511879
4,-26.128778,-34.066852,-26.128778,-34.066852,-26.128778,-34.066852,-26.128778,-34.066852,-26.128778,-34.066852,...,-24.737715,24.655410,-24.737715,24.655410,-24.737715,24.655410,-24.737715,24.655410,-24.737715,24.655410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39965,-15.502500,-14.079405,-15.502500,-14.079405,-15.502500,-14.079405,-15.502500,-14.079405,-15.502500,-14.079405,...,-7.115145,-31.166666,-7.115145,-31.166666,-7.115145,-31.166666,-7.115145,-31.166666,-7.115145,-31.166666
39966,-9.260072,4.940805,-9.260072,4.940805,-9.260072,4.940805,-9.260072,4.940805,-9.260072,4.940805,...,7.624906,-47.244021,7.624906,-47.244021,7.624906,-47.244021,7.624906,-47.244021,7.624906,-47.244021
39967,-9.818078,4.734473,-9.818078,4.734473,-9.818078,4.734473,-9.818078,4.734473,-9.818078,4.734473,...,4.636817,-49.683122,4.636817,-49.683122,4.636817,-49.683122,4.636817,-49.683122,4.636817,-49.683122
39968,9.530069,14.782672,9.530069,14.782672,9.530069,14.782672,9.530069,14.782672,9.530069,14.782672,...,24.287581,-36.879656,24.287581,-36.879656,24.287581,-36.879656,24.287581,-36.879656,24.287581,-36.879656


In [11]:
df_metrics_all

Unnamed: 0,Pipeline_index,Combination,Perplexity,Early_exaggeration,Initial_momentum,Final_momentum,Theta,tSNE_runtime_min,Affinity_runtime_min,KL_divergence
0,pipeline_0,"(65, 4, 0.1, 0.8, 0.25)",65,4,0.1,0.8,0.8,60.081908,12.199162,2.927480
1,pipeline_1,"(65, 4, 0.1, 0.9, 0.25)",65,4,0.1,0.9,0.9,58.739115,12.199162,2.927480
2,pipeline_2,"(65, 4, 0.1, 1.0, 0.25)",65,4,0.1,1.0,1.0,59.221802,12.199162,2.927480
3,pipeline_3,"(65, 4, 0.3, 0.8, 0.25)",65,4,0.3,0.8,0.8,58.614608,12.199162,2.927480
4,pipeline_4,"(65, 4, 0.3, 0.9, 0.25)",65,4,0.3,0.9,0.9,59.590521,12.199162,2.927480
...,...,...,...,...,...,...,...,...,...,...
39,pipeline_0,"(5, 25, 0.3, 0.8, 0.75)",5,25,0.3,0.8,0.8,10.114202,5.686386,3.825175
40,pipeline_0,"(5, 25, 0.3, 0.9, 0.75)",5,25,0.3,0.9,0.9,10.089153,5.686386,3.825175
41,pipeline_0,"(5, 25, 0.3, 1.0, 0.75)",5,25,0.3,1.0,1.0,10.105947,5.686386,3.825175
42,pipeline_0,"(5, 25, 0.5, 0.8, 0.75)",5,25,0.5,0.8,0.8,10.108778,5.686386,3.825175


In [12]:
# duplicated pipelines
combinations = df_metrics_all['Combination'].to_numpy()
unique_values, indices_unique = np.unique(combinations, return_index=True)
print(f'There are {len(combinations) - len(unique_values)} duplicated pipelines')
indices_duplicated = np.setdiff1d(np.arange(len(combinations)), indices_unique)
print(combinations[indices_duplicated[0]])
df_metrics_all[df_metrics_all['Combination'] == combinations[indices_duplicated[0]]]
#.isin(combinations[indices_duplicated[0]]))

There are 43 duplicated pipelines
(5, 18, 0.1, 0.8, 0.75)


Unnamed: 0,Pipeline_index,Combination,Perplexity,Early_exaggeration,Initial_momentum,Final_momentum,Theta,tSNE_runtime_min,Affinity_runtime_min,KL_divergence
30,pipeline_0,"(5, 18, 0.1, 0.8, 0.75)",5,18,0.1,0.8,0.8,9.848447,5.676407,3.818215
0,pipeline_0,"(5, 18, 0.1, 0.8, 0.75)",5,18,0.1,0.8,0.8,10.03258,5.686386,3.816771


In [13]:
df_metrics_all.iloc[indices_unique]

Unnamed: 0,Pipeline_index,Combination,Perplexity,Early_exaggeration,Initial_momentum,Final_momentum,Theta,tSNE_runtime_min,Affinity_runtime_min,KL_divergence
0,pipeline_0,"(5, 4, 0.1, 0.8, 0.25)",5,4,0.1,0.8,0.8,50.428161,5.632579,3.818212
9,pipeline_9,"(5, 4, 0.1, 0.8, 0.5)",5,4,0.1,0.8,0.8,17.064618,5.632579,3.816298
18,pipeline_18,"(5, 4, 0.1, 0.8, 0.75)",5,4,0.1,0.8,0.8,9.227857,5.632579,3.832424
27,pipeline_27,"(5, 4, 0.1, 0.8, 1.0)",5,4,0.1,0.8,0.8,6.147008,5.632579,3.847199
1,pipeline_1,"(5, 4, 0.1, 0.9, 0.25)",5,4,0.1,0.9,0.9,50.565599,5.632579,3.818212
...,...,...,...,...,...,...,...,...,...,...
15,pipeline_15,"(65, 4, 0.5, 1.0, 1.0)",65,4,0.5,1.0,1.0,8.170208,12.298686,2.948618
16,pipeline_16,"(65, 11, 0.1, 0.8, 0.25)",65,11,0.1,0.8,0.8,60.704797,12.298686,2.936716
17,pipeline_17,"(65, 11, 0.1, 0.9, 0.25)",65,11,0.1,0.9,0.9,81.211638,12.298686,2.942396
18,pipeline_18,"(65, 11, 0.1, 1.0, 0.25)",65,11,0.1,1.0,1.0,61.160962,12.298686,2.942396


In [14]:
# checking if the filter is correct
all(df_metrics_all.iloc[indices_unique]['Combination'].to_numpy() == combinations[indices_unique])

True

In [15]:
df_metrics_unique = df_metrics_all.iloc[indices_unique].sort_values(by=['Perplexity', 'Perplexity', 'Early_exaggeration', 'Initial_momentum', 'Final_momentum', 'Theta'])
df_metrics_unique['Pipeline_index'] = [f'pipeline_{i}' for i in np.arange(len(df_metrics_unique))]
df_metrics_unique

Unnamed: 0,Pipeline_index,Combination,Perplexity,Early_exaggeration,Initial_momentum,Final_momentum,Theta,tSNE_runtime_min,Affinity_runtime_min,KL_divergence
0,pipeline_0,"(5, 4, 0.1, 0.8, 0.25)",5,4,0.1,0.8,0.8,50.428161,5.632579,3.818212
9,pipeline_1,"(5, 4, 0.1, 0.8, 0.5)",5,4,0.1,0.8,0.8,17.064618,5.632579,3.816298
18,pipeline_2,"(5, 4, 0.1, 0.8, 0.75)",5,4,0.1,0.8,0.8,9.227857,5.632579,3.832424
27,pipeline_3,"(5, 4, 0.1, 0.8, 1.0)",5,4,0.1,0.8,0.8,6.147008,5.632579,3.847199
1,pipeline_4,"(5, 4, 0.1, 0.9, 0.25)",5,4,0.1,0.9,0.9,50.565599,5.632579,3.818212
...,...,...,...,...,...,...,...,...,...,...
15,pipeline_542,"(65, 4, 0.5, 1.0, 1.0)",65,4,0.5,1.0,1.0,8.170208,12.298686,2.948618
16,pipeline_543,"(65, 11, 0.1, 0.8, 0.25)",65,11,0.1,0.8,0.8,60.704797,12.298686,2.936716
17,pipeline_544,"(65, 11, 0.1, 0.9, 0.25)",65,11,0.1,0.9,0.9,81.211638,12.298686,2.942396
18,pipeline_545,"(65, 11, 0.1, 1.0, 0.25)",65,11,0.1,1.0,1.0,61.160962,12.298686,2.942396


In [16]:
columns_tsne = df_tsne_all.columns.to_numpy()
unique_values, indices_unique = np.unique(columns_tsne, return_index=True)
print(f'There are {len(columns_tsne) - len(unique_values)} duplicated pipelines')
print('The number of duplicated columns is twice the number of duplicated combinations:', len(columns_tsne) - len(unique_values) == 2*len(indices_duplicated))

There are 86 duplicated pipelines
The number of duplicated columns is twice the number of duplicated combinations: True


In [17]:
df_tsne_unique = df_tsne_all.iloc[:,indices_unique]
df_tsne_unique

Unnamed: 0,"tSNE1_(25, 11, 0.1, 0.8, 0.25)","tSNE1_(25, 11, 0.1, 0.8, 0.75)","tSNE1_(25, 11, 0.1, 0.8, 1.0)","tSNE1_(25, 11, 0.1, 0.9, 0.25)","tSNE1_(25, 11, 0.1, 0.9, 0.5)","tSNE1_(25, 11, 0.1, 0.9, 0.75)","tSNE1_(25, 11, 0.1, 0.9, 1.0)","tSNE1_(25, 11, 0.1, 1.0, 0.25)","tSNE1_(25, 11, 0.1, 1.0, 0.5)","tSNE1_(25, 11, 0.1, 1.0, 0.75)",...,"tSNE2_(65, 4, 0.5, 0.8, 0.75)","tSNE2_(65, 4, 0.5, 0.8, 1.0)","tSNE2_(65, 4, 0.5, 0.9, 0.25)","tSNE2_(65, 4, 0.5, 0.9, 0.5)","tSNE2_(65, 4, 0.5, 0.9, 0.75)","tSNE2_(65, 4, 0.5, 0.9, 1.0)","tSNE2_(65, 4, 0.5, 1.0, 0.25)","tSNE2_(65, 4, 0.5, 1.0, 0.5)","tSNE2_(65, 4, 0.5, 1.0, 0.75)","tSNE2_(65, 4, 0.5, 1.0, 1.0)"
0,3.835665,1.217995,5.030805,4.839733,14.601266,1.217995,5.030805,4.839733,10.940853,1.217995,...,-34.610516,-25.323755,-42.415404,-38.348899,-34.610516,-25.323755,-42.415404,-38.348899,-34.610516,-25.323755
1,0.645905,-0.563003,3.257363,2.113803,12.183838,-0.563003,3.257363,2.113803,7.525160,-0.563003,...,-33.115479,-23.950751,-40.326899,-36.475835,-33.115479,-23.950751,-40.326899,-36.475835,-33.115479,-23.950751
2,-10.001225,-10.063546,-4.444393,-8.986851,2.254256,-10.063546,-4.444393,-8.986851,-0.588714,-10.063546,...,-30.317848,-23.003356,-36.328206,-32.331613,-30.317848,-23.003356,-36.328206,-32.331613,-30.317848,-23.003356
3,2.470373,-0.268270,4.258074,3.585484,13.354043,-0.268270,4.258074,3.585484,10.372096,-0.268270,...,-35.856266,-26.417167,-43.814987,-39.256925,-35.856266,-26.417167,-43.814987,-39.256925,-35.856266,-26.417167
4,-10.965291,-10.370815,-5.121404,-9.892124,1.013271,-10.370815,-5.121404,-9.892124,-2.265408,-10.370815,...,-28.530821,-21.803602,-34.066852,-30.355760,-28.530821,-21.803602,-34.066852,-30.355760,-28.530821,-21.803602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39965,-31.906950,-28.268363,-23.492579,-32.282419,-31.590006,-28.268363,-23.492579,-32.282419,-30.727356,-28.268363,...,-12.782724,-8.281367,-14.079405,-12.920415,-12.782724,-8.281367,-14.079405,-12.920415,-12.782724,-8.281367
39966,-18.857990,-17.023031,-13.628076,-20.291223,-18.898117,-17.023031,-13.628076,-20.291223,-18.325749,-17.023031,...,2.796312,3.597627,4.940805,4.967872,2.796312,3.597627,4.940805,4.967872,2.796312,3.597627
39967,-15.994633,-14.793017,-11.475492,-17.598491,-16.355368,-14.793017,-11.475492,-17.598491,-15.506744,-14.793017,...,2.467400,2.785051,4.734473,4.766168,2.467400,2.785051,4.734473,4.766168,2.467400,2.785051
39968,-5.628846,-5.949123,-4.548753,-6.768127,-5.077352,-5.949123,-4.548753,-6.768127,-2.513102,-5.949123,...,10.093991,10.388645,14.782672,13.615082,10.093991,10.388645,14.782672,13.615082,10.093991,10.388645


In [18]:
# checking final dimensions
print(2*df_metrics_unique.shape[0] == df_tsne_unique.shape[1])

True


In [21]:
# Export csv
df_tsne_unique.to_csv('../output/df_tsne_unique.csv', index=False, compression='gzip')
df_metrics_unique.to_csv('../output/df_metrics_unique.csv', index=False)

In [22]:
# Load files
#df_tsne = pd.read_csv('../output/df_tsne_unique.csv', compression='gzip')
#print(df_tsne.shape)

(39970, 1094)


In [26]:
# Export joblib
#dump(df_tsne_unique, '../output/df_tsne_unique.joblib.gz', compress=('gzip', 3))
#dump(df_metrics_unique, '../output/df_metrics_unique.joblib.gz', compress=('gzip', 3))

['../output/df_metrics_unique.joblib.gz']

In [28]:
# Load filed
#df_tsne_unique_read = load('../output/df_tsne_unique.joblib.gz')
#print(df_tsne_unique_read.shape)

(39970, 1094)
