In [35]:
from joblib import load, dump
import pandas as pd
import numpy as np
import os

In [36]:
def extract_data_from_file(output_file_path):
  
  output = load(output_file_path)
  
  combinations = [tuple[0] for tuple in output]
  tsne_runtime_min = [tuple[2]/60 for tuple in output]
  KL_divergences = [tuple[3] for tuple in output]
  #Pipeline_indices = [tuple[3] for tuple in output]

  modified_tsne_dfs = []
  for tuple in output:
    combination = tuple[0]
    df_tsne = tuple[1]
    df_tsne.columns = [f'tSNE1_{combination}', f'tSNE2_{combination}']
    modified_tsne_dfs.append(df_tsne)

  merged_tsnes_from_file = pd.concat(modified_tsne_dfs, axis=1)

  perplexity = [comb[0] for comb in combinations] # 5
  early_exaggeration = [comb[1] for comb in combinations]
  initial_momentum = [comb[2] for comb in combinations]
  final_momentum = [comb[3] for comb in combinations]
  theta = [comb[4] for comb in combinations]

  pipeline_metrics = pd.DataFrame({
                                 'Combination' : combinations,
                                 'Perplexity': perplexity,
                                 'Early_exaggeration': early_exaggeration,
                                 'Initial_momentum': initial_momentum,
                                 'Final_momentum': final_momentum,
                                 'Theta': theta,
                                 'tSNE_runtime_min': tsne_runtime_min,
                                 'KL_divergence': KL_divergences
                                 })

  return(merged_tsnes_from_file, pipeline_metrics)


In [42]:
output_dir = os.path.join(os.path.dirname(os.getcwd()), "output")
files_in_output_dir = os.listdir(output_dir)
tsne_output_fnames = [file for file in files_in_output_dir if not file.startswith("parameter")]
tsne_output_fnames

['pipeline_multiples_0-100_sampled--False_new.joblib',
 'pipeline_multiples_300-460_sampled--False_new.joblib',
 'pipeline_multiples_100-300_sampled--False_new.joblib',
 'pipeline_multiples_460-560_sampled--False_new.joblib']

In [40]:
def merge_output_from_folder(tsne_output_fnames, output_dir):
  tsne_maps_list = []
  pipeline_metrics_list = []
  for file in tsne_output_fnames:
    file = os.path.join(output_dir, file)
    print(f"Processing {file}")
    data_from_file = extract_data_from_file(file)
    tsne_maps_list.append(data_from_file[0])
    pipeline_metrics_list.append(data_from_file[1])
     
  return(tsne_maps_list, pipeline_metrics_list)

In [44]:
output_folder = merge_output_from_folder(tsne_output_fnames, output_dir)

Processing /home/luana/workspace/thesis_python/output/pipeline_multiples_0-100_sampled--False_new.joblib
Processing /home/luana/workspace/thesis_python/output/pipeline_multiples_300-460_sampled--False_new.joblib
Processing /home/luana/workspace/thesis_python/output/pipeline_multiples_100-300_sampled--False_new.joblib
Processing /home/luana/workspace/thesis_python/output/pipeline_multiples_460-560_sampled--False_new.joblib


In [45]:
tsne_maps_list = output_folder[0]
pipeline_metrics_list = output_folder[1]

print(len(tsne_maps_list))
print(tsne_maps_list[0].shape)
print(len(pipeline_metrics_list))
print(pipeline_metrics_list[0].shape)

4
(5000, 200)
4
(100, 8)


In [47]:
df_tsne_all = pd.concat(tsne_maps_list, axis = 1)
print(df_tsne_all.shape)

df_metrics_all = pd.concat(pipeline_metrics_list, axis = 0)
print(df_metrics_all.shape)
print(df_metrics_all.head())

(5000, 1070)
(535, 8)
                                         Combination  Perplexity  \
0  (63.487394957983184, 7.0588235294117645, 0.200...   63.487395   
1  (75.67226890756302, 10.117647058823529, 0.3554...   75.672269   
2  (43.99159663865546, 4.9411764705882355, 0.4428...   43.991597   
3  (25.71428571428571, 4.235294117647059, 0.35546...   25.714286   
4  (34.24369747899159, 20.470588235294116, 0.3319...   34.243697   

   Early_exaggeration  Initial_momentum  Final_momentum  Theta  \
0            7.058824          0.200840        0.800000   0.12   
1           10.117647          0.355462        0.853782   0.40   
2            4.941176          0.442857        0.867227   0.88   
3            4.235294          0.355462        0.805042   0.40   
4           20.470588          0.331933        0.994958   0.43   

   tSNE_runtime_min  KL_divergence  
0         12.266611       2.564458  
1          3.185728       2.494597  
2          1.617206       2.820891  
3          2.972111     

In [48]:
# checking dimensions
print(2*df_metrics_all.shape[0] == df_tsne_all.shape[1])

True


In [49]:
df_tsne_all

Unnamed: 0,"tSNE1_(63.487394957983184, 7.0588235294117645, 0.2008403361344538, 0.8, 0.12)","tSNE2_(63.487394957983184, 7.0588235294117645, 0.2008403361344538, 0.8, 0.12)","tSNE1_(75.67226890756302, 10.117647058823529, 0.35546218487394965, 0.8537815126050421, 0.4)","tSNE2_(75.67226890756302, 10.117647058823529, 0.35546218487394965, 0.8537815126050421, 0.4)","tSNE1_(43.99159663865546, 4.9411764705882355, 0.44285714285714284, 0.8672268907563025, 0.88)","tSNE2_(43.99159663865546, 4.9411764705882355, 0.44285714285714284, 0.8672268907563025, 0.88)","tSNE1_(25.71428571428571, 4.235294117647059, 0.35546218487394965, 0.8050420168067227, 0.4)","tSNE2_(25.71428571428571, 4.235294117647059, 0.35546218487394965, 0.8050420168067227, 0.4)","tSNE1_(34.24369747899159, 20.470588235294116, 0.33193277310924374, 0.9949579831932773, 0.43)","tSNE2_(34.24369747899159, 20.470588235294116, 0.33193277310924374, 0.9949579831932773, 0.43)",...,"tSNE1_(117.10084033613444, 10.823529411764707, 0.23109243697478993, 0.8302521008403362, 0.13)","tSNE2_(117.10084033613444, 10.823529411764707, 0.23109243697478993, 0.8302521008403362, 0.13)","tSNE1_(123.19327731092436, 20.941176470588236, 0.31848739495798317, 0.9949579831932773, 0.21)","tSNE2_(123.19327731092436, 20.941176470588236, 0.31848739495798317, 0.9949579831932773, 0.21)","tSNE1_(84.2016806722689, 20.235294117647058, 0.48991596638655466, 0.9546218487394958, 0.23)","tSNE2_(84.2016806722689, 20.235294117647058, 0.48991596638655466, 0.9546218487394958, 0.23)","tSNE1_(28.15126050420168, 6.352941176470589, 0.44621848739495806, 0.8621848739495799, 0.61)","tSNE2_(28.15126050420168, 6.352941176470589, 0.44621848739495806, 0.8621848739495799, 0.61)","tSNE1_(120.7563025210084, 30.352941176470587, 0.24453781512605044, 0.9697478991596639, 0.29)","tSNE2_(120.7563025210084, 30.352941176470587, 0.24453781512605044, 0.9697478991596639, 0.29)"
0,-0.256665,3.599885,-1.011557,2.475599,-3.342591,-0.865847,-3.690811,0.925958,802.786451,-853.210289,...,0.234112,2.044436,-835.493876,-83.541859,1.871910,5.887343,-4.507182,1.275272,-37.863790,-40.749473
1,2.547580,-11.835203,-3.898642,-9.051478,13.555174,0.840472,18.824165,4.243612,-807.264817,1721.220161,...,3.534535,-6.949399,1769.630210,1290.358919,-4.142026,-23.169723,14.463852,7.901721,96.000379,-53.634126
2,-4.958115,-4.065024,-4.058826,-8.482615,13.487370,1.024189,18.657472,4.152325,-734.575441,1790.329286,...,3.638833,-6.670967,841.668518,1114.405082,-4.831756,-23.122070,14.363154,7.763593,93.493888,-56.638105
3,2.767360,1.469839,5.319700,0.641766,0.206652,-1.405085,-0.603393,2.728124,288.049675,260.589927,...,0.288439,2.494085,-827.408943,789.423868,1.674246,0.373266,-1.188026,4.162948,-12.093126,4.288185
4,2.502670,-11.171802,-3.609499,-8.627820,12.976409,0.869084,17.559176,4.701734,1353.973565,1404.196355,...,3.135579,-6.675105,1407.678066,-911.117840,-4.240927,-22.354737,13.188907,7.487783,99.526131,-39.728850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,3.856321,-7.060350,-1.664163,-7.737303,8.249131,-1.673438,12.099325,3.181017,-185.655686,1024.911050,...,1.204700,-6.025826,1476.769104,-783.233635,-3.858087,-17.914316,11.492614,3.168198,122.692942,55.672496
4996,-1.045785,9.523487,-1.792322,8.886177,-10.559362,1.810592,-11.193431,6.613696,-1262.478870,-1121.423335,...,-2.780496,6.140914,-1348.950629,1319.042256,-7.641735,13.400365,-13.878690,4.486712,-82.159429,-98.017479
4997,-1.578971,7.182898,3.327387,2.940088,-4.005981,-1.809106,-5.406666,-2.098669,1121.189838,-1112.096535,...,-2.701400,3.750419,-1313.830480,-1548.193164,10.319850,10.471519,-15.660220,-1.336293,-103.900904,1.963145
4998,2.804172,9.271810,0.567324,10.600642,-2.187665,-4.973592,-3.432335,-5.312708,631.980240,-506.226632,...,2.766233,7.682302,-246.768102,457.986444,-9.672195,5.649242,-10.280328,9.809877,-113.992240,-57.615221


In [50]:
df_metrics_all

Unnamed: 0,Combination,Perplexity,Early_exaggeration,Initial_momentum,Final_momentum,Theta,tSNE_runtime_min,KL_divergence
0,"(63.487394957983184, 7.0588235294117645, 0.200...",63.487395,7.058824,0.200840,0.800000,0.12,12.266611,2.564458
1,"(75.67226890756302, 10.117647058823529, 0.3554...",75.672269,10.117647,0.355462,0.853782,0.40,3.185728,2.494597
2,"(43.99159663865546, 4.9411764705882355, 0.4428...",43.991597,4.941176,0.442857,0.867227,0.88,1.617206,2.820891
3,"(25.71428571428571, 4.235294117647059, 0.35546...",25.714286,4.235294,0.355462,0.805042,0.40,2.972111,3.071129
4,"(34.24369747899159, 20.470588235294116, 0.3319...",34.243697,20.470588,0.331933,0.994958,0.43,2.870700,4.016433
...,...,...,...,...,...,...,...,...
70,"(117.10084033613444, 10.823529411764707, 0.231...",117.100840,10.823529,0.231092,0.830252,0.13,12.545121,2.147929
71,"(123.19327731092436, 20.941176470588236, 0.318...",123.193277,20.941176,0.318487,0.994958,0.21,7.188163,3.374828
72,"(84.2016806722689, 20.235294117647058, 0.48991...",84.201681,20.235294,0.489916,0.954622,0.23,6.545434,2.504608
73,"(28.15126050420168, 6.352941176470589, 0.44621...",28.151261,6.352941,0.446218,0.862185,0.61,2.009543,3.044484


In [51]:
# duplicated pipelines
KL_divergence = df_metrics_all['KL_divergence']
unique_KL_values = np.unique(KL_divergence)
print(f'There are {len(KL_divergence) - len(unique_KL_values)} duplicated pipelines/KL values')

There are 0 duplicated pipelines/KL values


In [52]:
# Export joblib
dump(df_tsne_all, '../output/df_tsne_final.joblib.gz', compress=('gzip', 3))
dump(df_metrics_all, '../output/df_metrics_final.joblib.gz', compress=('gzip', 3))

['../output/df_metrics_final.joblib.gz']