In [1]:
from joblib import load, dump
import pandas as pd
import numpy as np
import os

In [3]:
def extract_data_from_file(output_file_path):
  
  output = load(output_file_path)
  
  combinations = [tuple[0] for tuple in output]
  tsne_runtime_min = [tuple[2]/60 for tuple in output]
  KL_divergences = [tuple[3] for tuple in output]
  #Pipeline_indices = [tuple[3] for tuple in output]

  modified_tsne_dfs = []
  for tuple in output:
    combination = tuple[0]
    df_tsne = tuple[1]
    df_tsne.columns = [f'tSNE1_{combination}', f'tSNE2_{combination}']
    modified_tsne_dfs.append(df_tsne)

  merged_tsnes_from_file = pd.concat(modified_tsne_dfs, axis=1)

  perplexity = [comb[0] for comb in combinations] # 5
  early_exaggeration = [comb[1] for comb in combinations]
  initial_momentum = [comb[2] for comb in combinations]
  final_momentum = [comb[3] for comb in combinations]
  theta = [comb[4] for comb in combinations]

  pipeline_metrics = pd.DataFrame({
                                 'Combination' : combinations,
                                 'Perplexity': perplexity,
                                 'Early_exaggeration': early_exaggeration,
                                 'Initial_momentum': initial_momentum,
                                 'Final_momentum': final_momentum,
                                 'Theta': theta,
                                 'tSNE_runtime_min': tsne_runtime_min,
                                 'KL_divergence': KL_divergences
                                 })

  return(merged_tsnes_from_file, pipeline_metrics)


In [19]:
output_dir = os.path.join(os.path.dirname(os.getcwd()), "output")
files_in_output_dir = os.listdir(output_dir)
tsne_output_fnames = [file for file in files_in_output_dir if not file.startswith("parameter")]
tsne_output_fnames

['pipeline_multiples_0-100_sampled--False_new.joblib',
 'pipeline_multiples_300-460_sampled--False_new.joblib',
 'pipeline_multiples_100-300_sampled--False_new.joblib']

In [21]:
def merge_output_from_folder(tsne_output_fnames, output_dir):
  tsne_maps_list = []
  pipeline_metrics_list = []
  for file in tsne_output_fnames:
    file = os.path.join(output_dir, file)
    print(f"Processing {file}")
    data_from_file = extract_data_from_file(file)
    tsne_maps_list.append(data_from_file[0])
    pipeline_metrics_list.append(data_from_file[1])
     
  return(tsne_maps_list, pipeline_metrics_list)

In [None]:
output_folder = merge_output_from_folder(tsne_output_fnames, output_dir)

Processing /home/luana/workspace/thesis_python/output/pipeline_multiples_0-100_sampled--False_new.joblib
Processing /home/luana/workspace/thesis_python/output/pipeline_multiples_300-460_sampled--False_new.joblib
Processing /home/luana/workspace/thesis_python/output/pipeline_multiples_100-300_sampled--False_new.joblib


In [25]:
tsne_maps_list = output_folder[0]
pipeline_metrics_list = output_folder[1]

print(len(tsne_maps_list))
print(tsne_maps_list[0].shape)
print(len(pipeline_metrics_list))
print(pipeline_metrics_list[0].shape)

3
(5000, 200)
3
(100, 8)


In [28]:
df_tsne_all = pd.concat(tsne_maps_list, axis = 1)
print(df_tsne_all.shape)

df_metrics_all = pd.concat(pipeline_metrics_list, axis = 0)
print(df_metrics_all.shape)
print(df_metrics_all.head())

(5000, 920)
(460, 8)
                                         Combination  Perplexity  \
0  (63.487394957983184, 7.0588235294117645, 0.200...   63.487395   
1  (75.67226890756302, 10.117647058823529, 0.3554...   75.672269   
2  (43.99159663865546, 4.9411764705882355, 0.4428...   43.991597   
3  (25.71428571428571, 4.235294117647059, 0.35546...   25.714286   
4  (34.24369747899159, 20.470588235294116, 0.3319...   34.243697   

   Early_exaggeration  Initial_momentum  Final_momentum  Theta  \
0            7.058824          0.200840        0.800000   0.12   
1           10.117647          0.355462        0.853782   0.40   
2            4.941176          0.442857        0.867227   0.88   
3            4.235294          0.355462        0.805042   0.40   
4           20.470588          0.331933        0.994958   0.43   

   tSNE_runtime_min  KL_divergence  
0         12.266611       2.564458  
1          3.185728       2.494597  
2          1.617206       2.820891  
3          2.972111      

In [29]:
# checking dimensions
print(2*df_metrics_all.shape[0] == df_tsne_all.shape[1])

True


In [30]:
df_tsne_all

Unnamed: 0,"tSNE1_(63.487394957983184, 7.0588235294117645, 0.2008403361344538, 0.8, 0.12)","tSNE2_(63.487394957983184, 7.0588235294117645, 0.2008403361344538, 0.8, 0.12)","tSNE1_(75.67226890756302, 10.117647058823529, 0.35546218487394965, 0.8537815126050421, 0.4)","tSNE2_(75.67226890756302, 10.117647058823529, 0.35546218487394965, 0.8537815126050421, 0.4)","tSNE1_(43.99159663865546, 4.9411764705882355, 0.44285714285714284, 0.8672268907563025, 0.88)","tSNE2_(43.99159663865546, 4.9411764705882355, 0.44285714285714284, 0.8672268907563025, 0.88)","tSNE1_(25.71428571428571, 4.235294117647059, 0.35546218487394965, 0.8050420168067227, 0.4)","tSNE2_(25.71428571428571, 4.235294117647059, 0.35546218487394965, 0.8050420168067227, 0.4)","tSNE1_(34.24369747899159, 20.470588235294116, 0.33193277310924374, 0.9949579831932773, 0.43)","tSNE2_(34.24369747899159, 20.470588235294116, 0.33193277310924374, 0.9949579831932773, 0.43)",...,"tSNE1_(106.13445378151259, 19.764705882352942, 0.12016806722689076, 0.8890756302521009, 0.98)","tSNE2_(106.13445378151259, 19.764705882352942, 0.12016806722689076, 0.8890756302521009, 0.98)","tSNE1_(48.86554621848739, 7.294117647058823, 0.40588235294117647, 0.9176470588235295, 0.29)","tSNE2_(48.86554621848739, 7.294117647058823, 0.40588235294117647, 0.9176470588235295, 0.29)","tSNE1_(125.63025210084032, 18.588235294117645, 0.22436974789915967, 0.8991596638655462, 0.22)","tSNE2_(125.63025210084032, 18.588235294117645, 0.22436974789915967, 0.8991596638655462, 0.22)","tSNE1_(9.873949579831933, 23.294117647058822, 0.18403361344537816, 0.9025210084033614, 0.65)","tSNE2_(9.873949579831933, 23.294117647058822, 0.18403361344537816, 0.9025210084033614, 0.65)","tSNE1_(76.890756302521, 26.588235294117645, 0.30840336134453783, 0.9411764705882353, 0.97)","tSNE2_(76.890756302521, 26.588235294117645, 0.30840336134453783, 0.9411764705882353, 0.97)"
0,-0.256665,3.599885,-1.011557,2.475599,-3.342591,-0.865847,-3.690811,0.925958,802.786451,-853.210289,...,1.349553,-1.200859,-4.618703,-1.496852,-1.716219,1.236932,-29.710792,12.033866,-4.838944,-0.030026
1,2.547580,-11.835203,-3.898642,-9.051478,13.555174,0.840472,18.824165,4.243612,-807.264817,1721.220161,...,-6.152406,0.870245,17.095935,0.148064,8.599384,1.973948,-28.683412,6.438817,6.719333,4.117654
2,-4.958115,-4.065024,-4.058826,-8.482615,13.487370,1.024189,18.657472,4.152325,-734.575441,1790.329286,...,-5.898257,0.481237,17.010859,0.393492,8.357504,2.182458,-28.460441,6.381593,5.781159,4.567381
3,2.767360,1.469839,5.319700,0.641766,0.206652,-1.405085,-0.603393,2.728124,288.049675,260.589927,...,1.737276,-1.504709,-0.066911,-0.656979,-1.892022,1.110934,-1.917996,1.339508,0.033226,2.892997
4,2.502670,-11.171802,-3.609499,-8.627820,12.976409,0.869084,17.559176,4.701734,1353.973565,1404.196355,...,-5.889770,1.259754,16.234935,0.061788,8.212077,1.653285,-27.379308,6.223015,6.892627,3.339524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,3.856321,-7.060350,-1.664163,-7.737303,8.249131,-1.673438,12.099325,3.181017,-185.655686,1024.911050,...,-5.208994,5.358784,9.548300,-3.954985,6.623403,0.002397,-23.833870,-8.862120,7.928454,1.150594
4996,-1.045785,9.523487,-1.792322,8.886177,-10.559362,1.810592,-11.193431,6.613696,-1262.478870,-1121.423335,...,1.197536,-5.921238,-12.418440,5.369932,-2.813488,4.844286,14.695378,18.860284,-8.132632,-5.363932
4997,-1.578971,7.182898,3.327387,2.940088,-4.005981,-1.809106,-5.406666,-2.098669,1121.189838,-1112.096535,...,3.380227,-1.340083,-5.390062,-2.844686,-4.938192,-2.370156,16.730800,8.198074,-8.780618,-0.526092
4998,2.804172,9.271810,0.567324,10.600642,-2.187665,-4.973592,-3.432335,-5.312708,631.980240,-506.226632,...,5.738983,-5.176733,-15.475691,3.037440,-7.141641,4.623291,26.250948,13.088191,-11.415144,1.485965


In [31]:
df_metrics_all

Unnamed: 0,Combination,Perplexity,Early_exaggeration,Initial_momentum,Final_momentum,Theta,tSNE_runtime_min,KL_divergence
0,"(63.487394957983184, 7.0588235294117645, 0.200...",63.487395,7.058824,0.200840,0.800000,0.12,12.266611,2.564458
1,"(75.67226890756302, 10.117647058823529, 0.3554...",75.672269,10.117647,0.355462,0.853782,0.40,3.185728,2.494597
2,"(43.99159663865546, 4.9411764705882355, 0.4428...",43.991597,4.941176,0.442857,0.867227,0.88,1.617206,2.820891
3,"(25.71428571428571, 4.235294117647059, 0.35546...",25.714286,4.235294,0.355462,0.805042,0.40,2.972111,3.071129
4,"(34.24369747899159, 20.470588235294116, 0.3319...",34.243697,20.470588,0.331933,0.994958,0.43,2.870700,4.016433
...,...,...,...,...,...,...,...,...
195,"(106.13445378151259, 19.764705882352942, 0.120...",106.134454,19.764706,0.120168,0.889076,0.98,1.832171,2.247465
196,"(48.86554621848739, 7.294117647058823, 0.40588...",48.865546,7.294118,0.405882,0.917647,0.29,4.255687,2.775530
197,"(125.63025210084032, 18.588235294117645, 0.224...",125.630252,18.588235,0.224370,0.899160,0.22,6.886167,2.141492
198,"(9.873949579831933, 23.294117647058822, 0.1840...",9.873950,23.294118,0.184034,0.902521,0.65,1.879079,3.527255


In [32]:
# duplicated pipelines
KL_divergence = df_metrics_all['KL_divergence']
unique_KL_values = np.unique(KL_divergence)
print(f'There are {len(KL_divergence) - len(unique_KL_values)} duplicated pipelines/KL values')

There are 0 duplicated pipelines


In [33]:
# Export joblib
dump(df_tsne_all, '../output/df_tsne_final.joblib.gz', compress=('gzip', 3))
dump(df_metrics_all, '../output/df_metrics_final.joblib.gz', compress=('gzip', 3))

['../output/df_metrics_final.joblib.gz']