<a href="https://colab.research.google.com/github/mtsizh/galaxy-morphology-manifold-learning/blob/main/find_best_reduction_parameters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

If you performed dataset curation on your own - upload `curated_imgs.zip` and skip to the next step. Otherwise you can run the following code and download the curated dataseet from GitHub.

In [9]:
!wget -q https://raw.githubusercontent.com/mtsizh/galaxy-morphology-manifold-learning/main/curated_dataset/curated_imgs_multipart.zip && echo "HEAD dowloaded" || "ERROR downloading HEAD"

for i in range(1,8):
  !wget -q https://raw.githubusercontent.com/mtsizh/galaxy-morphology-manifold-learning/main/curated_dataset/curated_imgs_multipart.z0{i}  && echo "PART {i} of 7 OK" || "ERROR downloading PART {i}"

print('MERGING PARTS')
!zip -FF curated_imgs_multipart.zip --out curated_imgs.zip > /dev/null && echo "COMPLETE" || "FAILED"


HEAD dowloaded
PART 1 of 7 OK
PART 2 of 7 OK
PART 3 of 7 OK
PART 4 of 7 OK
PART 5 of 7 OK
PART 6 of 7 OK
PART 7 of 7 OK
MERGING PARTS
COMPLETE


Unzip the curated dataset.

In [10]:
!unzip -q -o curated_imgs.zip && echo "UNZIPPED" || "FAIL"

UNZIPPED


Few libraries are not installed by default. the following code installs `optuna` and `umap-learn`.

In [11]:
!pip install optuna
!pip install umap-learn

try:
  import optuna
  import umap
  from google.colab import output
  output.clear()
except:
  print('ERROR')
finally:
  print('COMPLETE')

COMPLETE


Run the following code to generate a report on different methods. `optuna` is used to get the best parameters for each of the methods: t-SNE, uMap, IsoMap, LLE, PCA. Result is saved in form of a `json` file.

In [None]:
import optuna
import numpy as np
from sklearn.manifold import TSNE, LocallyLinearEmbedding, Isomap
from sklearn.decomposition import PCA
from umap import UMAP
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from PIL import Image
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import json
import pprint


# use different class maps to get different estimations
class_map = {1: 'round', 2: 'inbetween', 3: 'cigar'}
#class_map = {4: 'edge on', 5: 'edge off'}
#class_map = {6: 'smooth', 7: 'featured'}
methods = ['t-SNE', 'LLE', 'Isomap', 'PCA', 'uMap']
n_bootstrap_samples = 50
n_parameter_trials = 50


df = pd.read_parquet('curated_dataset.parquet')
regex_filter = '|'.join(class_map.values())
filtered_df = df[df['class'].str.contains(regex_filter, regex=True)]
bootstrapped_df = filtered_df.sample(n=n_bootstrap_samples, random_state=25)
X = np.zeros((len(bootstrapped_df), 120, 120))
y = np.zeros(len(bootstrapped_df))

for key, val in class_map.items():
  y[bootstrapped_df['class'].str.contains(val, regex=True)] = key

print('Dataset balance:')
for k,v in class_map.items():
  print(f'class {v} has {np.sum(y == k)} items')
print('-----------------------------------------')

print('LOAD IMAGES')
paths = bootstrapped_df['png_loc'].str.replace('dr5', 'curated_imgs')
with tqdm(total=len(paths)) as progress:
  for idx, file_path in enumerate(paths):
    with Image.open(file_path) as img:
      X[idx,:,:] = np.array(img)
      progress.update()
X_flattened = X.reshape(X.shape[0], -1)



def objective(trial, methods):
  dr_method = trial.suggest_categorical('dr_method', methods)

  if dr_method == 't-SNE':
    # for t-SNE # of components should be no greater that # of samples and # of features
    n_components = trial.suggest_int('n_components', 2, np.min([200, X.shape[0]-1, X.shape[1]-1]))
    perplexity = trial.suggest_int('perplexity', 5, min(50, X.shape[0]-1)) # perplexity < samples
    learning_rate = trial.suggest_float('learning_rate', 10, 1000, log=True)
    reducer = TSNE(n_components=n_components, perplexity=perplexity,
                    learning_rate=learning_rate, method='exact')
  elif dr_method == 'LLE':
    n_components = trial.suggest_int('n_components', 2, min(200, X.shape[0]-1)) # components < samples
    n_neighbors = trial.suggest_int('n_neighbors', min(5, n_components),
                                    max(50, n_components)) # neighbors <= samples
    reducer = LocallyLinearEmbedding(n_components=n_components, n_neighbors=n_neighbors)
  elif dr_method == 'Isomap':
    n_components = trial.suggest_int('n_components', 2, min(200, X.shape[0]-1)) # components < samples
    n_neighbors = trial.suggest_int('n_neighbors', 10, min(50, X.shape[0]//2))
    reducer = Isomap(n_components=n_components, n_neighbors=n_neighbors)
  elif dr_method == 'PCA':
    n_components = trial.suggest_int('n_components', 2, np.min([200, X.shape[0]-1, X.shape[1]-1]))
    reducer = PCA(n_components=n_components)
  elif dr_method == 'uMap':
    n_components = trial.suggest_int('n_components', 2, np.min([200, X.shape[0]//2, X.shape[1]-1]))
    n_neighbors = trial.suggest_int('n_neighbors', 5, np.min(50, X[0]-1)) # neighbors < samples
    reducer = UMAP(n_neighbors=n_neighbors, n_components=n_components)


  try: #ISOMAP fails without any reason
    X_reduced = reducer.fit_transform(X_flattened)
  except ValueError as e:
    print(f"Skipping {method} trial due to error: {e}")
    return -np.inf # not to spoil the result

  #n_estimators = trial.suggest_int('n_estimators', 50, 300)
  #clf = RandomForestClassifier(n_estimators=n_estimators)
  clf = make_pipeline(StandardScaler(),
                      LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42))

  return np.mean(cross_val_score(clf, X_reduced, y, cv=5))

# ignore annoying futurewarnings
import warnings
warnings.filterwarnings("ignore", message=".*force_all_finite.*", category=FutureWarning)

result = []
for method in methods:
  print(f'********************************{method}***************************')
  study = optuna.create_study(direction="maximize")
  study.optimize(lambda T: objective(T, [method]),
                n_trials=n_parameter_trials, show_progress_bar=True, n_jobs=2)
  print("Best parameters:", study.best_params, "Best value:", study.best_value)
  result.append(study.best_params)
  result[-1]['best_vavlue'] = study.best_value


pretty_json_str = pprint.pformat(result, compact=True).replace("'",'"')
with open("results.json", "w") as outfile:
    outfile.write(pretty_json_str)

Dataset balance:
class round has 17 items
class inbetween has 26 items
class cigar has 7 items
-----------------------------------------
LOAD IMAGES


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-02-13 12:06:32,764] A new study created in memory with name: no-name-0b3a7c51-188c-4e7a-a1d3-f64ce47c6839


********************************t-SNE***************************


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-02-13 12:06:34,985] Trial 1 finished with value: 0.38 and parameters: {'dr_method': 't-SNE', 'n_components': 22, 'perplexity': 20, 'learning_rate': 89.00329319598727}. Best is trial 1 with value: 0.38.
[I 2025-02-13 12:06:35,149] Trial 0 finished with value: 0.4600000000000001 and parameters: {'dr_method': 't-SNE', 'n_components': 42, 'perplexity': 47, 'learning_rate': 18.682969462645865}. Best is trial 0 with value: 0.4600000000000001.
[I 2025-02-13 12:06:37,182] Trial 3 finished with value: 0.5199999999999999 and parameters: {'dr_method': 't-SNE', 'n_components': 2, 'perplexity': 14, 'learning_rate': 720.7895180263499}. Best is trial 3 with value: 0.5199999999999999.
[I 2025-02-13 12:06:37,741] Trial 2 finished with value: 0.36 and parameters: {'dr_method': 't-SNE', 'n_components': 15, 'perplexity': 47, 'learning_rate': 32.147815241080394}. Best is trial 3 with value: 0.5199999999999999.
[I 2025-02-13 12:06:38,050] Trial 4 finished with value: 0.26 and parameters: {'dr_method

[I 2025-02-13 12:07:21,053] A new study created in memory with name: no-name-7cb90f97-88ac-4043-bdba-937a4789aa56


[I 2025-02-13 12:07:21,045] Trial 49 finished with value: 0.42000000000000004 and parameters: {'dr_method': 't-SNE', 'n_components': 14, 'perplexity': 35, 'learning_rate': 11.701520405187434}. Best is trial 34 with value: 0.6199999999999999.
Best parameters: {'dr_method': 't-SNE', 'n_components': 19, 'perplexity': 20, 'learning_rate': 63.950780445407204} Best value: 0.6199999999999999
********************************LLE***************************


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-02-13 12:07:21,289] Trial 0 finished with value: 0.43999999999999995 and parameters: {'dr_method': 'LLE', 'n_components': 45, 'n_neighbors': 23}. Best is trial 0 with value: 0.43999999999999995.
[I 2025-02-13 12:07:21,425] Trial 1 finished with value: 0.48 and parameters: {'dr_method': 'LLE', 'n_components': 19, 'n_neighbors': 41}. Best is trial 1 with value: 0.48.
[I 2025-02-13 12:07:21,538] Trial 3 finished with value: 0.62 and parameters: {'dr_method': 'LLE', 'n_components': 45, 'n_neighbors': 10}. Best is trial 3 with value: 0.62.
[I 2025-02-13 12:07:21,605] Trial 2 finished with value: 0.5 and parameters: {'dr_method': 'LLE', 'n_components': 42, 'n_neighbors': 38}. Best is trial 3 with value: 0.62.
[I 2025-02-13 12:07:21,700] Trial 5 finished with value: 0.66 and parameters: {'dr_method': 'LLE', 'n_components': 18, 'n_neighbors': 12}. Best is trial 5 with value: 0.66.
[I 2025-02-13 12:07:21,847] Trial 4 finished with value: 0.42000000000000004 and parameters: {'dr_method':

[I 2025-02-13 12:07:25,961] A new study created in memory with name: no-name-73ee65ba-610f-4903-9799-054ec84be155


[I 2025-02-13 12:07:25,949] Trial 49 finished with value: 0.48 and parameters: {'dr_method': 'LLE', 'n_components': 25, 'n_neighbors': 29}. Best is trial 34 with value: 0.76.
Best parameters: {'dr_method': 'LLE', 'n_components': 18, 'n_neighbors': 10} Best value: 0.76
********************************Isomap***************************


  0%|          | 0/50 [00:00<?, ?it/s]

Skipping Isomap trial due to error: There are significant negative eigenvalues (0.0276908 of the maximum positive). Either the matrix is not PSD, or there was an issue while computing the eigendecomposition of the matrix.
[I 2025-02-13 12:07:26,037] Trial 0 finished with value: -inf and parameters: {'dr_method': 'Isomap', 'n_components': 45, 'n_neighbors': 23}. Best is trial 0 with value: -inf.
[I 2025-02-13 12:07:26,111] Trial 1 finished with value: 0.45999999999999996 and parameters: {'dr_method': 'Isomap', 'n_components': 24, 'n_neighbors': 11}. Best is trial 1 with value: 0.45999999999999996.
[I 2025-02-13 12:07:26,156] Trial 2 finished with value: 0.45999999999999996 and parameters: {'dr_method': 'Isomap', 'n_components': 27, 'n_neighbors': 10}. Best is trial 1 with value: 0.45999999999999996.
Skipping Isomap trial due to error: There are significant negative eigenvalues (0.0134683 of the maximum positive). Either the matrix is not PSD, or there was an issue while computing the ei

[I 2025-02-13 12:07:29,282] A new study created in memory with name: no-name-f6030463-01d7-481f-828f-4478e2498289


[I 2025-02-13 12:07:29,173] Trial 47 finished with value: 0.5 and parameters: {'dr_method': 'Isomap', 'n_components': 22, 'n_neighbors': 13}. Best is trial 45 with value: 0.5800000000000001.
[I 2025-02-13 12:07:29,222] Trial 48 finished with value: 0.42000000000000004 and parameters: {'dr_method': 'Isomap', 'n_components': 15, 'n_neighbors': 17}. Best is trial 45 with value: 0.5800000000000001.
[I 2025-02-13 12:07:29,266] Trial 49 finished with value: 0.4 and parameters: {'dr_method': 'Isomap', 'n_components': 17, 'n_neighbors': 16}. Best is trial 45 with value: 0.5800000000000001.
Best parameters: {'dr_method': 'Isomap', 'n_components': 22, 'n_neighbors': 14} Best value: 0.5800000000000001
********************************PCA***************************


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-02-13 12:07:29,657] Trial 1 finished with value: 0.43999999999999995 and parameters: {'dr_method': 'PCA', 'n_components': 36}. Best is trial 1 with value: 0.43999999999999995.
[I 2025-02-13 12:07:29,690] Trial 0 finished with value: 0.45999999999999996 and parameters: {'dr_method': 'PCA', 'n_components': 34}. Best is trial 0 with value: 0.45999999999999996.
[I 2025-02-13 12:07:29,845] Trial 2 finished with value: 0.4 and parameters: {'dr_method': 'PCA', 'n_components': 10}. Best is trial 0 with value: 0.45999999999999996.
[I 2025-02-13 12:07:29,851] Trial 3 finished with value: 0.38 and parameters: {'dr_method': 'PCA', 'n_components': 5}. Best is trial 0 with value: 0.45999999999999996.
[I 2025-02-13 12:07:30,045] Trial 4 finished with value: 0.43999999999999995 and parameters: {'dr_method': 'PCA', 'n_components': 18}. Best is trial 0 with value: 0.45999999999999996.
[I 2025-02-13 12:07:30,058] Trial 5 finished with value: 0.41999999999999993 and parameters: {'dr_method': 'PCA'

[I 2025-02-13 12:07:35,689] A new study created in memory with name: no-name-3fe01199-c373-458e-93bf-865a1281d240


[I 2025-02-13 12:07:35,680] Trial 49 finished with value: 0.38 and parameters: {'dr_method': 'PCA', 'n_components': 21}. Best is trial 15 with value: 0.52.
Best parameters: {'dr_method': 'PCA', 'n_components': 30} Best value: 0.52
********************************uMap***************************


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-02-13 12:07:44,705] Trial 0 finished with value: 0.4 and parameters: {'dr_method': 'uMap', 'n_components': 19, 'n_neighbors': 10}. Best is trial 0 with value: 0.4.
[I 2025-02-13 12:07:44,793] Trial 1 finished with value: 0.48 and parameters: {'dr_method': 'uMap', 'n_components': 15, 'n_neighbors': 19}. Best is trial 1 with value: 0.48.
[I 2025-02-13 12:07:45,096] Trial 2 finished with value: 0.37999999999999995 and parameters: {'dr_method': 'uMap', 'n_components': 4, 'n_neighbors': 36}. Best is trial 1 with value: 0.48.
[I 2025-02-13 12:07:45,389] Trial 3 finished with value: 0.43999999999999995 and parameters: {'dr_method': 'uMap', 'n_components': 19, 'n_neighbors': 26}. Best is trial 1 with value: 0.48.
[I 2025-02-13 12:07:45,531] Trial 4 finished with value: 0.5 and parameters: {'dr_method': 'uMap', 'n_components': 23, 'n_neighbors': 14}. Best is trial 4 with value: 0.5.
[I 2025-02-13 12:07:45,660] Trial 6 finished with value: 0.4 and parameters: {'dr_method': 'uMap', 'n_com

  warn(


[I 2025-02-13 12:07:46,670] Trial 11 finished with value: 0.48 and parameters: {'dr_method': 'uMap', 'n_components': 25, 'n_neighbors': 49}. Best is trial 10 with value: 0.52.
[I 2025-02-13 12:07:46,791] Trial 12 finished with value: 0.54 and parameters: {'dr_method': 'uMap', 'n_components': 10, 'n_neighbors': 50}. Best is trial 12 with value: 0.54.
[I 2025-02-13 12:07:46,971] Trial 14 finished with value: 0.52 and parameters: {'dr_method': 'uMap', 'n_components': 10, 'n_neighbors': 49}. Best is trial 12 with value: 0.54.
[I 2025-02-13 12:07:47,063] Trial 13 finished with value: 0.5399999999999999 and parameters: {'dr_method': 'uMap', 'n_components': 9, 'n_neighbors': 36}. Best is trial 12 with value: 0.54.
[I 2025-02-13 12:07:47,219] Trial 15 finished with value: 0.43999999999999995 and parameters: {'dr_method': 'uMap', 'n_components': 11, 'n_neighbors': 40}. Best is trial 12 with value: 0.54.
[I 2025-02-13 12:07:47,351] Trial 16 finished with value: 0.45999999999999996 and parameters

  warn(
  warn(


[I 2025-02-13 12:07:48,701] Trial 25 finished with value: 0.5399999999999999 and parameters: {'dr_method': 'uMap', 'n_components': 12, 'n_neighbors': 39}. Best is trial 20 with value: 0.5599999999999999.
[I 2025-02-13 12:07:48,813] Trial 26 finished with value: 0.45999999999999996 and parameters: {'dr_method': 'uMap', 'n_components': 12, 'n_neighbors': 50}. Best is trial 20 with value: 0.5599999999999999.
[I 2025-02-13 12:07:49,018] Trial 27 finished with value: 0.43999999999999995 and parameters: {'dr_method': 'uMap', 'n_components': 16, 'n_neighbors': 50}. Best is trial 20 with value: 0.5599999999999999.
[I 2025-02-13 12:07:49,167] Trial 28 finished with value: 0.48 and parameters: {'dr_method': 'uMap', 'n_components': 15, 'n_neighbors': 45}. Best is trial 20 with value: 0.5599999999999999.
[I 2025-02-13 12:07:49,317] Trial 30 finished with value: 0.4 and parameters: {'dr_method': 'uMap', 'n_components': 3, 'n_neighbors': 5}. Best is trial 20 with value: 0.5599999999999999.
[I 2025-0

In [None]:
with open('results.json') as json_file:
  data = json.load(json_file)
data