In [1]:
import hashlib

from astropy.table import Table
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the GZ CANDELS catalog from FITS and convert it to a pandas DataFrame.
df = Table.read('/nvme1/scratch/walml/repos/pytorch-galaxy-datasets/hidden_data/gz_candels_table_2_main_release_withURLs.fits').to_pandas()

# These two columns hold bytes after the conversion; decode them to regular str.
for str_col in ('ID', 'location_standard'):
    df[str_col] = [raw.decode('utf-8') for raw in df[str_col]]

# often has trailing whitespace, I assume to be standard length
df['location_standard'] = df['location_standard'].str.rstrip()

In [3]:
# Display the loaded catalog (pandas truncates the view) as a sanity check.
df

Unnamed: 0,ID,RA,Dec,num_classifications,num_classifications_weighted,t00_smooth_or_featured_a0_smooth_frac,t00_smooth_or_featured_a1_features_frac,t00_smooth_or_featured_a2_artifact_frac,t00_smooth_or_featured_a0_smooth_weighted_frac,t00_smooth_or_featured_a1_features_weighted_frac,...,t16_merging_tidal_debris_count,t16_merging_tidal_debris_weight,measured_corr,clean_smooth,clean_featured,clean_clumpy,clean_edge_on,clean_spiral,smooth_disk,location_standard
0,COS_1,150.056820,2.173421,38.0,38.320458,0.026316,0.078947,0.894737,0.000883,0.059558,...,4.0,3.186442,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
1,COS_10,150.056860,2.174979,39.0,38.169358,0.128205,0.025641,0.846154,0.137607,0.041204,...,6.0,5.540188,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
2,COS_1000,150.084970,2.191447,76.0,73.932310,0.644737,0.250000,0.105263,0.714415,0.268447,...,68.0,69.052986,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
3,COS_10000,150.082170,2.299318,39.0,37.427298,0.512821,0.051282,0.435897,0.727801,0.108449,...,22.0,24.196523,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
4,COS_10002,150.203340,2.299440,36.0,34.376101,0.444444,0.083333,0.472222,0.702102,0.082279,...,19.0,18.077062,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49550,UDS_9993,34.385551,-5.220046,36.0,34.858319,0.611111,0.055556,0.333333,0.795564,0.021391,...,24.0,24.000000,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
49551,UDS_9994,34.399144,-5.220006,36.0,32.899298,0.555556,0.055556,0.388889,0.736519,0.035786,...,22.0,21.464227,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
49552,UDS_9995,34.297177,-5.219938,24.0,24.000000,0.416667,0.208333,0.375000,0.530427,0.280443,...,15.0,15.000000,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
49553,UDS_9997,34.319162,-5.219902,38.0,36.207986,0.526316,0.105263,0.368421,0.655580,0.120669,...,24.0,23.464227,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...


In [4]:
# Look at the first image url to confirm it was decoded and stripped as expected.
df['location_standard'][0]

'http://www.galaxyzoo.org.s3.amazonaws.com/subjects/standard/COS_1.jpg'

In [5]:
# Spot-check one randomly chosen url (unseeded, so the value differs between runs).
df['location_standard'].sample(1).squeeze()

'http://www.galaxyzoo.org.s3.amazonaws.com/subjects/standard/COS_32719.jpg'

In [6]:
# Number of rows in the catalog.
len(df)

49555

In [7]:
# Text file that will hold the image urls (one per line) for bulk download.
candels_urls_to_download_loc = '/nvme1/scratch/walml/repos/pytorch-galaxy-datasets/hidden_data/candels_urls.txt'

In [8]:
# Write every image url to the download list, one url per line (no trailing newline).
# A single write() is the right call here: writelines() given one joined string
# would iterate over it character by character.
with open(candels_urls_to_download_loc, 'w') as f:
    f.write('\n'.join(df['location_standard']))

In [9]:
# Print (but do not run) the shell command that downloads every listed url into
# the images directory. Copy the printed command into a terminal to run it.
print(f'cat {candels_urls_to_download_loc} | xargs -n100 -P8 wget --continue --directory-prefix /nvme1/scratch/walml/repos/pytorch-galaxy-datasets/roots/candels/images')

# batch size of 100 (urls), 8 processes

cat /nvme1/scratch/walml/repos/pytorch-galaxy-datasets/hidden_data/candels_urls.txt | xargs -n100 -P8 wget --continue --directory-prefix /nvme1/scratch/walml/repos/pytorch-galaxy-datasets/roots/candels/images


Also modify the catalog to follow my schema/label_cols convention: rename the question and answer columns, then calculate per-answer vote counts.

In [10]:
# Keep an untouched copy of the catalog under its original column names.
# The renaming below starts from this copy, and it is also used later to
# inspect original values.
df_before_rename = df.copy()
df_before_rename.columns.values

array(['ID', 'RA', 'Dec', 'num_classifications',
       'num_classifications_weighted',
       't00_smooth_or_featured_a0_smooth_frac',
       't00_smooth_or_featured_a1_features_frac',
       't00_smooth_or_featured_a2_artifact_frac',
       't00_smooth_or_featured_a0_smooth_weighted_frac',
       't00_smooth_or_featured_a1_features_weighted_frac',
       't00_smooth_or_featured_a2_artifact_weighted_frac',
       't00_smooth_or_featured_count', 't00_smooth_or_featured_weight',
       't01_how_rounded_a0_completely_frac',
       't01_how_rounded_a1_inbetween_frac',
       't01_how_rounded_a2_cigarshaped_frac',
       't01_how_rounded_a0_completely_weighted_frac',
       't01_how_rounded_a1_inbetween_weighted_frac',
       't01_how_rounded_a2_cigarshaped_weighted_frac',
       't01_how_rounded_count', 't01_how_rounded_weight',
       't02_clumpy_appearance_a0_yes_frac',
       't02_clumpy_appearance_a1_no_frac',
       't02_clumpy_appearance_a0_yes_weighted_frac',
       't02_clumpy_app

In [11]:

# rename the questions (for count columns)
# Maps each original question prefix (tNN_...) to the new hyphenated question name.
# The order of the entries matters: the count-column rename pairs keys and values positionally.
question_renamer = {
    't00_smooth_or_featured': 'smooth-or-featured',
    't01_how_rounded': 'how-rounded',
    't02_clumpy_appearance': 'clumpy-appearance',
    't03_how_many_clumps': 'clump-count',
    't04_clump_configuration': 'clump-configuration',
    't05_is_one_clump_brightest': 'one-clump-brightest',
    't06_brightest_clump_central': 'brightest-clump-central',
    't07_galaxy_symmetrical': 'galaxy-symmetrical',
    't08_clumps_embedded_larger_object': 'clumps-embedded-larger-object',
    't09_disk_edge_on': 'disk-edge-on',
    't10_edge_on_bulge': 'edge-on-bulge',
    't11_bar_feature': 'bar',
    't12_spiral_pattern': 'has-spiral-arms',
    't13_spiral_arm_winding': 'spiral-winding',
    't14_spiral_arm_count': 'spiral-arm-count',
    't15_bulge_prominence': 'bulge-size',
    't16_merging_tidal_debris': 'merging'
}

# define the new schema
# this has been moved to the prepared dataset itself - see prepared_datasets/candels
# The imported module provides the candels_pairs / candels_ortho_pairs mappings used below.
from pytorch_galaxy_datasets.prepared_datasets import candels



In [12]:




# rename the _frac's according to new schema
# Explicit mapping from each original (unweighted) answer-fraction column to
# '<new question>_<new answer>_fraction'. Weighted fractions are deliberately
# absent: the weighted columns are dropped after the rename.
column_renamer = {
    't00_smooth_or_featured_a0_smooth_frac': 'smooth-or-featured_smooth_fraction',
    't00_smooth_or_featured_a1_features_frac': 'smooth-or-featured_features_fraction',
    't00_smooth_or_featured_a2_artifact_frac': 'smooth-or-featured_artifact_fraction',
    't01_how_rounded_a0_completely_frac': 'how-rounded_completely_fraction',
    't01_how_rounded_a1_inbetween_frac': 'how-rounded_in-between_fraction',
    't01_how_rounded_a2_cigarshaped_frac': 'how-rounded_cigar-shaped_fraction',
    't02_clumpy_appearance_a0_yes_frac': 'clumpy-appearance_yes_fraction',
    't02_clumpy_appearance_a1_no_frac': 'clumpy-appearance_no_fraction',
    't03_how_many_clumps_a0_1_frac': 'clump-count_1_fraction',
    't03_how_many_clumps_a1_2_frac': 'clump-count_2_fraction',
    't03_how_many_clumps_a2_3_frac': 'clump-count_3_fraction',
    't03_how_many_clumps_a3_4_frac': 'clump-count_4_fraction',
    't03_how_many_clumps_a4_5_plus_frac': 'clump-count_5-plus_fraction',
    't03_how_many_clumps_a5_cant_tell_frac': 'clump-count_cant-tell_fraction',
    't04_clump_configuration_a0_straight_line_frac': 'clump-configuration_straight-line_fraction',
    't04_clump_configuration_a1_chain_frac': 'clump-configuration_chain_fraction',
    't04_clump_configuration_a2_cluster_or_irregular_frac': 'clump-configuration_cluster-or-irregular_fraction',
    't04_clump_configuration_a3_spiral_frac': 'clump-configuration_spiral_fraction',
    't05_is_one_clump_brightest_a0_yes_frac': 'one-clump-brightest_yes_fraction',
    't05_is_one_clump_brightest_a1_no_frac': 'one-clump-brightest_no_fraction',
    't06_brightest_clump_central_a0_yes_frac': 'brightest-clump-central_yes_fraction',
    't06_brightest_clump_central_a1_no_frac': 'brightest-clump-central_no_fraction',
    't07_galaxy_symmetrical_a0_yes_frac': 'galaxy-symmetrical_yes_fraction',
    't07_galaxy_symmetrical_a1_no_frac': 'galaxy-symmetrical_no_fraction',
    't08_clumps_embedded_larger_object_a0_yes_frac': 'clumps-embedded-larger-object_yes_fraction',
    't08_clumps_embedded_larger_object_a1_no_frac': 'clumps-embedded-larger-object_no_fraction',
    't09_disk_edge_on_a0_yes_frac': 'disk-edge-on_yes_fraction',
    't09_disk_edge_on_a1_no_frac': 'disk-edge-on_no_fraction',
    't10_edge_on_bulge_a0_yes_frac': 'edge-on-bulge_yes_fraction',
    't10_edge_on_bulge_a1_no_frac': 'edge-on-bulge_no_fraction',
    't11_bar_feature_a0_yes_frac': 'bar_yes_fraction',
    't11_bar_feature_a1_no_frac': 'bar_no_fraction',
    't12_spiral_pattern_a0_yes_frac': 'has-spiral-arms_yes_fraction',
    't12_spiral_pattern_a1_no_frac': 'has-spiral-arms_no_fraction',
    't13_spiral_arm_winding_a0_tight_frac': 'spiral-winding_tight_fraction',
    't13_spiral_arm_winding_a1_medium_frac': 'spiral-winding_medium_fraction',
    't13_spiral_arm_winding_a2_loose_frac': 'spiral-winding_loose_fraction',
    't14_spiral_arm_count_a0_1_frac': 'spiral-arm-count_1_fraction',
    't14_spiral_arm_count_a1_2_frac': 'spiral-arm-count_2_fraction',
    't14_spiral_arm_count_a2_3_frac': 'spiral-arm-count_3_fraction',
    't14_spiral_arm_count_a3_4_frac': 'spiral-arm-count_4_fraction',
    't14_spiral_arm_count_a4_5_plus_frac': 'spiral-arm-count_5-plus_fraction',
    't14_spiral_arm_count_a5_cant_tell_frac': 'spiral-arm-count_cant-tell_fraction',
    't15_bulge_prominence_a0_no_bulge_frac': 'bulge-size_none_fraction',
    't15_bulge_prominence_a1_obvious_frac': 'bulge-size_obvious_fraction',
    't15_bulge_prominence_a2_dominant_frac': 'bulge-size_dominant_fraction',
    't16_merging_tidal_debris_a0_merging_frac': 'merging_merger_fraction',
    't16_merging_tidal_debris_a1_tidal_debris_frac': 'merging_tidal-debris_fraction',
    't16_merging_tidal_debris_a2_both_frac': 'merging_both_fraction',
    't16_merging_tidal_debris_a3_neither_frac': 'merging_neither_fraction'
}

# Rename the answer-fraction columns to the new schema. Starting from
# df_before_rename (the untouched copy) lets this cell be re-run safely.
df = df_before_rename.rename(columns=column_renamer)

# Drop every weighted column, i.e. any column whose name contains 'weight'
# (the *_weighted_frac, *_weight and num_classifications_weighted columns).
# One drop() call replaces deleting columns while looping over the frame's own column array.
weight_cols = [col for col in df.columns if 'weight' in col]
df = df.drop(columns=weight_cols)

# Show the remaining fraction columns to check the rename took effect.
[x for x in df.columns.values if '_frac' in x]

['smooth-or-featured_smooth_fraction',
 'smooth-or-featured_features_fraction',
 'smooth-or-featured_artifact_fraction',
 'how-rounded_completely_fraction',
 'how-rounded_in-between_fraction',
 'how-rounded_cigar-shaped_fraction',
 'clumpy-appearance_yes_fraction',
 'clumpy-appearance_no_fraction',
 'clump-count_1_fraction',
 'clump-count_2_fraction',
 'clump-count_3_fraction',
 'clump-count_4_fraction',
 'clump-count_5-plus_fraction',
 'clump-count_cant-tell_fraction',
 'clump-configuration_straight-line_fraction',
 'clump-configuration_chain_fraction',
 'clump-configuration_cluster-or-irregular_fraction',
 'clump-configuration_spiral_fraction',
 'one-clump-brightest_yes_fraction',
 'one-clump-brightest_no_fraction',
 'brightest-clump-central_yes_fraction',
 'brightest-clump-central_no_fraction',
 'galaxy-symmetrical_yes_fraction',
 'galaxy-symmetrical_no_fraction',
 'clumps-embedded-larger-object_yes_fraction',
 'clumps-embedded-larger-object_no_fraction',
 'disk-edge-on_yes_fraction

In [13]:


# The new question names, in the same order as the original questions.
new_questions = question_renamer.values()
# Disabled check that the renamed questions are exactly the schema's questions.
# NOTE(review): presumably disabled because candels.candels_pairs does not include every question - confirm.
# assert set(new_questions) == set(candels.candels_pairs.keys())
# set(new_questions) - set(candels.candels_pairs.keys())


In [14]:
# Rename the per-question vote totals:
#   '<original question>_count' -> '<new question>_total-votes'
# Naively these totals are fine, but because of the rare rounding issue
# investigated further down they are later replaced by totals recomputed from
# the rounded per-answer counts.

# Both lists follow the dict's insertion order (guaranteed from python 3.7),
# so the old and new names pair up positionally.
old_count_cols = [f'{old_question}_count' for old_question in question_renamer]
new_count_cols = [f'{new_question}_total-votes' for new_question in question_renamer.values()]

count_renamer = {old: new for old, new in zip(old_count_cols, new_count_cols)}
df = df.rename(columns=count_renamer)

In [15]:
# Inspect the catalog after renaming the fraction and count columns.
df

Unnamed: 0,ID,RA,Dec,num_classifications,smooth-or-featured_smooth_fraction,smooth-or-featured_features_fraction,smooth-or-featured_artifact_fraction,smooth-or-featured_total-votes,how-rounded_completely_fraction,how-rounded_in-between_fraction,...,merging_neither_fraction,merging_total-votes,measured_corr,clean_smooth,clean_featured,clean_clumpy,clean_edge_on,clean_spiral,smooth_disk,location_standard
0,COS_1,150.056820,2.173421,38.0,0.026316,0.078947,0.894737,38.0,0.000000,1.000000,...,0.750000,4.0,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
1,COS_10,150.056860,2.174979,39.0,0.128205,0.025641,0.846154,39.0,0.600000,0.400000,...,0.833333,6.0,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
2,COS_1000,150.084970,2.191447,76.0,0.644737,0.250000,0.105263,76.0,0.000000,0.877551,...,0.911765,68.0,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
3,COS_10000,150.082170,2.299318,39.0,0.512821,0.051282,0.435897,39.0,0.800000,0.200000,...,0.954545,22.0,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
4,COS_10002,150.203340,2.299440,36.0,0.444444,0.083333,0.472222,36.0,0.562500,0.437500,...,0.842105,19.0,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49550,UDS_9993,34.385551,-5.220046,36.0,0.611111,0.055556,0.333333,36.0,0.954545,0.045455,...,0.958333,24.0,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
49551,UDS_9994,34.399144,-5.220006,36.0,0.555556,0.055556,0.388889,36.0,0.850000,0.150000,...,0.863636,22.0,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
49552,UDS_9995,34.297177,-5.219938,24.0,0.416667,0.208333,0.375000,24.0,0.500000,0.500000,...,0.266667,15.0,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...
49553,UDS_9997,34.319162,-5.219902,38.0,0.526316,0.105263,0.368421,38.0,0.500000,0.450000,...,0.625000,24.0,False,False,False,False,False,False,False,http://www.galaxyzoo.org.s3.amazonaws.com/subj...


In [16]:


# Turn each answer's vote fraction back into a vote count, following the new schema.
for question, answers in candels.candels_pairs.items():
    total_col = question + '_total-votes'
    answer_cols = [question + answer for answer in answers]

    for count_col in answer_cols:
        # fraction * total-votes should be a whole number of votes, but it is not
        # always (I've asked Brooke about this). Temporarily, just round them.
        df[count_col] = np.around(df[total_col] * df[count_col + '_fraction'])

    # For now, also overwrite total-votes with the sum of the rounded counts, so
    # that the answers always add up to the total. Ensures dirichlet will be happy.
    df[total_col] = df[answer_cols].sum(axis=1)


        

In [17]:
# Worked example of the rounding issue, taken from the original (pre-rename) catalog:
# for this row the vote fraction times the question's vote count is not a whole number.
row = df_before_rename.iloc[20221]
row['t03_how_many_clumps_a0_1_frac'], row['t03_how_many_clumps_count'], row['t03_how_many_clumps_a0_1_frac'] * row['t03_how_many_clumps_count']

(0.05, 15.0, 0.75)

Some final column tidying

In [18]:
# Columns that should not be part of the final catalog.
# location_standard is removed to not reveal the internal urls (causing AWS bills).
cols_to_drop = [
    'measured_corr',
    'clean_smooth',
    'clean_featured',
    'clean_clumpy',
    'clean_edge_on',
    'clean_spiral',
    'smooth_disk',
    'location_standard',
]

df = df.drop(columns=cols_to_drop)

In [19]:
# Sanity check: for every row, the total votes for the first question (recomputed
# from the rounded per-answer counts) must equal the overall number of classifications.
assert (df['smooth-or-featured_total-votes'] == df['num_classifications']).all()

# Verified equal to the first question's total, so the column is redundant.
del df['num_classifications']

In [20]:
# Replace the original ID column with a whitespace-stripped 'id_str' column
# (added as the last column).
df['id_str'] = df['ID'].str.rstrip()
df = df.drop(columns=['ID'])

In [21]:
# Lower-case the coordinate column names.
df = df.rename(columns=dict(RA='ra', Dec='dec'))

In [22]:
# Review the column names after tidying.
df.columns.values

array(['ra', 'dec', 'smooth-or-featured_smooth_fraction',
       'smooth-or-featured_features_fraction',
       'smooth-or-featured_artifact_fraction',
       'smooth-or-featured_total-votes',
       'how-rounded_completely_fraction',
       'how-rounded_in-between_fraction',
       'how-rounded_cigar-shaped_fraction', 'how-rounded_total-votes',
       'clumpy-appearance_yes_fraction', 'clumpy-appearance_no_fraction',
       'clumpy-appearance_total-votes', 'clump-count_1_fraction',
       'clump-count_2_fraction', 'clump-count_3_fraction',
       'clump-count_4_fraction', 'clump-count_5-plus_fraction',
       'clump-count_cant-tell_fraction', 'clump-count_total-votes',
       'clump-configuration_straight-line_fraction',
       'clump-configuration_chain_fraction',
       'clump-configuration_cluster-or-irregular_fraction',
       'clump-configuration_spiral_fraction',
       'clump-configuration_total-votes',
       'one-clump-brightest_yes_fraction',
       'one-clump-brightest_no_f

In [23]:
# Image filename for each row: the id plus a .jpg extension (e.g. COS_1.jpg).
df['filename'] = df['id_str'] + '.jpg'

In [24]:
# Check that the first filename looks as expected.
df['filename'].iloc[0]

'COS_1.jpg'

In [25]:
def _answer_columns(pairs):
    """List the label column names implied by a {question: [answer suffixes]} schema.

    For every answer of every question, in schema order, three names are appended:
    the count column (question + answer), its '_fraction' column, and the
    question's '_total-votes' column. The total-votes name is therefore repeated
    once per answer; this keeps lists built from two parallel schemas aligned
    position by position, and the repeats collapse harmlessly when zipped into a dict.
    """
    cols = []
    for question, answers in pairs.items():
        for answer in answers:
            cols.append(question + answer)
            cols.append(question + answer + '_fraction')
            cols.append(question + '_total-votes')
    return cols


normal_answer_cols = _answer_columns(candels.candels_pairs)
ortho_answer_cols = _answer_columns(candels.candels_ortho_pairs)

# The rename pairs the two lists by position, and zip() silently stops at the
# shorter one. Fail loudly instead of quietly leaving columns unrenamed.
if len(normal_answer_cols) != len(ortho_answer_cols):
    raise ValueError(
        'candels_pairs and candels_ortho_pairs imply different numbers of columns: '
        f'{len(normal_answer_cols)} vs {len(ortho_answer_cols)}'
    )

# Columns that appear in neither schema are left untouched by the rename.
df = df.rename(columns=dict(zip(normal_answer_cols, ortho_answer_cols)))

df.columns.values

array(['ra', 'dec', 'smooth-or-featured-candels_smooth_fraction',
       'smooth-or-featured-candels_features_fraction',
       'smooth-or-featured-candels_artifact_fraction',
       'smooth-or-featured-candels_total-votes',
       'how-rounded-candels_completely_fraction',
       'how-rounded-candels_in-between_fraction',
       'how-rounded-candels_cigar-shaped_fraction',
       'how-rounded-candels_total-votes',
       'clumpy-appearance-candels_yes_fraction',
       'clumpy-appearance-candels_no_fraction',
       'clumpy-appearance-candels_total-votes',
       'clump-count-candels_1_fraction', 'clump-count-candels_2_fraction',
       'clump-count-candels_3_fraction', 'clump-count-candels_4_fraction',
       'clump-count-candels_5-plus_fraction',
       'clump-count-candels_cant-tell_fraction',
       'clump-count-candels_total-votes',
       'clump-configuration_straight-line_fraction',
       'clump-configuration_chain_fraction',
       'clump-configuration_cluster-or-irregular_fr

In [26]:
import os

# Expected on-disk location of every row's image.
temp = '/home/walml/repos/pytorch-galaxy-datasets/roots/candels/images/' + df['filename']

# Fail fast if even the first image is absent (e.g. the directory is wrong).
assert os.path.isfile(temp[0])

# One flag per row: does the image file exist?
is_file = list(map(os.path.isfile, temp))

# (fraction of images present, number of images missing)
np.mean(is_file), len(is_file) - np.sum(is_file)

(0.999939461204722, 3)

In [27]:
# Keep only the rows whose image exists on disk (is_file holds one flag per row),
# then renumber the index from 0.
# NOTE: not idempotent - is_file must be recomputed before this cell is run again.
df = df[is_file].reset_index(drop=True)  # 3 are missing, drop these

In [30]:
from PIL import Image


def _image_size(path):
    """Return the (width, height) of the image at `path`, closing the file afterwards."""
    # Image.open keeps the underlying file open; the context manager closes it
    # promptly instead of leaving tens of thousands of handles to the garbage collector.
    with Image.open(path) as image:
        return image.size


temp = '/home/walml/repos/pytorch-galaxy-datasets/roots/candels/images/' + df['filename']

# Every remaining image is expected to be 424x424 pixels.
is_correct_shape_pil = np.array([_image_size(loc) == (424, 424) for loc in temp])
assert all(is_correct_shape_pil)
# np.sum(~is_correct_shape_pil)  # number of wrongly-sized images, for debugging a failure

Now split the catalog into train and test sets

In [44]:
# Random 80/20 train/test split of the rows, with a fixed random_state so the split is repeatable.
train_catalog, test_catalog = train_test_split(df, test_size=0.2, random_state=42)

In [45]:
# Paths of the train and test catalogs (parquet).
# NOTE(review): these sit under /home/walml/..., while the url list and images_loc
# use /nvme1/scratch/walml/... - presumably the same repo via a link; confirm.
train_loc = '/home/walml/repos/pytorch-galaxy-datasets/roots/candels/candels_ortho_train_catalog.parquet'
test_loc = '/home/walml/repos/pytorch-galaxy-datasets/roots/candels/candels_ortho_test_catalog.parquet'

In [46]:
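# NOTE(review): the writes are commented out, so the checksums computed below are
# of whatever files already exist at train_loc and test_loc.
# train_catalog.to_parquet(train_loc, index=False)
# test_catalog.to_parquet(test_loc, index=False)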

In [47]:
# Path of the compressed archive of images; it is only checksummed below
# (creating the archive is not shown in this notebook).
images_loc = '/nvme1/scratch/walml/repos/pytorch-galaxy-datasets/roots/candels/candels_images.tar.gz'

In [48]:
def _md5_of_file(path, chunk_size=2 ** 20):
    """Return the hex MD5 digest of the file at `path`.

    The file is read in blocks of `chunk_size` bytes, so even a large file
    (such as the image archive) is never held in memory all at once. The digest
    equals that of hashing the whole file content in one go.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        # iter(callable, sentinel): keep reading until read() returns b'' at end of file
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


# Print the checksum of each published file: train catalog, test catalog, images.
for loc in [
    train_loc,
    test_loc,
    images_loc
]:
    md5_checksum = _md5_of_file(loc)

    print(md5_checksum)

1861cedb8d2eb1ebd6e3fe562ae84f93
793cd18874aaa48b5884b564d19dade5
b621ee4e650cf084a1a0c1fe5c9d0a21
