In [1]:
import os

import pandas as pd

from sklearn.model_selection import train_test_split

from zoobot.shared.label_metadata import decals_all_campaigns_ortho_label_cols


In [2]:
dr12 = pd.read_parquet('/home/walml/repos/gz-decals-classifiers/data/catalogs/training_catalogs/dr12_ortho_v2_labelled_catalog.parquet')
dr5 = pd.read_parquet('/home/walml/repos/gz-decals-classifiers/data/catalogs/training_catalogs/dr5_ortho_v2_labelled_catalog.parquet')
dr8 = pd.read_parquet('/home/walml/repos/gz-decals-classifiers/data/catalogs/training_catalogs/dr8_ortho_v2_labelled_catalog.parquet')

In [3]:
megacatalog = pd.read_parquet('/home/walml/repos/decals-rings/results/rings_megacatalog.parquet', columns=['dr8_id', 'iauname', 'redshift'])
megacatalog.sample(5)

Unnamed: 0,dr8_id,iauname,redshift
5206072,430531_1076,,0.212848
4462610,482154_2597,,0.269002
8641681,476140_1345,,0.11806
7054936,225838_2938,,0.120925
8229307,474717_394,J133229.55+253853.4,0.07733


In [4]:
print(len(dr12))
dr12 = pd.merge(dr12, megacatalog, on='iauname', how='inner')
print(len(dr12))

85730
82387


In [5]:
print(len(dr5))
dr5 = pd.merge(dr5, megacatalog, on='iauname', how='inner')
print(len(dr5))

228059
222754


In [6]:
# replace with cross-matched
dr12['id_str'] = dr12['dr8_id']
dr5['id_str'] = dr5['dr8_id']

In [7]:
labelled = pd.concat([dr12, dr5, dr8], axis=0)
len(labelled)

365770

Now make the unlabelled catalog from the remainder

In [8]:
unlabelled = megacatalog[~megacatalog['dr8_id'].isin(labelled['dr8_id'])]
unlabelled['id_str'] = unlabelled['dr8_id']
len(unlabelled)  # mostly high-z, some redshift filtering would be wise

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


8323600

In [9]:
for col in decals_all_campaigns_ortho_label_cols:
    unlabelled[col] = 0
unlabelled = unlabelled.copy().sample(len(unlabelled), random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
  


In [10]:
for df in [labelled, unlabelled]:  # dr8 already has but it points to the png
    df['brickid'] = df['id_str'].apply(lambda x: x.split('_')[0])
    df['objid'] = df['id_str'].apply(lambda x: x.split('_')[1])
    df['file_loc'] = '/share/nas2/walml/galaxy_zoo/decals/dr8/jpg/' + df['brickid'] + '/' + df['id_str'] + '.jpg'
    print(df['file_loc'][0])

  
  This is separate from the ipykernel package so we can avoid doing imports until


0    /share/nas2/walml/galaxy_zoo/decals/dr8/jpg/34...
0    /share/nas2/walml/galaxy_zoo/decals/dr8/jpg/33...
0    /share/nas2/walml/galaxy_zoo/decals/dr8/jpg/17...
Name: file_loc, dtype: object
/share/nas2/walml/galaxy_zoo/decals/dr8/jpg/43543/43543_92.jpg


In [11]:
df.columns.values

array(['dr8_id', 'iauname', 'redshift', 'id_str',
       'smooth-or-featured-dr12_smooth',
       'smooth-or-featured-dr12_featured-or-disk',
       'smooth-or-featured-dr12_artifact', 'disk-edge-on-dr12_yes',
       'disk-edge-on-dr12_no', 'has-spiral-arms-dr12_yes',
       'has-spiral-arms-dr12_no', 'bar-dr12_yes', 'bar-dr12_no',
       'bulge-size-dr12_dominant', 'bulge-size-dr12_obvious',
       'bulge-size-dr12_none', 'how-rounded-dr12_completely',
       'how-rounded-dr12_in-between', 'how-rounded-dr12_cigar-shaped',
       'edge-on-bulge-dr12_boxy', 'edge-on-bulge-dr12_none',
       'edge-on-bulge-dr12_rounded', 'spiral-winding-dr12_tight',
       'spiral-winding-dr12_medium', 'spiral-winding-dr12_loose',
       'spiral-arm-count-dr12_1', 'spiral-arm-count-dr12_2',
       'spiral-arm-count-dr12_3', 'spiral-arm-count-dr12_4',
       'spiral-arm-count-dr12_more-than-4', 'merging-dr12_neither',
       'merging-dr12_tidal-debris', 'merging-dr12_both',
       'merging-dr12_merger', '

In [15]:
import numpy as np

In [18]:
np.any(labelled[decals_all_campaigns_ortho_label_cols].isna())

False

In [23]:
any(labelled[decals_all_campaigns_ortho_label_cols].isna())

True

In [22]:
any(labelled[decals_all_campaigns_ortho_label_cols].isna().values.max(axis=1))

False

In [17]:
np.any(labelled[decals_all_campaigns_ortho_label_cols].isna(), axis=1).mean()

0.0

In [12]:
train_catalog, test_catalog = train_test_split(labelled, test_size=0.2, random_state=42)  # note that because we dropped a few rows which couldn't be cross-matched, this is not the same split as the original_images version

In [13]:
# train_catalog.to_parquet('/home/walml/repos/pytorch-galaxy-datasets/roots/legs/legs_all_campaigns_ortho_dr8_only_train_catalog.parquet', index=False)
# test_catalog.to_parquet('/home/walml/repos/pytorch-galaxy-datasets/roots/legs/legs_all_campaigns_ortho_dr8_only_test_catalog.parquet', index=False)

In [14]:
# unlabelled.to_parquet('/home/walml/repos/pytorch-galaxy-datasets/roots/legs/legs_all_campaigns_ortho_dr8_only_unlabelled_catalog.parquet', index=False)

In [15]:
import hashlib

for loc in [
    '/home/walml/repos/pytorch-galaxy-datasets/roots/legs/legs_all_campaigns_ortho_dr8_only_train_catalog.parquet',
    '/home/walml/repos/pytorch-galaxy-datasets/roots/legs/legs_all_campaigns_ortho_dr8_only_test_catalog.parquet',
    '/home/walml/repos/pytorch-galaxy-datasets/roots/legs/legs_all_campaigns_ortho_dr8_only_unlabelled_catalog.parquet'
]:
    # print hash
    with open(loc, 'rb') as f:
        md5_checksum = hashlib.md5(f.read()).hexdigest()

    print(md5_checksum)

FileNotFoundError: [Errno 2] No such file or directory: '/home/walml/repos/pytorch-galaxy-datasets/roots/legs/legs_all_campaigns_ortho_dr8_only_train_catalog.parquet'