In [1]:
import pandas as pd
import utils
import os

### Merge SDSS and image mapping columns into spec_sample and phot_sample

In [None]:
# Read the four input tables in one pass; the unpacking order on the left
# matches the order of the paths listed below.
spec_sample, phot_sample, SDSS, image_mapping = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/zoo2MainSpecz.csv",
        "data/zoo2MainPhotoz.csv",
        "data/gz2sample.csv",
        "data/3565489/gz2_filename_mapping.csv",
    )
)

In [None]:
# PETRO* columns copied from SDSS into both samples (same order for each).
petro_columns = [
    'PETROR50_R', 'PETROR90_R',
    'PETROMAG_U', 'PETROMAG_G', 'PETROMAG_R', 'PETROMAG_I', 'PETROMAG_Z',
    'PETROMAGERR_U', 'PETROMAGERR_G', 'PETROMAGERR_R', 'PETROMAGERR_I', 'PETROMAGERR_Z',
]

# Left-join each sample to SDSS (sample.dr7objid == SDSS.OBJID), then discard
# the join key brought in from the right-hand side, since dr7objid already
# holds the same identifier.
# The spectroscopic sample additionally receives the redshift columns.
spec_sample = (
    spec_sample
    .merge(SDSS[['OBJID', 'REDSHIFT', 'REDSHIFTERR'] + petro_columns],
           left_on='dr7objid', right_on='OBJID', how='left')
    .drop(columns=['OBJID'])
)

phot_sample = (
    phot_sample
    .merge(SDSS[['OBJID'] + petro_columns],
           left_on='dr7objid', right_on='OBJID', how='left')
    .drop(columns=['OBJID'])
)

In [None]:
# Attach each galaxy's asset_id by left-joining both samples to the image
# filename mapping (sample.dr7objid == image_mapping.objid). The right-hand
# join key is dropped afterwards because dr7objid already carries it.
asset_lookup = image_mapping[['objid', 'asset_id']]

spec_sample = (
    spec_sample
    .merge(asset_lookup, left_on='dr7objid', right_on='objid', how='left')
    .drop(columns=['objid'])
)

phot_sample = (
    phot_sample
    .merge(asset_lookup, left_on='dr7objid', right_on='objid', how='left')
    .drop(columns=['objid'])
)

In [None]:
# Persist both merged tables; the row index is not written to the files.
for merged_frame, csv_path in (
    (spec_sample, "data/spec_sample.csv"),
    (phot_sample, "data/phot_sample.csv"),
):
    merged_frame.to_csv(csv_path, index=False)

### Remove entries in spec_sample without images present

In [None]:
# Reload the merged spectroscopic table from the CSV saved earlier in this
# notebook, so this section can start from the file on disk.
df = pd.read_csv("data/spec_sample.csv")

In [None]:
def _image_missing(asset_id):
    """Return True when no JPEG for ``asset_id`` exists in the image folder.

    A null ``asset_id`` (no match in the filename mapping) counts as missing.
    Integral floats are converted to ``int`` first: a column that contains any
    NaN is read back from CSV as float64, and formatting ``123.0`` directly
    would look for ``123.0.jpg`` and wrongly report every image as missing.
    """
    if pd.isna(asset_id):
        return True
    if isinstance(asset_id, float) and asset_id.is_integer():
        asset_id = int(asset_id)
    return not os.path.exists(f"data/3565489/images/{asset_id}.jpg")


# One filesystem check per row, without the per-row Series that iterrows builds.
missing_mask = df['asset_id'].map(_image_missing).astype(bool)

# Same contract as before: a plain list of the asset ids whose image is absent.
missing_asset_ids = df.loc[missing_mask, 'asset_id'].tolist()

# Fraction of rows without an image; an empty table reports 0.0 instead of
# raising ZeroDivisionError.
print(len(missing_asset_ids) / len(df) if len(df) else 0.0)

In [None]:
# Keep only the rows whose asset_id is not in the list of missing images.
df = df[~df['asset_id'].isin(missing_asset_ids)]

# Write the result back to the spectroscopic sample file that `df` was loaded
# from. The previous target, data/phot_sample.csv, overwrote the photometric
# sample with spectroscopic rows and left spec_sample.csv unfiltered.
df.to_csv("data/spec_sample.csv", index=False)

### Test utils

In [None]:
# Smoke-test the project helper with one hard-coded asset id (523).
# NOTE(review): show_image_by_assetid is defined in utils and not visible here;
# presumably it renders the image for that asset id — confirm.
utils.show_image_by_assetid(523)

### Simplify morphology classification column and add ancillary classification columns

In [2]:
# Start this section from the saved spectroscopic table. No column is used as
# the index (read_csv's default), so a fresh RangeIndex is assigned.
df = pd.read_csv("data/spec_sample.csv")

In [3]:
# Output column name -> regex matching that feature's parenthesised flag
# inside the gz2class string.
special_features = {
    'ring': r'\(r\)',
    'lens/arc': r'\(l\)',
    'disturbed': r'\(d\)',
    'irregular': r'\(i\)',
    'other': r'\(o\)',
    'merger': r'\(m\)',
    'dust lane': r'\(u\)',
}

gz2_labels = df['gz2class']

# Create 'morphology' first so it keeps its position ahead of the flag columns.
df['morphology'] = gz2_labels

# Running copy of the label with every recognised flag removed.
stripped_labels = gz2_labels

for flag_column, flag_regex in special_features.items():
    # 1 where the flag occurs in gz2class, 0 otherwise (missing labels -> 0).
    df[flag_column] = gz2_labels.str.contains(flag_regex, na=False).astype('int64')
    stripped_labels = stripped_labels.str.replace(flag_regex, '', regex=True)

df['morphology'] = stripped_labels



In [4]:
def _has_arm_suffix(label):
    """Return True when ``label`` is a string starting with 'S' and longer than 3 characters.

    Only such labels have their last two characters split off below. Non-string
    values (e.g. NaN from a missing gz2class) return False instead of raising
    AttributeError on ``.startswith``.
    """
    return isinstance(label, str) and label.startswith('S') and len(label) > 3


# Snapshot of the labels before trimming; all three columns derive from it.
morphology_labels = df['morphology']

# Second-to-last and last character of qualifying labels; None everywhere else.
df['arm_count'] = morphology_labels.apply(
    lambda label: label[-2] if _has_arm_suffix(label) else None
)
df['arm_winding'] = morphology_labels.apply(
    lambda label: label[-1] if _has_arm_suffix(label) else None
)

# Remove those two trailing characters from the label; other values are left untouched.
df['morphology'] = morphology_labels.apply(
    lambda label: label[:-2] if _has_arm_suffix(label) else label
)

In [5]:
# Overwrite data/spec_sample.csv with the frame that now carries the
# morphology, feature-flag and arm columns (row index not written).
df.to_csv("data/spec_sample.csv", index=False)

In [None]:
# Asset ids of the rows whose arm_winding code is 't'.
# NOTE(review): 't' presumably marks tightly wound arms — confirm against the
# gz2class coding.
winding_t_ids = df.loc[df['arm_winding'] == 't', 'asset_id'].to_numpy()

# Show the first five as a visual spot check.
for asset_id in winding_t_ids[:5]:
    print(asset_id)
    utils.show_image_by_assetid(asset_id)
    