In [28]:
%matplotlib inline

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from mattpy.utils import smooth
from swsnet.norm_utils import normalize_spectrum, renormalize_spectrum

# Read metadata in

In [2]:
# Load the metadata table. As displayed below, each row describes one spectrum:
# aorkey, object name, coordinates, flux units and the path to its pickle.
meta = pd.read_pickle('../metadata.pkl')

In [3]:
# Number of spectra listed in the metadata table (one row each).
nrows = len(meta.index)
meta

Unnamed: 0,aorkey,object_name,ra,dec,flux_units,file_path
0,10019072,HD 99754,172.096220,-23.829099,Jy,spectra/10019072.pkl
1,10019328,HR 5467,219.560550,54.021404,Jy,spectra/10019328.pkl
2,10019584,HD 173511,280.413770,61.546739,Jy,spectra/10019584.pkl
3,10019840,alp Lac,337.820510,50.284642,Jy,spectra/10019840.pkl
4,10020096,HR 6348,255.315400,60.647838,Jy,spectra/10020096.pkl
5,10020352,HIZODY_Camp9_1,6.428490,-1.768108,Jy,spectra/10020352.pkl
6,10020608,HIZODY_Camp9_1,6.428418,-1.773440,Jy,spectra/10020608.pkl
7,10020864,NGC6543-edge,269.624370,66.632947,Jy,spectra/10020864.pkl
8,10021120,P Cyg,304.443720,38.033915,Jy,spectra/10021120.pkl
9,10021376,HD190429,300.869810,36.026100,Jy,spectra/10021376.pkl


# Determine normalization parameters (plotting is optional)

In [4]:
# Switch for the parameter-determination step: while this is False, the cell
# that calls norm_and_plot() and writes step1_norm_params.txt does nothing.
determine_parameters = False

def norm_and_plot(meta, plot=False, verbose=False):
    """Compute normalization parameters for every spectrum listed in ``meta``.

    Parameters
    ----------
    meta : pandas.DataFrame
        Metadata table. Must provide a 'file_path' column; a
        'full_classifier' column is optional.
    plot : bool, optional
        Forwarded to ``normalize_spectrum``. Defaults to False, which is what
        this function always passed before.
    verbose : bool, optional
        Forwarded to ``normalize_spectrum``. Defaults to False.

    Returns
    -------
    list of list
        One ``[filename, *parameters]`` entry per row of ``meta``, where
        ``parameters`` is the sequence returned by ``normalize_spectrum``
        (the notebook saves it as spec_min, spec_max, norm_factor).
    """
    # Take the total from the frame itself instead of the notebook-global
    # `nrows`, which a later cell reassigns to a different table's length.
    n_spectra = len(meta)

    # The classifier column is optional. Pair it with the paths positionally
    # so a non-default index cannot mis-align rows or silently blank them.
    if 'full_classifier' in meta:
        classifiers = meta['full_classifier']
    else:
        classifiers = [''] * n_spectra

    param_list = []

    for index, (filename, classifier) in enumerate(
            zip(meta['file_path'], classifiers)):
        # Lightweight progress report every 200 spectra.
        if index % 200 == 0:
            print(index, ' / ', n_spectra)

        # Perform shift/renormalization.
        parameters = normalize_spectrum(filename, classifier,
                                        plot=plot, verbose=verbose)

        # Keep the file name next to its parameters.
        param_list.append([filename, *parameters])

    return param_list

In [5]:
# Recompute and persist the normalization parameters only when requested.
if determine_parameters:
    header = 'iso_filename, spec_min, spec_max, norm_factor (shift first, then norm!!)'
    par_list = norm_and_plot(meta)

    # Every field is written with '%s', so file names and numbers share one
    # comma-separated text table.
    np.savetxt(
        'step1_norm_params.txt',
        par_list,
        fmt='%s',
        delimiter=',',
        header=header,
    )

0  /  6732
200  /  6732
400  /  6732
600  /  6732
800  /  6732
1000  /  6732
1200  /  6732
1400  /  6732
1600  /  6732
1800  /  6732
2000  /  6732
2200  /  6732
2400  /  6732
2600  /  6732
2800  /  6732
3000  /  6732
3200  /  6732
3400  /  6732
3600  /  6732
3800  /  6732
4000  /  6732
4200  /  6732
4400  /  6732
4600  /  6732
4800  /  6732
5000  /  6732
5200  /  6732
5400  /  6732
5600  /  6732
5800  /  6732
6000  /  6732
6200  /  6732
6400  /  6732
6600  /  6732


### Confirm we can read them back in later.

In [6]:
# Read the saved parameter table back as strings (path + numeric fields).
# ndmin=2 keeps the result two-dimensional even when the file holds a single
# row; otherwise shape[0] would be the number of columns, not of spectra.
norm_params = np.loadtxt('../step1_norm/step1_norm_params.txt', delimiter=',',
                         dtype='str', ndmin=2)
nrows = norm_params.shape[0]

In [7]:
# The first column of the parameter table holds the spectrum pickle paths.
spectra_paths = norm_params.T[0]
spectra_paths

array(['spectra/10019072.pkl', 'spectra/10019328.pkl',
       'spectra/10019584.pkl', ..., 'spectra/9923840.pkl',
       'spectra/9924096.pkl', 'spectra/9924352.pkl'], dtype='<U21')

In [8]:
# Inspect one row: the pickle path followed by the numeric fields, all stored
# as strings because the table was loaded with dtype='str'.
norm_params[0]

array(['spectra/10019072.pkl', '0.08301383', '3.2870865',
       '3.2040727138519287'], dtype='<U21')

# Perform normalization

In [10]:
# Renormalize every spectrum and remember where each result was written.
file_path_list = []

for index, file_path in enumerate(spectra_paths):
    # Progress report every 100 spectra. The total comes from the sequence
    # being iterated, so it does not depend on which cell last set `nrows`.
    if index % 100 == 0:
        print(index, ' / ', len(spectra_paths))

    # Normalization parameters for this spectrum (the full row, path included).
    norm_factors = norm_params[index]

    # Renormalize, save to new pickle.
    save_path = renormalize_spectrum(file_path, norm_factors, verbose=False)

    # Collect the new path; the metadata frame is updated from this list
    # in a later cell.
    file_path_list.append(save_path)

0  /  6732
100  /  6732
200  /  6732
300  /  6732
400  /  6732
500  /  6732
600  /  6732
700  /  6732
800  /  6732
900  /  6732
1000  /  6732
1100  /  6732
1200  /  6732
1300  /  6732
1400  /  6732
1500  /  6732
1600  /  6732
1700  /  6732
1800  /  6732
1900  /  6732
2000  /  6732
2100  /  6732
2200  /  6732
2300  /  6732
2400  /  6732
2500  /  6732
2600  /  6732
2700  /  6732
2800  /  6732
2900  /  6732
3000  /  6732
3100  /  6732
3200  /  6732
3300  /  6732
3400  /  6732
3500  /  6732
3600  /  6732
3700  /  6732
3800  /  6732
3900  /  6732
4000  /  6732
4100  /  6732
4200  /  6732
4300  /  6732
4400  /  6732
4500  /  6732
4600  /  6732
4700  /  6732
4800  /  6732
4900  /  6732
5000  /  6732
5100  /  6732
5200  /  6732
5300  /  6732
5400  /  6732
5500  /  6732
5600  /  6732
5700  /  6732
5800  /  6732
5900  /  6732
6000  /  6732
6100  /  6732
6200  /  6732
6300  /  6732
6400  /  6732
6500  /  6732
6600  /  6732
6700  /  6732


In [21]:
def update_dataframe(meta, file_path_list,
                     save_path='../metadata_step1_normalized.pkl'):
    """Return a copy of ``meta`` whose 'file_path' column holds the new paths.

    The copy is also pickled to ``save_path``.

    Parameters
    ----------
    meta : pandas.DataFrame
        Metadata table with a 'file_path' column. It is not modified.
    file_path_list : sequence of str
        New paths, in the same row order as ``meta``. Each file name must
        start with the identifier of the matching old file followed by '_'
        (e.g. 'spectra/123.pkl' -> 'spectra_normalized/123_renorm.pkl').
    save_path : str, optional
        Where the updated table is pickled. Defaults to the location that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The updated copy of ``meta``.

    Raises
    ------
    SystemExit
        If the identifiers taken from the old and new file names differ in
        value, order or count. The exception type is unchanged from the
        original implementation.
    """

    def check_tdts(old_file_paths, new_file_paths):
        """Abort unless old and new file names carry the same identifiers."""
        old_list = [os.path.basename(x).split('.pkl')[0]
                    for x in old_file_paths]
        new_list = [os.path.basename(x).split('_')[0]
                    for x in new_file_paths]

        if old_list != new_list:
            raise SystemExit("TDTs don't match.")

    # Work on a copy so the caller's frame keeps its original paths.
    new_meta = meta.copy()

    # Isolate file_path from meta dataframe.
    old_file_paths = meta['file_path']
    new_file_paths = list(file_path_list)

    # Compare them by TDT as a sanity check before anything is written.
    check_tdts(old_file_paths, new_file_paths)

    # Update paths.
    new_meta['file_path'] = new_file_paths

    # Save to disk.
    new_meta.to_pickle(save_path)
    print('Saved: ', save_path)

    return new_meta

In [22]:
# Swap in the renormalized paths and write the updated metadata to disk.
new_meta = update_dataframe(meta, file_path_list)

Saved:  ../metadata_step1_normalized.pkl


In [23]:
# The original frame is untouched (update_dataframe works on a copy), so its
# paths still point at the un-normalized pickles.
meta.head()

Unnamed: 0,aorkey,object_name,ra,dec,flux_units,file_path
0,10019072,HD 99754,172.09622,-23.829099,Jy,spectra/10019072.pkl
1,10019328,HR 5467,219.56055,54.021404,Jy,spectra/10019328.pkl
2,10019584,HD 173511,280.41377,61.546739,Jy,spectra/10019584.pkl
3,10019840,alp Lac,337.82051,50.284642,Jy,spectra/10019840.pkl
4,10020096,HR 6348,255.3154,60.647838,Jy,spectra/10020096.pkl


In [24]:
# The returned frame carries the paths of the renormalized pickles.
new_meta.head()

Unnamed: 0,aorkey,object_name,ra,dec,flux_units,file_path
0,10019072,HD 99754,172.09622,-23.829099,Jy,spectra_normalized/10019072_renorm.pkl
1,10019328,HR 5467,219.56055,54.021404,Jy,spectra_normalized/10019328_renorm.pkl
2,10019584,HD 173511,280.41377,61.546739,Jy,spectra_normalized/10019584_renorm.pkl
3,10019840,alp Lac,337.82051,50.284642,Jy,spectra_normalized/10019840_renorm.pkl
4,10020096,HR 6348,255.3154,60.647838,Jy,spectra_normalized/10020096_renorm.pkl


## Reindex dataframe, save again

In [31]:
# Sort by aorkey (the identifier the original comment calls the TDT).
# assign() returns a new frame, so the integer cast is not written back into
# new_meta, and re-running this cell always starts from the same input.
df = (
    new_meta
    .assign(aorkey=new_meta['aorkey'].astype(int))
    .sort_values(by=['aorkey'], ascending=True)
    .reset_index(drop=True)
)

In [32]:
# Check the result: rows ascend by aorkey and the index restarts at 0.
df.head()

Unnamed: 0,aorkey,object_name,ra,dec,flux_units,file_path
0,3539200,HBC 356,60.808893,25.880773,Jy,spectra_normalized/3539200_renorm.pkl
1,3539456,LkCa 1,63.309814,28.317139,Jy,spectra_normalized/3539456_renorm.pkl
2,3539712,04108+2803A,63.472431,28.18739,Jy,spectra_normalized/3539712_renorm.pkl
3,3539968,MHO-3,63.627656,28.084989,Jy,spectra_normalized/3539968_renorm.pkl
4,3540224,Hubble 4,64.696616,28.333013,Jy,spectra_normalized/3540224_renorm.pkl


In [41]:
# Remove rows of objects not pickled (typically due to a data error).
# Paths in the table are tested with '../../' prepended.
bool_list = [os.path.isfile('../../' + path) for path in df['file_path']]

df = df.assign(data_ok=bool_list)
df = df.query('data_ok == True')

# Dropping rows leaves gaps in the index; renumber so the saved frame is
# contiguous again and positional counters line up with index labels.
df = df.reset_index(drop=True)

In [42]:
# The surviving rows now carry the data_ok flag column.
df.head()

Unnamed: 0,aorkey,object_name,ra,dec,flux_units,file_path,data_ok
0,3539200,HBC 356,60.808893,25.880773,Jy,spectra_normalized/3539200_renorm.pkl,True
1,3539456,LkCa 1,63.309814,28.317139,Jy,spectra_normalized/3539456_renorm.pkl,True
2,3539712,04108+2803A,63.472431,28.18739,Jy,spectra_normalized/3539712_renorm.pkl,True
3,3539968,MHO-3,63.627656,28.084989,Jy,spectra_normalized/3539968_renorm.pkl,True
4,3540224,Hubble 4,64.696616,28.333013,Jy,spectra_normalized/3540224_renorm.pkl,True


In [43]:
# Overwrite the pickle written earlier by update_dataframe() (same path) with
# the sorted and filtered table.
df.to_pickle('../metadata_step1_normalized.pkl')
print('Saved: ', '../metadata_step1_normalized.pkl')

Saved:  ../metadata_step1_normalized.pkl
