## Identifying and resolving conflicts in a batch of VI files

In [1]:
import os, sys, glob
import numpy as np

from astropy.io import fits
from astropy.table import Table, join, vstack
from astropy.io import fits

import pandas as pd
import fnmatch

import desispec.io

#import desispec
# library location will change ..
sys.path.append("/global/homes/r/rtojeiro/prospect/prospect/py")
from prospect import utils_specviewer,plotframes
import matplotlib.pyplot as plt 

In [2]:
# Tile directory and the tile(s) whose targets we want to be able to look up.
tiledir = '/global/cfs/cdirs/desi/spectro/redux/daily/tiles/'
tiles = ['68002']
# Petal identifiers '0' .. '9', as strings.
petals = [str(petal) for petal in range(10)]
# obs_db is passed to the spectra loader later on; the tiles argument is optional.
obs_db = utils_specviewer.make_targetdict(tiledir, petals=petals, tiles=tiles)

In [3]:
# Show at most 20 rows when a dataframe is displayed.
pd.options.display.max_rows = 20

In [4]:
# Directory that holds all the VI files to merge (edit to point at your own files).
VI_dir = f"{os.environ['HOME']}/SV/VI_files/SV0/LRG/"

In [5]:
# Every file in VI_dir whose name matches `pattern` will be read. Change the pattern as needed.
pattern = "desi*.csv"

all_files = os.listdir(VI_dir)
# Keep only the entries matching the pattern, in the order os.listdir returned them.
vi_files = fnmatch.filter(all_files, pattern)

vi_files

['desi-vi_SV0_LRG_tile68002_night20200315_2_KSD.csv',
 'desi-vi_SV0_LRG_tile68002_night20200315_8_KSD.csv',
 'desi-vi_SV0_LRG_tile68002_night20200315_3_KSD.csv',
 'desi-vi_SV0_LRG_tile68002_night20200315_5_KSD.csv',
 'desi-vi_SV0_LRG_tile68002_night20200315_6_KSD.csv',
 'desi-vi_SV0_LRG_tile68002_night20200315_4_KSD.csv',
 'desi-vi_SV0_LRG_tile68002_night20200315_2_RT.csv',
 'desi-vi_SV0_LRG_tile68002_night20200315_7_KSD.csv',
 'desi-vi_SV0_LRG_tile68002_night20200315_9_KSD.csv',
 'desi-vi_SV0_LRG_tile68002_night20200315_1_RT.csv',
 'desi-vi_SV0_LRG_tile68002_night20200315_1_KSD.csv']

In [6]:
# Read every VI file and stack them into a single dataframe.
# The files are separated by " , " (a comma padded with spaces); a multi-character
# separator is only supported by the python parsing engine.
vi_frames = []
for file_idx, vi_file in enumerate(vi_files):
    if file_idx > 0:
        # the first file is read silently, every following file is echoed as it is read
        print(vi_file)
    vi_frames.append(pd.read_csv(VI_dir + vi_file, delimiter = " , ", engine='python'))

# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on every
# iteration); concatenate all the frames once instead. ignore_index gives a fresh 0..N-1 index.
vi = pd.concat(vi_frames, ignore_index=True)

#make groups of visual inspections, grouped by unique objects, and state number of single and multiple VIs
vi_gp = vi.groupby(['TargetID'])
print('There are ' + str(len(vi)) + ' visual inspections of a total of ' + str(len(vi_gp)) + ' unique objects')

desi-vi_SV0_LRG_tile68002_night20200315_8_KSD.csv
desi-vi_SV0_LRG_tile68002_night20200315_3_KSD.csv
desi-vi_SV0_LRG_tile68002_night20200315_5_KSD.csv
desi-vi_SV0_LRG_tile68002_night20200315_6_KSD.csv
desi-vi_SV0_LRG_tile68002_night20200315_4_KSD.csv
desi-vi_SV0_LRG_tile68002_night20200315_2_RT.csv
desi-vi_SV0_LRG_tile68002_night20200315_7_KSD.csv
desi-vi_SV0_LRG_tile68002_night20200315_9_KSD.csv
desi-vi_SV0_LRG_tile68002_night20200315_1_RT.csv
desi-vi_SV0_LRG_tile68002_night20200315_1_KSD.csv
There are 549 visual inspections of a total of 450 unique objects


In [7]:
#vi is a pandas dataframe holding the stacked contents of all the VI files read above
#(the number of rows shown is capped by the display.max_rows option)
vi

Unnamed: 0,TargetID,ExpID,Spec version,Redrock version,Redrock spectype,Redrock z,VI scanner,VI class,VI issue,VI z,VI spectype,VI comment
0,35185929937225680,-1,0,0,GALAXY,0.5714,ksd,4,--,0.5714,GALAXY,--
1,35185929941418304,-1,0,0,GALAXY,0.6066,ksd,4,--,0.6066,GALAXY,--
2,35185929941418635,-1,0,0,GALAXY,0.8688,ksd,4,--,0.8688,GALAXY,--
3,35185929941418675,-1,0,0,GALAXY,0.3265,ksd,4,--,0.3265,GALAXY,--
4,35185929941419081,-1,0,0,GALAXY,1.1142,ksd,4,--,1.1142,GALAXY,--
5,35185929941419330,-1,0,0,GALAXY,1.0314,ksd,3,--,1.0314,GALAXY,--
6,35185929941419549,-1,0,0,GALAXY,0.5586,ksd,4,--,0.5586,GALAXY,--
7,35185929941419731,-1,0,0,GALAXY,1.0399,ksd,4,--,1.0399,GALAXY,--
8,35185929941419985,-1,0,0,GALAXY,0.5775,ksd,4,--,0.5775,GALAXY,--
9,35185929941420331,-1,0,0,GALAXY,0.7123,ksd,4,--,0.7123,GALAXY,--


In [8]:
# List the column names available in the merged table.
vi.columns

Index(['TargetID', 'ExpID', 'Spec version', 'Redrock version',
       'Redrock spectype', 'Redrock z', 'VI scanner', 'VI class', 'VI issue',
       'VI z', 'VI spectype', 'VI comment'],
      dtype='object')

In [9]:
# New column with the best redshift estimate for each VI: the VI redshift when the
# inspector provided one, otherwise the Redrock redshift ('--' marks "no VI redshift").
# A provided VI redshift always takes precedence over the Redrock value.
vi_z_provided = vi['VI z'] != '--'
vi['best redshift'] = vi['VI z'].where(vi_z_provided, vi['Redrock z']).astype(float)

In [10]:
#display the table again, now including the 'best redshift' column
vi

Unnamed: 0,TargetID,ExpID,Spec version,Redrock version,Redrock spectype,Redrock z,VI scanner,VI class,VI issue,VI z,VI spectype,VI comment,best redshift
0,35185929937225680,-1,0,0,GALAXY,0.5714,ksd,4,--,0.5714,GALAXY,--,0.5714
1,35185929941418304,-1,0,0,GALAXY,0.6066,ksd,4,--,0.6066,GALAXY,--,0.6066
2,35185929941418635,-1,0,0,GALAXY,0.8688,ksd,4,--,0.8688,GALAXY,--,0.8688
3,35185929941418675,-1,0,0,GALAXY,0.3265,ksd,4,--,0.3265,GALAXY,--,0.3265
4,35185929941419081,-1,0,0,GALAXY,1.1142,ksd,4,--,1.1142,GALAXY,--,1.1142
5,35185929941419330,-1,0,0,GALAXY,1.0314,ksd,3,--,1.0314,GALAXY,--,1.0314
6,35185929941419549,-1,0,0,GALAXY,0.5586,ksd,4,--,0.5586,GALAXY,--,0.5586
7,35185929941419731,-1,0,0,GALAXY,1.0399,ksd,4,--,1.0399,GALAXY,--,1.0399
8,35185929941419985,-1,0,0,GALAXY,0.5775,ksd,4,--,0.5775,GALAXY,--,0.5775
9,35185929941420331,-1,0,0,GALAXY,0.7123,ksd,4,--,0.7123,GALAXY,--,0.7123


In [11]:
# Per-target summary of the confidence flags, broadcast back onto every row of the target:
#   vi_combined_flag - mean of 'VI class' over all inspections of the target
#   vi_diff          - largest minus smallest 'VI class' for the target
class_by_target = vi.groupby('TargetID')['VI class']
vi['vi_combined_flag'] = class_by_target.transform('mean')
vi['vi_diff'] = class_by_target.transform('max') - class_by_target.transform('min')

In [12]:
# Per-target summary of the redshifts, broadcast back onto every row of the target:
#   vi_combined_z - mean of 'best redshift' over all inspections of the target
#   dz            - (max z - min z) / (1 + min z) for the target
z_by_target = vi.groupby('TargetID')['best redshift']
z_lowest = z_by_target.transform('min')
z_highest = z_by_target.transform('max')
vi['vi_combined_z'] = z_by_target.transform('mean')
vi['dz'] = (z_highest - z_lowest) / (1 + z_lowest)

Get a table that holds only the objects that have been inspected more than once, and for which the individual VI classifications differ by 2 or more, or delta z / (1 + z) > 0.0033 (these are the conflicts to resolve)

In [13]:
# Group the table as it is NOW. The vi_gp created when the files were read predates the
# 'best redshift' column, so reusing it only works as long as the GroupBy object keeps a
# live reference to vi; regrouping removes that hidden dependency.
vi_gp = vi.groupby(['TargetID'])

# A target is a conflict when it was inspected at least twice AND either
#   - its 'VI class' values differ by 2 or more, or
#   - its redshift spread (max z - min z) / (1 + min z) exceeds 0.0033.
vi_conflict = vi_gp.filter(
    lambda x: (len(x) >= 2)  # x is the sub-table for one TargetID
    and bool(
        ((x['VI class'].max() - x['VI class'].min()) >= 2)
        | ((x['best redshift'].max() - x['best redshift'].min()) / (1 + x['best redshift'].min()) > 0.0033)
    )
)

Get the target IDs of the problematic objects and display in table form for a quick summary:

In [14]:
# Sorted, de-duplicated TargetIDs of all the rows flagged as conflicting.
conflict_target_ids = vi_conflict['TargetID'].tolist()
unique_targets = np.unique(conflict_target_ids)
n_conflicts = len(unique_targets)

print('Targets with problematic VI: ', unique_targets)
print('Total number of conflicts to resolve: ', n_conflicts)

Targets with problematic VI:  [35185923939371905 35185923939373860 35185923943564221 35185923943564344
 35185923951955494 35185929933031946 35185929937224636]
Total number of conflicts to resolve:  7


In [15]:
# One table per conflicting target, listing every inspection of that target.
for target_id in unique_targets:
    display(vi[vi.TargetID == target_id])

Unnamed: 0,TargetID,ExpID,Spec version,Redrock version,Redrock spectype,Redrock z,VI scanner,VI class,VI issue,VI z,VI spectype,VI comment,best redshift,vi_combined_flag,vi_diff,vi_combined_z,dz
453,35185923939371905,-1,0.33.0.dev3890,0,GALAXY,1.0205,RT,2,S,0.5048,--,"fit to HK"","" possible issue with continuum in...",0.5048,2.5,1,0.76265,0.342703
502,35185923939371905,-1,0,0,GALAXY,1.0205,ksd,3,--,1.0205,GALAXY,--,1.0205,2.5,1,0.76265,0.342703


Unnamed: 0,TargetID,ExpID,Spec version,Redrock version,Redrock spectype,Redrock z,VI scanner,VI class,VI issue,VI z,VI spectype,VI comment,best redshift,vi_combined_flag,vi_diff,vi_combined_z,dz
456,35185923939373860,-1,0.33.0.dev3890,0,GALAXY,1.6997,RT,1,RC,1.216,--,QSO?,1.216,2.0,2,1.214,0.001808
505,35185923939373860,-1,0,0,GALAXY,1.6997,ksd,3,RC,1.212,QSO,--,1.212,2.0,2,1.214,0.001808


Unnamed: 0,TargetID,ExpID,Spec version,Redrock version,Redrock spectype,Redrock z,VI scanner,VI class,VI issue,VI z,VI spectype,VI comment,best redshift,vi_combined_flag,vi_diff,vi_combined_z,dz
460,35185923943564221,-1,0.33.0.dev3890,0,GALAXY,0.8969,RT,2,--,--,--,--,0.8969,3.0,2,0.8969,0.0
509,35185923943564221,-1,0,0,GALAXY,0.8969,ksd,4,--,0.8969,GALAXY,--,0.8969,3.0,2,0.8969,0.0


Unnamed: 0,TargetID,ExpID,Spec version,Redrock version,Redrock spectype,Redrock z,VI scanner,VI class,VI issue,VI z,VI spectype,VI comment,best redshift,vi_combined_flag,vi_diff,vi_combined_z,dz
461,35185923943564344,-1,0.33.0.dev3890,0,QSO,0.802,RT,1,R,--,--,--,0.802,1.5,1,1.111,0.342952
510,35185923943564344,-1,0,0,QSO,0.802,ksd,2,--,1.42,GALAXY,--,1.42,1.5,1,1.111,0.342952


Unnamed: 0,TargetID,ExpID,Spec version,Redrock version,Redrock spectype,Redrock z,VI scanner,VI class,VI issue,VI z,VI spectype,VI comment,best redshift,vi_combined_flag,vi_diff,vi_combined_z,dz
484,35185923951955494,-1,0.33.0.dev3890,0,GALAXY,0.8556,RT,2,--,--,--,--,0.8556,3.0,2,0.8556,0.0
534,35185923951955494,-1,0,0,GALAXY,0.8556,ksd,4,--,0.8556,GALAXY,--,0.8556,3.0,2,0.8556,0.0


Unnamed: 0,TargetID,ExpID,Spec version,Redrock version,Redrock spectype,Redrock z,VI scanner,VI class,VI issue,VI z,VI spectype,VI comment,best redshift,vi_combined_flag,vi_diff,vi_combined_z,dz
29,35185929933031946,-1,0,0,GALAXY,0.7492,ksd,4,--,0.7492,GALAXY,--,0.7492,3.0,2,0.7492,0.0
329,35185929933031946,-1,0.33.0.dev3890,0,GALAXY,0.7492,RT,2,--,--,--,--,0.7492,3.0,2,0.7492,0.0


Unnamed: 0,TargetID,ExpID,Spec version,Redrock version,Redrock spectype,Redrock z,VI scanner,VI class,VI issue,VI z,VI spectype,VI comment,best redshift,vi_combined_flag,vi_diff,vi_combined_z,dz
41,35185929937224636,-1,0,0,QSO,1.0286,ksd,1,--,0.0086,STAR,"lots of signal"","" but I can't identify object",0.0086,1.5,1,0.5186,1.011303
341,35185929937224636,-1,0.33.0.dev3890,0,QSO,1.0286,RT,2,C,--,GALAXY,--,1.0286,1.5,1,0.5186,1.011303


## This is where I resolve things manually - with care!!
### I think it's better to do this in a notebook than by manually editing a text file: every change is recorded, so typos can be traced back

We edit either 'VI class', or 'best redshift' to resolve conflict. At the end, we look for conflicts again and we should find none.


In [16]:
#function to display the conflict in table format and open a prospect window
def display_conflict(conflict_id):
    """Show one conflicting target as a table and plot its spectra with prospect.

    Parameters
    ----------
    conflict_id : int
        Position of the target in the notebook-level ``unique_targets`` array.

    Notes
    -----
    Reads the notebook globals ``vi``, ``unique_targets``, ``tiledir`` and ``obs_db``.
    Nothing is returned; the table and the plot are shown as cell output.
    """
    target_id = unique_targets[conflict_id]

    # first, a reminder of the problem: every inspection recorded for this target
    display(vi[vi.TargetID == target_id])

    # load the spectra and redshift catalogue for this single target
    spectra, zcat = utils_specviewer.load_spectra_zcat_from_targets([target_id], tiledir, obs_db)

    # plot inside the notebook, with the VI widgets switched off
    plotframes.plotspectra(
        spectra,
        zcatalog=zcat,
        title='Target_select',
        notebook=True,
        mask_type='CMX_TARGET',
        with_vi_widgets=False,
    )

In [17]:
# Before any manual edit, keep an independent (deep) copy of the original dataframe.
vi_safe = vi.copy(deep=True)

We will inspect each conflict in a prospect window, and resolve each conflict in turn

In [18]:
#Keep track of the conflicts by conflict_id (the position of the target in unique_targets)
conflict_id=0
#show the inspections of this target and plot its spectra
display_conflict(conflict_id)

Unnamed: 0,TargetID,ExpID,Spec version,Redrock version,Redrock spectype,Redrock z,VI scanner,VI class,VI issue,VI z,VI spectype,VI comment,best redshift,vi_combined_flag,vi_diff,vi_combined_z,dz
453,35185923939371905,-1,0.33.0.dev3890,0,GALAXY,1.0205,RT,2,S,0.5048,--,"fit to HK"","" possible issue with continuum in...",0.5048,2.5,1,0.76265,0.342703
502,35185923939371905,-1,0,0,GALAXY,1.0205,ksd,3,--,1.0205,GALAXY,--,1.0205,2.5,1,0.76265,0.342703


In [19]:
# Resolve the conflict by changing 'best redshift' and/or 'VI class' on every inspection of this target.
# In this case the two inspectors disagree on the correct redshift: take the original redshift,
# but lower the confidence flag.
is_this_target = vi.TargetID == unique_targets[conflict_id]
vi.loc[is_this_target, 'best redshift'] = 1.0205
vi.loc[is_this_target, 'VI class'] = 2

# look at the values again to make sure all is well
display(vi[is_this_target])


Unnamed: 0,TargetID,ExpID,Spec version,Redrock version,Redrock spectype,Redrock z,VI scanner,VI class,VI issue,VI z,VI spectype,VI comment,best redshift,vi_combined_flag,vi_diff,vi_combined_z,dz
453,35185923939371905,-1,0.33.0.dev3890,0,GALAXY,1.0205,RT,2,S,0.5048,--,"fit to HK"","" possible issue with continuum in...",1.0205,2.5,1,0.76265,0.342703
502,35185923939371905,-1,0,0,GALAXY,1.0205,ksd,2,--,1.0205,GALAXY,--,1.0205,2.5,1,0.76265,0.342703


In [20]:
#next one! (conflict_id is the position of the target in unique_targets)
conflict_id=1
#show the inspections of this target and plot its spectra
display_conflict(conflict_id)

Unnamed: 0,TargetID,ExpID,Spec version,Redrock version,Redrock spectype,Redrock z,VI scanner,VI class,VI issue,VI z,VI spectype,VI comment,best redshift,vi_combined_flag,vi_diff,vi_combined_z,dz
456,35185923939373860,-1,0.33.0.dev3890,0,GALAXY,1.6997,RT,1,RC,1.216,--,QSO?,1.216,2.0,2,1.214,0.001808
505,35185923939373860,-1,0,0,GALAXY,1.6997,ksd,3,RC,1.212,QSO,--,1.212,2.0,2,1.214,0.001808


In [21]:
# In this case the two inspectors agree that the pipeline is wrong and identify the same
# alternative redshift. The disagreement is in the flag.
# I happen to know one of the inspectors is very wobbly on QSO stuff ( ;) ), so I will take the confidence flag of the expert.
is_this_target = vi.TargetID == unique_targets[conflict_id]
vi.loc[is_this_target, 'VI class'] = 3

# look at the values again to make sure all is well
display(vi[is_this_target])

Unnamed: 0,TargetID,ExpID,Spec version,Redrock version,Redrock spectype,Redrock z,VI scanner,VI class,VI issue,VI z,VI spectype,VI comment,best redshift,vi_combined_flag,vi_diff,vi_combined_z,dz
456,35185923939373860,-1,0.33.0.dev3890,0,GALAXY,1.6997,RT,3,RC,1.216,--,QSO?,1.216,2.0,2,1.214,0.001808
505,35185923939373860,-1,0,0,GALAXY,1.6997,ksd,3,RC,1.212,QSO,--,1.212,2.0,2,1.214,0.001808


### and so on...

We should now recompute the conflicts, and not find any (except I didn't resolve everything! but the number of conflicts should now be 5, not 7)

In [22]:
# Search for conflicts again on the edited table, using exactly the same criteria as the
# first search: inspected at least twice AND ('VI class' spread >= 2 OR redshift spread > 0.0033).
# The redshift spread is normalised by (1 + min z), matching the 'dz' column and the first
# search; normalising by (1 + mean z) here would apply a slightly different threshold.
vi_gp = vi.groupby(['TargetID'])
vi_conflict = vi_gp.filter(lambda x: ( ( (x['VI class'].max()-x['VI class'].min()) >= 2) 
                       | ( (x['best redshift'].max() - x['best redshift'].min()) / (1+x['best redshift'].min()) > 0.0033 ) )
                       & (len(x) >= 2)) #x is a group by TargetID

In [23]:
# Sorted, de-duplicated TargetIDs of the conflicts that remain.
# Note: this overwrites unique_targets, so conflict_id positions used earlier now refer to this new list.
remaining_target_ids = vi_conflict['TargetID'].tolist()
unique_targets = np.unique(remaining_target_ids)
n_conflicts = len(unique_targets)

print('Targets with problematic VI: ', unique_targets)
print('Total number of conflicts to resolve: ', n_conflicts)

Targets with problematic VI:  [35185923943564221 35185923943564344 35185923951955494 35185929933031946
 35185929937224636]
Total number of conflicts to resolve:  5


And we need to recompute vi_combined_flag and vi_combined_z, after everything is resolved.

In [24]:
# Refresh the per-target means now that 'VI class' and 'best redshift' have been edited.
by_target = vi.groupby('TargetID')
vi['vi_combined_flag'] = by_target['VI class'].transform('mean')
vi['vi_combined_z'] = by_target['best redshift'].transform('mean')


## Now we prepare to write to file. 

### The important columns for the truth table construction are **vi_combined_flag** and **vi_combined_z** 

The truth table should **take the redshift value in vi_combined_z** for all objects with **vi_combined_flag >= 2.5** (the threshold is up for discussion, but this value catches the cases where there is some disagreement on the 2/3 boundary).

After the merging process, those two columns will be the same for ALL of the members of each group by TargetID, so it doesn't matter which member we write to file. Taking the 1st one just makes this easy. 


In [25]:
# Write one row per TargetID (the first member of each group) to merged_VI.txt in VI_dir.
# - The columns are selected with a list: selecting GroupBy columns with a bare tuple of
#   names was deprecated and raises in pandas 2.0.
# - vi_combined_z is written as well, since it is the redshift the truth table should use;
#   'best redshift' is kept so the existing columns of the file are unchanged.
# - The table is regrouped here so the output reflects the recomputed combined columns
#   without depending on a GroupBy object created before they were updated.
merged_columns = ['Redrock spectype', 'best redshift', 'vi_combined_flag', 'vi_combined_z']
vi.groupby('TargetID')[merged_columns].first().to_csv(VI_dir+'merged_VI.txt')