# M3: Lab to Targeted Conversion

In [None]:
import pandas as pd
from pathlib import Path
from collections import defaultdict

In [None]:
def findFolder(input_path, SEARCH_MSG):
    """Interactively browse folders starting at ``input_path``.

    At every level the sub-folders are listed with a numeric index and the
    user is prompted (via ``input``) until a valid choice is entered:

    * ``e`` - stop browsing without selecting anything
    * ``s`` - select the folder currently shown
    * ``u`` - move to the parent folder
    * ``#`` - descend into the listed sub-folder with that index

    Args:
        input_path: Starting folder (``str`` or ``Path``).
        SEARCH_MSG: Instruction text printed before each listing.

    Returns:
        The selected folder as a ``Path`` when the user saves. Otherwise one
        of the message strings ``"Invalid Path"``, ``"Exiting."`` or
        ``"Error reading choice, exiting."`` - callers should check the type
        before treating the result as a path.
    """
    current_path = Path(input_path)

    # Loop instead of recursing so long browsing sessions don't grow the stack.
    while True:
        if not current_path.exists():
            return "Invalid Path"

        print(f"Current path: {current_path}")
        print(SEARCH_MSG)

        # list out folders with [#] as identifier; sorted so numbering is stable
        folder_list = sorted(f for f in current_path.iterdir() if f.is_dir())
        print("> Folders in current folder:")
        if not folder_list:
            print("[#] No folders found.")
        for index, folder in enumerate(folder_list):
            print(f"[{index}] {folder.name}")
        print("")

        valid_choices = {"e", "s", "u"} | {str(i) for i in range(len(folder_list))}
        choice = input().lower()
        while choice not in valid_choices:
            choice = input("Couldn't read input, trying again. Target #: ").lower()

        if choice == "e":
            return "Exiting."
        if choice == "s":
            return current_path
        if choice == "u":
            # resolve() first: the parent of a relative path such as "." is
            # "." again, which would leave the user stuck at the same level.
            current_path = current_path.resolve().parent
            continue

        # iterdir() entries already include current_path, so use the entry
        # itself; joining it onto current_path again duplicates the prefix
        # for relative paths.
        selected = folder_list[int(choice)]
        if not selected.is_dir():
            return "Error reading choice, exiting."
        current_path = selected

In [None]:
# Instructions shown at every level while browsing for the input folder.
SEARCH_MSG_IN = (
    "> Construct the path to the folder containing the csv files to be converted.\n"
    "> Save the current path to end search.\n"
    "> To select option [#], enter #.\n"
    "[E] Exit without saving\n"
    "[S] Save current path\n"
    "[U] Search up one level"
)

target_folder = findFolder(Path.cwd(), SEARCH_MSG_IN) # '../input_files/lunar_analog_spectra/Pyroxene lab spectra FDL'

# findFolder returns a plain message string (not a Path) when the user exits
# or the path is invalid; stop here with that message instead of failing on
# .iterdir() with an unrelated AttributeError.
if not isinstance(target_folder, Path):
    raise ValueError(f"No input folder selected: {target_folder}")

# Sorted so that file_paths_list[0] refers to the same file on every run
# (iterdir() yields entries in arbitrary order).
file_paths_list = sorted(f for f in target_folder.iterdir() if f.is_file() and f.suffix == ".csv")
print(f"# of file paths: {len(file_paths_list)}")
# file_paths_list

In [None]:
# Bin width used when averaging lab points onto the target wavelengths:
# bin edges are computed as target wavelength -/+ STEPSIZE/2.
# assumed step size between m3 points
# 10 nm -> 0.01 microns (micrometers)
STEPSIZE = 0.01

# boundaries for wavelengths, outside of which to ignore/truncate data
# These literals are initial values only: both names are reassigned from the
# first and last target wavelength (-/+ STEPSIZE/2) after the m3 target file
# has been loaded.
WL_MIN = 0.446
WL_MAX = 2.99

# TODO (in order of priority)
# !save binned xy pairs and cubic spline xy pairs to CSVs
# convert to python script file to run from cli
# modify to read in a folder of csvs and a target to bin to (m3 in original case)
# use variable (0.02 vs 0.04) step size based on micron range
# rename + recolor graph legend
# save graphs into subfolders based on lunar_analog_spectra folders (Iceland_JB501C graph goes into /Glass_volcanics)
# prompt user whether to save graphs

# step size until 1.57 microns
# STEPSIZE_PRE = 0.02
# step size after 1.57 microns 
# STEPSIZE_POST = 0.04

In [None]:
# boundaries: 
# m^3
# 0.46 microns - 460 nanometers
# 3 microns  - 3000 nanometers
# trailblazer
# 0.6 microns - 600 nanometers
# 3.6 microns - 3600 nanometers

# bin methods
# 1:1

In [None]:
# Show the collected CSV paths as the cell output.
file_paths_list

In [None]:
# File holding the target wavelength grid the lab spectrum is binned onto.
# NOTE(review): this path uses '../input files' (with a space) while the
# example path in the folder-selection cell uses '../input_files' - confirm
# which directory name actually exists.
m3_path = Path('../input files/Clark m3 target wavelengths.csv').resolve()

# Only the first CSV of the selected folder is converted. Fail with a clear
# message instead of a bare IndexError when the folder held no CSV files.
if not file_paths_list:
    raise FileNotFoundError("No .csv files found in the selected input folder.")
lab_path = file_paths_list[0]

m3 = pd.read_csv(m3_path)
lab = pd.read_csv(lab_path)

In [None]:
# Preview the first rows of the lab data (the m3 preview is left disabled).
# m3.head()
lab.head()

In [None]:
# Summary statistics of the lab data as loaded from the CSV.
lab.describe()

In [None]:
# rename clark file to match
# (assigning a one-element list requires the target file to have exactly one column)
m3.columns = ['Wavelength (µm)']

# Target wavelengths are kept as fixed-precision strings because they are used
# as dictionary keys for the bins.
x_m3 = [f"{wl:.5f}" for wl in m3['Wavelength (µm)'].tolist()]
# [-10:] shows the true last 10 entries; a [-1-10:-1] slice would leave out
# the final wavelength.
print(f"# of m^3 wavelengths: {len(x_m3)}, first and last 10: \n{x_m3[0:10]}, \n{x_m3[-10:]}")

# Accept lab points from the left edge of the first bin to the right edge of
# the last bin.
WL_MIN = float(x_m3[0])-STEPSIZE/2
WL_MAX = float(x_m3[-1])+STEPSIZE/2
print(f"Updated WL_MIN={WL_MIN} and WL_MAX={WL_MAX}")

In [None]:
FIRST_COLUMN = 'Wavelength (µm)'

# Lab files that only provide nanometres get a micrometre column in its place.
if FIRST_COLUMN not in lab.columns:
    lab[FIRST_COLUMN] = lab['Wavelength (nm)'] / 1000
    lab = lab.drop(columns='Wavelength (nm)')

# Move the wavelength column to the front; the other columns keep their order.
remaining_columns = [name for name in lab.columns if name != FIRST_COLUMN]
lab = lab[[FIRST_COLUMN, *remaining_columns]]

# if 'Wavelength (µm)' not in lab.index.names:
#     lab = lab.set_index('Wavelength (µm)')

# Strip commas from the column labels, then discard rows with missing values
# in the lab csv file.
lab.columns = lab.columns.str.replace(",", "")
lab = lab.dropna(axis=0)
lab.head()

In [None]:
# Summary statistics again, now after the column clean-up and dropna above.
lab.describe()

In [None]:
# Column 0 is the wavelength axis; column 1 is the value series that gets binned.
y_column_label = lab.columns[1]
# x_raw rounding to 5 digits and recast to float (optional)
x_raw = [float(f"{wl:.5f}") for wl in lab[lab.columns[0]]]
y_raw = list(lab[y_column_label])

# [-5:] shows the true last 5 pairs; a [-1-5:-1] slice would leave out the
# final point.
print(f"5 [x],[y] raw pairs: \nfirst\n{x_raw[0:5]}, \n{y_raw[0:5]} \nlast\n{x_raw[-5:]}, \n{y_raw[-5:]}")

In [None]:
# One empty bin per target wavelength, keyed by the wavelength's numerical string
binned_raw = defaultdict(list, ((wavelength_key, []) for wavelength_key in x_m3))
len(binned_raw)

In [None]:
# append each raw point to the bin whose centre lies within half of stepsize,
# i.e. to [centre - STEPSIZE/2, centre + STEPSIZE/2]
# precondition: raw and target wavelengths are sorted in ascending order

half_step = STEPSIZE / 2
last_bin = len(x_m3) - 1
skipped_between_bins = 0

binCounter = 0
for x, y in zip(x_raw, y_raw):
    if x < WL_MIN or x > WL_MAX: continue

    # advance to the first bin whose right edge is not left of the point
    while binCounter < last_bin and x > float(x_m3[binCounter]) + half_step:
        binCounter += 1

    lbound = float(x_m3[binCounter]) - half_step
    rbound = float(x_m3[binCounter]) + half_step

    if x > rbound:
        # ran out of bins: report the point and leave it out rather than
        # adding it to the last bin
        print(f"Point {x, y} within WL MINMAX range {WL_MIN, WL_MAX} but no bin found, last {lbound, rbound}")
        continue
    if x < lbound:
        # the point sits in the gap before this bin, so it belongs to no bin
        skipped_between_bins += 1
        continue

    # add point to bin
    binned_raw[x_m3[binCounter]].append((x,y))

print(f"bin index: {binCounter}, input list size: {len(x_raw)}")
if skipped_between_bins:
    print(f"Skipped {skipped_between_bins} points that fell between bins")

In [None]:
# average all values in the same bin
x_avg = []
y_avg = []
for bin_key, points in binned_raw.items():
    if not points:
        # bins that received no lab points are left out of the output
        continue

    # take band center as WL coordinate to match to target 10nm
    x_avg.append(float(bin_key))

    # mean of the raw y values collected in this bin
    y_avg.append(sum(y for _, y in points) / len(points))

# Guarded so an all-empty result reports itself instead of raising IndexError.
if x_avg:
    print(f"# of average reflectance values {len(x_avg), len(y_avg)}, first and last averaged points {[(x_avg[0], y_avg[0]), (x_avg[-1], y_avg[-1])]}")
else:
    print("No lab points fell into any bin; x_avg and y_avg are empty.")

In [None]:
# Place the binned "target" columns next to the raw lab columns.
df_target = pd.DataFrame({"Wavelength (µm) target": x_avg, f"{lab.columns[1]} target": y_avg})
# concat(axis=1) aligns rows by index label. lab keeps its original labels
# after dropna (possibly with gaps), so renumber it first to line the two
# frames up by position.
df_target = pd.concat((lab.reset_index(drop=True), df_target), axis = 1)
df_target.head()

In [None]:
# # drop bins based on range
# # x = wl
# # y = reflectance
# x_avg_drop = x_avg
# y_avg_drop = y_avg
# for c, (xi,yi) in enumerate(zip(x_avg, y_avg)):
#     if (x )
#     if (c+1) % 2 == 0:
#         x_avg_drop.remove(xi)
#         y_avg_drop.remove(yi)


# print(f"# of average reflectance values {len(x_avg), len(y_avg)}, first and last averaged points {[(x_avg[0], y_avg[0]), (x_avg[-1], y_avg[-1])]}")

In [None]:
# troubleshoot for cubic spline failing
# because x_avg is not sorted ascending
# occurs when lab data is being skipped
# due to bad WL_MIN, WL_MAX boundaries

# highest = 0
# for i in range(len(x_avg)):
#     try:
#         if float(x_avg[i]) < highest:
#             print(i, x_avg[i], "<", highest, x_avg[i-1])
#     except Exception as e:
#         print(e)
#         print(i, x_avg[i], highest, "excepting")
#     highest = float(x_avg[i])

In [None]:
# from scipy.interpolate import CubicSpline

# spline_points = len(x_avg)
# cs_avg = CubicSpline(x_avg, y_avg)

# x_spline_avg = list(np.linspace(WL_MIN, WL_MAX, spline_points))
# x_spline_avg = list(np.linspace(x_avg[0], x_avg[-1], spline_points))
# y_spline_avg = list(cs_avg(x_spline_avg))

# # len(x_spline_avg), len(y_spline_avg)

In [None]:
# NOTE(review): `plot` and `sns` are not imported in the visible cells -
# presumably matplotlib.pyplot and seaborn; confirm they are imported elsewhere.
plot.rcParams['figure.figsize'] = (10,4)

# (x values, y values, legend label, colour, opacity) for each layer, drawn in order
scatter_layers = [
    (x_raw, y_raw, f"{len(x_raw)} point raw data", 'blue', 0.25),
    (x_avg, y_avg, f"{len(x_avg)} point averages of bin size {STEPSIZE}", "red", 0.5),
]
for xs, ys, legend_text, colour, opacity in scatter_layers:
    ax = sns.scatterplot(
        x = xs,
        y = ys,
        label = legend_text,
        color = colour,
        alpha = opacity
    )

# fade any line artists present on the axes
for artist in ax.lines:
    artist.set_alpha(0.3)

plot.title(f"{lab_path.stem} Comparison")
plot.xlabel('Wavelength (µm)')
plot.ylabel('Reflectance')

# restrict the x axis to the accepted wavelength range
plot.xlim(WL_MIN, WL_MAX)
plot.tight_layout()
# plot.savefig(f'../output_files/graphs/test/{lab_path.stem} Comparison.png', bbox_inches='tight', dpi = 1000, transparent = False)
plot.show()

In [None]:
# match lengths of x_raw, x_avg
# prevents data from getting cut off when saving
# since zip function in writeFile() terminates
# on shortest array running out
# The shortfall is computed once and clamped at 0, so this cannot loop forever
# if x_avg is ever already longer than x_raw.
count = max(0, len(x_raw) - len(x_avg))
x_avg.extend([''] * count)
y_avg.extend([''] * count)
if count > 0:
    print(f"Extended x,y avg {count} times to match x_raw")

In [None]:
def writeFile(path, mode): # x = new, w = overwrite
    """Write the raw and binned series side by side as a four-column CSV.

    Reads the notebook globals ``x_raw``, ``y_raw``, ``x_avg``, ``y_avg`` and
    ``y_column_label``. Rows are produced with ``zip``, so output stops at the
    shortest of the four lists.

    Args:
        path: Destination file.
        mode: ``open`` mode; ``'x'`` creates a new file and raises
            ``FileExistsError`` if it already exists, ``'w'`` overwrites.
    """
    # `with` closes the file even if a write fails; UTF-8 is pinned because the
    # header contains 'µ' and the platform default encoding may differ.
    with open(path, mode, encoding="utf-8") as output:
        output.write(f"Wavelength (µm) raw,{y_column_label} raw,Wavelength (µm) target,{y_column_label} target\n")
        output.writelines(
            f"{x1},{y1},{x2},{y2}\n"
            for x1, y1, x2, y2 in zip(x_raw, y_raw, x_avg, y_avg)
        )

In [None]:
# Show which lab file is being converted.
lab_path

In [None]:
# def convert_path(input_path, old, new):
#     if not isinstance(input_path, Path):
#         input_path = Path(input_path)
    
#     parts = list(input_path.parts)
#     for c, i in enumerate(parts):
#         if i == old:
#             parts[c] = new
#     return Path(*parts)

# output_path = convert_path(lab_path, 'input files', 'output files')
# print(f"lab_path: {lab_path}" + "\n" + f"output_path: {output_path}")
# output_path = convert_path(output_path, 'lunar_analog_spectra', 'csvs')
# print(f"lab_path: {lab_path}" + "\n" + f"output_path: {output_path}")

In [None]:
# Save the raw/target columns through writeFile(), asking before overwriting.
# output_path = file_paths_list[0].replace('/lunar_analog_spectra/', '/csv output/')
# NOTE(review): `output_path` is not assigned in any visible, uncommented cell
# (its only definitions are commented out), so the print below would raise
# NameError unless it is defined elsewhere - confirm.
print(f"path: {output_path}")
try:
    # mode 'x' fails with FileExistsError instead of clobbering an existing file
    writeFile(output_path, 'x')
    print('file saved')
except FileExistsError:
    print('file present, overwrite? (y/n)')
    ans = input()
    # 'e' is accepted here but, like 'n', results in no write
    while ans.lower() not in ['y', 'n', 'e']:
        ans = input()
    if ans.lower() == 'y': writeFile(output_path,'w')
except Exception as e:
    # any other failure is only printed; the cell continues without saving
    print(e)
    pass

In [None]:
# Menu text shown at every level while browsing for the destination folder.
SEARCH_MSG_OUT = "\n".join([
    "> Construct the path to the folder where the csv files will be saved.",
    "> Save the current path to end search.",
    "> To select option [#], enter #.",
    "[E] Exit without saving",
    "[S] Save current path",
    "[U] Search up one level",
])

# Browse from the working directory; the result is echoed as the cell output.
output_folder = findFolder(Path.cwd(), SEARCH_MSG_OUT)
output_folder

In [None]:
# Write df_target into the chosen output folder under the lab file's name,
# asking before an existing file is overwritten.
# for f in file_paths_list:
if not isinstance(output_folder, Path):
    # findFolder returns a message string (not a Path) when nothing was selected
    print(f"No output folder selected ({output_folder}); nothing saved.")
else:
    destination = output_folder / lab_path.name
    should_write = True
    if destination.exists():
        print("File path already exists, overwrite (Y/N)?")
        ans = input()
        while ans.lower() not in ["y", "n"]:
            ans = input()
        should_write = ans.lower() == "y"
    if should_write:
        df_target.to_csv(path_or_buf = destination, index = False, na_rep = "")