## JSON to CSV

SHB: read all the data and generate one CSV for training and one for testing.


In [None]:
import os, json
import pandas as pd
from libs.edfa_feature_extraction_libs import featureExtraction_ML

# Folders holding the raw SHB measurement JSON files, one per split.
shb_train_dir, shb_test_dir = (
    f'../dataset/ML_challenge_user/{split}/shb' for split in ('Train', 'Test')
)

# Destination CSV for the extracted features of each split.
shb_output_csv_train, shb_output_csv_test = (
    f'../Features/{split}/shb/preamp_random_features_{split.lower()}.csv'
    for split in ('Train', 'Test')
)

# ---- HARDCODED MAPPING ----
# Each EDFA name is mapped to its position in this fixed list.
edfa_name_to_index = {
    edfa: position
    for position, edfa in enumerate((
        'rdm1-co1', 'rdm2-co1', 'rdm3-co1', 'rdm4-co1',
        'rdm5-co1', 'rdm6-co1', 'rdm1-lg1', 'rdm2-lg1',
    ))
}
print("EDFA name to index mapping (hardcoded):")
for name, idx in edfa_name_to_index.items():
    print("  " + name + ": " + str(idx))

def extract_features_from_dir(data_dir, extractionType, channelType, featureType):
    """Build one feature table from every measurement JSON file in a folder.

    Files are visited in sorted name order. Entries that do not end in
    ``.json`` are ignored; files whose ``measurement_data`` entry is missing
    or empty are reported and skipped.

    Args:
        data_dir: Folder containing the measurement ``.json`` files.
        extractionType: Forwarded to ``featureExtraction_ML`` and stored in
            the ``EDFA_type`` column of every row.
        channelType: Forwarded to ``featureExtraction_ML``.
        featureType: Forwarded to ``featureExtraction_ML``.

    Returns:
        A DataFrame with the per-file feature tables stacked (index reset),
        with ``edfa_index`` and ``EDFA_type`` as the first two columns.
        An empty DataFrame when no file produced features.
    """
    leading_cols = ['edfa_index', 'EDFA_type']
    # Collect the per-file tables and concatenate once at the end; growing a
    # DataFrame inside the loop re-copies all accumulated rows on every file.
    per_file_frames = []
    for fname in sorted(os.listdir(data_dir)):
        if not fname.endswith('.json'):
            continue
        file_path = os.path.join(data_dir, fname)

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if 'measurement_data' not in data or not data['measurement_data']:
            print(f"Skipping {fname}: no measurement_data")
            continue

        # The EDFA name is the token that follows 'preamp_' / 'booster_' in
        # the file name; otherwise fall back to the second '_'-separated token.
        if 'preamp_' in fname:
            edfa_name = fname.split('preamp_')[1].split('_')[0]
        elif 'booster_' in fname:
            edfa_name = fname.split('booster_')[1].split('_')[0]
        else:
            edfa_name = fname.split('_')[1] if len(fname.split('_')) > 1 else fname

        # A file without a 'measurement_setup' section falls back to 95
        # channels instead of raising KeyError.
        setup = data.get('measurement_setup') or {}
        # featureExtraction_ML is assumed to return a DataFrame (it is used
        # as one below) -- TODO confirm against the library.
        features = featureExtraction_ML(
            data['measurement_data'],
            extractionType=extractionType,
            channelType=channelType,
            featureType=featureType,
            channelNum=setup.get('roadm_wss_num_channel', 95)
        )
        edfa_index = edfa_name_to_index.get(edfa_name, -1)  # -1 if not found
        features['edfa_index'] = edfa_index
        features['EDFA_type'] = extractionType

        # Put edfa_index and EDFA_type first, keeping the other columns in order.
        remaining_cols = [col for col in features.columns if col not in leading_cols]
        features = features[leading_cols + remaining_cols]
        per_file_frames.append(features)
        print(f"Processed {fname}, edfa_name: {edfa_name}, edfa_index: {edfa_index}, rows: {len(features)}")

    if not per_file_frames:
        return pd.DataFrame()
    return pd.concat(per_file_frames, ignore_index=True)

# Extract the SHB features for the train and test splits.
shb_train_features = extract_features_from_dir(
    shb_train_dir, extractionType='preamp', channelType='random', featureType='train'
)
shb_test_features = extract_features_from_dir(
    shb_test_dir, extractionType='preamp', channelType='random', featureType='test'
)

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output folders exist before writing.
os.makedirs(os.path.dirname(shb_output_csv_train), exist_ok=True)
os.makedirs(os.path.dirname(shb_output_csv_test), exist_ok=True)

shb_train_features.to_csv(shb_output_csv_train, index=False)
shb_test_features.to_csv(shb_output_csv_test, index=False)

Aging effect

In [None]:
import os, json
import pandas as pd
from libs.edfa_feature_extraction_libs import featureExtraction_ML

# Folders holding the raw aging measurement JSON files, one per split.
aging_train_dir, aging_test_dir = (
    '../dataset/ML_challenge_user/{}/aging'.format(split)
    for split in ('Train', 'Test')
)

# Destination CSV for the extracted features of each split.
aging_output_csv_train, aging_output_csv_test = (
    '../Features/{}/aging/preamp_random_features_{}.csv'.format(split, split.lower())
    for split in ('Train', 'Test')
)

# ---- HARDCODED MAPPING ----
# EDFA names in index order; zip pairs each name with its 0-based index.
_aging_edfa_names = (
    'rdm1-co1', 'rdm2-co1', 'rdm3-co1', 'rdm4-co1',
    'rdm5-co1', 'rdm6-co1', 'rdm1-lg1', 'rdm2-lg1',
)
edfa_name_to_index = dict(zip(_aging_edfa_names, range(len(_aging_edfa_names))))
print("EDFA name to index mapping (hardcoded):")
for name, idx in edfa_name_to_index.items():
    print("  {}: {}".format(name, idx))

def extract_features_from_dir(data_dir, featureType):
    """Build one feature table from every measurement JSON file in a folder.

    The extraction type, channel type and EDFA name are inferred from each
    file name. Files are visited in sorted name order; a file is reported and
    skipped when its ``measurement_data`` is missing or empty, or when the
    extraction type or channel type cannot be inferred from its name.

    Args:
        data_dir: Folder containing the measurement ``.json`` files.
        featureType: Forwarded to ``featureExtraction_ML``.

    Returns:
        A DataFrame with the per-file feature tables stacked (index reset),
        with ``edfa_index`` and ``EDFA_type`` as the first two columns.
        An empty DataFrame when no file produced features.
    """
    # (file-name marker, channelType) pairs, checked in order; the first
    # marker found in the file name wins.
    channel_markers = (
        ('_fix_', 'fix'),
        ('_goalpost_', 'fix'),
        ('_random_', 'random'),
        ('_extraRandom_', 'extraRandom'),
        ('_extraLow_', 'extraLow'),
    )
    leading_cols = ['edfa_index', 'EDFA_type']
    # Collect the per-file tables and concatenate once at the end; growing a
    # DataFrame inside the loop re-copies all accumulated rows on every file.
    per_file_frames = []
    for fname in sorted(os.listdir(data_dir)):
        if not fname.endswith('.json'):
            continue
        file_path = os.path.join(data_dir, fname)
        print(file_path)
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if 'measurement_data' not in data or not data['measurement_data']:
            print(f"Skipping {fname}: no measurement_data")
            continue

        # Infer extractionType
        if 'preamp' in fname:
            extractionType_local = 'preamp'
        elif 'booster' in fname:
            extractionType_local = 'booster'
        else:
            print(f"Skipping {fname}: cannot determine extractionType")
            continue

        # Infer channelType
        channelType_local = next(
            (ctype for marker, ctype in channel_markers if marker in fname), None
        )
        if channelType_local is None:
            print(f"Skipping {fname}: cannot determine channelType")
            continue

        # Infer edfa_name: the token that follows 'preamp_' / 'booster_',
        # otherwise the second '_'-separated token of the file name.
        if 'preamp_' in fname:
            edfa_name = fname.split('preamp_')[1].split('_')[0]
        elif 'booster_' in fname:
            edfa_name = fname.split('booster_')[1].split('_')[0]
        else:
            edfa_name = fname.split('_')[1] if len(fname.split('_')) > 1 else fname

        # channelNum is not passed here, so featureExtraction_ML uses its own
        # default for it.
        # featureExtraction_ML is assumed to return a DataFrame (it is used
        # as one below) -- TODO confirm against the library.
        features = featureExtraction_ML(
            data['measurement_data'],
            extractionType=extractionType_local,
            channelType=channelType_local,
            featureType=featureType,
        )
        edfa_index = edfa_name_to_index.get(edfa_name, -1)  # -1 if not found
        features['edfa_index'] = edfa_index
        features['EDFA_type'] = extractionType_local

        # Put edfa_index and EDFA_type first, keeping the other columns in order.
        remaining_cols = [col for col in features.columns if col not in leading_cols]
        features = features[leading_cols + remaining_cols]
        per_file_frames.append(features)
        print(f"Processed {fname}, extractionType: {extractionType_local}, channelType: {channelType_local}, edfa_name: {edfa_name}, edfa_index: {edfa_index}, rows: {len(features)}")

    if not per_file_frames:
        return pd.DataFrame()
    return pd.concat(per_file_frames, ignore_index=True)

# Extract the aging features for the train and test splits.
aging_train_features = extract_features_from_dir(
    aging_train_dir, featureType='train'
)
aging_test_features = extract_features_from_dir(
    aging_test_dir, featureType='test'
)

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output folders exist before writing.
os.makedirs(os.path.dirname(aging_output_csv_train), exist_ok=True)
os.makedirs(os.path.dirname(aging_output_csv_test), exist_ok=True)

aging_train_features.to_csv(aging_output_csv_train, index=False)
aging_test_features.to_csv(aging_output_csv_test, index=False)

Unseen gain and tilt: feature extraction

In [None]:
import os, json
import pandas as pd
from libs.edfa_feature_extraction_libs import featureExtraction_ML

# Folders holding the raw "unseen" measurement JSON files, one per split.
_unseen_dataset_root = '../dataset/ML_challenge_user'
unseen_train_dir = _unseen_dataset_root + '/Train/unseen'
unseen_test_dir = _unseen_dataset_root + '/Test/unseen'

# Destination CSVs; the train and test files use different name stems.
unseen_output_csv_train = '../Features/Train/unseen/' + 'booster_fix_features_train.csv'
unseen_output_csv_test = '../Features/Test/unseen/' + 'booster_goalpost_random_features_test.csv'

# ---- HARDCODED MAPPING ----
# The six 'co1' units take indices 0-5, followed by the two 'lg1' units (6-7).
edfa_name_to_index = {}
for _unit in [f'rdm{n}-co1' for n in range(1, 7)] + ['rdm1-lg1', 'rdm2-lg1']:
    edfa_name_to_index[_unit] = len(edfa_name_to_index)
print("EDFA name to index mapping (hardcoded):")
for name, idx in edfa_name_to_index.items():
    print("  %s: %s" % (name, idx))

def extract_features_from_dir(data_dir, featureType):
    """Build one feature table from every measurement JSON file in a folder.

    The extraction type, channel type and EDFA name are inferred from each
    file name. Files are visited in sorted name order; a file is reported and
    skipped when its ``measurement_data`` is missing or empty, or when the
    extraction type or channel type cannot be inferred from its name.

    Args:
        data_dir: Folder containing the measurement ``.json`` files.
        featureType: Forwarded to ``featureExtraction_ML``.

    Returns:
        A DataFrame with the per-file feature tables stacked (index reset),
        with ``edfa_index`` and ``EDFA_type`` as the first two columns.
        An empty DataFrame when no file produced features.
    """
    # (file-name marker, channelType) pairs, checked in order; the first
    # marker found in the file name wins.
    channel_markers = (
        ('_fix_', 'fix'),
        ('_goalpost_', 'fix'),
        ('_random_', 'random'),
        ('_extraRandom_', 'extraRandom'),
        ('_extraLow_', 'extraLow'),
    )
    leading_cols = ['edfa_index', 'EDFA_type']
    # Collect the per-file tables and concatenate once at the end; growing a
    # DataFrame inside the loop re-copies all accumulated rows on every file.
    per_file_frames = []
    for fname in sorted(os.listdir(data_dir)):
        if not fname.endswith('.json'):
            continue
        file_path = os.path.join(data_dir, fname)
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if 'measurement_data' not in data or not data['measurement_data']:
            print(f"Skipping {fname}: no measurement_data")
            continue

        # Infer extractionType
        if 'preamp' in fname:
            extractionType_local = 'preamp'
        elif 'booster' in fname:
            extractionType_local = 'booster'
        else:
            print(f"Skipping {fname}: cannot determine extractionType")
            continue

        # Infer channelType
        channelType_local = next(
            (ctype for marker, ctype in channel_markers if marker in fname), None
        )
        if channelType_local is None:
            print(f"Skipping {fname}: cannot determine channelType")
            continue

        # Infer edfa_name: the token that follows 'preamp_' / 'booster_',
        # otherwise the second '_'-separated token of the file name.
        if 'preamp_' in fname:
            edfa_name = fname.split('preamp_')[1].split('_')[0]
        elif 'booster_' in fname:
            edfa_name = fname.split('booster_')[1].split('_')[0]
        else:
            edfa_name = fname.split('_')[1] if len(fname.split('_')) > 1 else fname

        # A file without a 'measurement_setup' section falls back to 95
        # channels instead of raising KeyError.
        setup = data.get('measurement_setup') or {}
        # featureExtraction_ML is assumed to return a DataFrame (it is used
        # as one below) -- TODO confirm against the library.
        features = featureExtraction_ML(
            data['measurement_data'],
            extractionType=extractionType_local,
            channelType=channelType_local,
            featureType=featureType,
            channelNum=setup.get('roadm_wss_num_channel', 95)
        )
        edfa_index = edfa_name_to_index.get(edfa_name, -1)  # -1 if not found
        features['edfa_index'] = edfa_index
        features['EDFA_type'] = extractionType_local

        # Put edfa_index and EDFA_type first, keeping the other columns in order.
        remaining_cols = [col for col in features.columns if col not in leading_cols]
        features = features[leading_cols + remaining_cols]
        per_file_frames.append(features)
        print(f"Processed {fname}, extractionType: {extractionType_local}, channelType: {channelType_local}, edfa_name: {edfa_name}, edfa_index: {edfa_index}, rows: {len(features)}")

    if not per_file_frames:
        return pd.DataFrame()
    return pd.concat(per_file_frames, ignore_index=True)

# Extract the unseen-gain/tilt features for the train and test splits.
unseen_train_features = extract_features_from_dir(
    unseen_train_dir, featureType='train'
)
unseen_test_features = extract_features_from_dir(
    unseen_test_dir, featureType='test'
)

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output folders exist before writing.
os.makedirs(os.path.dirname(unseen_output_csv_train), exist_ok=True)
os.makedirs(os.path.dirname(unseen_output_csv_test), exist_ok=True)

unseen_train_features.to_csv(unseen_output_csv_train, index=False)
unseen_test_features.to_csv(unseen_output_csv_test, index=False)

COSMOS feature extraction

In [None]:
import os, json,fnmatch
import pandas as pd
from libs.edfa_feature_extraction_libs import featureExtraction_ML

# Root of the COSMOS dataset (trailing slash kept: paths are built by string
# concatenation) and the CSV that receives the combined features.
data_prepath = '../dataset/COSMOS_EDFA_Dataset/'
output_csv_aging_train = '../Features/Train/COSMOS/COSMOS_features.csv'

# ---- HARDCODED MAPPING ----
# Each EDFA name is mapped to its position in this fixed list.
edfa_name_to_index = {
    edfa: position
    for position, edfa in enumerate([
        'rdm1-co1', 'rdm2-co1', 'rdm3-co1', 'rdm4-co1',
        'rdm5-co1', 'rdm6-co1', 'rdm1-lg1', 'rdm2-lg1',
    ])
}
print("EDFA name to index mapping (hardcoded):")
for name, idx in edfa_name_to_index.items():
    print("  " + name + ": " + str(idx))

# The EDFA names searched for in file names are exactly the mapping's keys,
# in the same order.
fileList = list(edfa_name_to_index)
# Channel-type sub-folders, EDFA-type folders and gain folders of the dataset.
folderList = 'fix random extraRandom extraLow'.split()
edfaTypes = ["booster", "preamp"]
gainList = [f"{gain_db}dB" for gain_db in (15, 18, 21)]

def matchFile(pattern, foler):
    """Return the full path of the first file in a folder matching a pattern.

    Example:
        result = matchFile('*rdm1-co1*.json', '.../benchmark/extraRandom/')

    Args:
        pattern: ``fnmatch``-style pattern tested against each file name.
        foler: Folder to search (not recursive).

    Returns:
        ``os.path.join(foler, name)`` for the first matching name in sorted
        order, or ``None`` when nothing matches.
    """
    # os.listdir returns names in arbitrary order; sorting makes the choice
    # deterministic when several files match the pattern.
    for file_name in sorted(os.listdir(foler)):
        if fnmatch.fnmatch(file_name, pattern):
            return os.path.join(foler, file_name)
    return None
            
def extract_features_from_dir(data_dir, featureType,edfaType):
    """Build one feature table for one EDFA type of the COSMOS dataset.

    For every combination of EDFA name (``fileList``), channel type
    (``folderList``) and gain (``gainList``), one JSON file is looked up in
    ``data_prepath + edfaType/gain/channelType/`` and its features are
    extracted. A combination is reported and skipped when no file matches,
    when ``measurement_data`` is missing or empty, or when neither 'preamp'
    nor 'booster' appears in the file path.

    Args:
        data_dir: Currently unused; files are located under the module-level
            ``data_prepath``.
        featureType: Forwarded to ``featureExtraction_ML``.
        edfaType: Name of the EDFA-type folder under ``data_prepath``.

    Returns:
        A DataFrame with the per-file feature tables stacked (index reset),
        with ``edfa_index`` and ``EDFA_type`` as the first two columns.
        An empty DataFrame when no file produced features.
    """
    leading_cols = ['edfa_index', 'EDFA_type']
    # Collect the per-file tables and concatenate once at the end; growing a
    # DataFrame inside the loop re-copies all accumulated rows on every file.
    per_file_frames = []
    for fileName in fileList:
        for channelType in folderList:
            for gain in gainList:
                filePath = data_prepath + edfaType + "/" + gain + "/" + channelType + "/"
                fname = matchFile('*'+fileName+'*.json', filePath)
                if fname is None:
                    # matchFile returns None when nothing matches; report it
                    # instead of failing inside open() with a TypeError.
                    print(f"Skipping {filePath}: no file matching *{fileName}*.json")
                    continue
                with open(fname, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                if 'measurement_data' not in data or not data['measurement_data']:
                    print(f"Skipping {fname}: no measurement_data")
                    continue

                # Infer extractionType (fname is the full path here, so the
                # EDFA-type folder name takes part in the check).
                if 'preamp' in fname:
                    extractionType_local = 'preamp'
                elif 'booster' in fname:
                    extractionType_local = 'booster'
                else:
                    print(f"Skipping {fname}: cannot determine extractionType")
                    continue

                # The EDFA name is the one the file was searched by.
                edfa_name = fileName

                # A file without a 'measurement_setup' section falls back to
                # 95 channels instead of raising KeyError.
                setup = data.get('measurement_setup') or {}
                # featureExtraction_ML is assumed to return a DataFrame (it
                # is used as one below) -- TODO confirm against the library.
                features = featureExtraction_ML(
                    data['measurement_data'],
                    extractionType=extractionType_local,
                    channelType=channelType,
                    featureType=featureType,
                    channelNum=setup.get('roadm_wss_num_channel', 95)
                )

                edfa_index = edfa_name_to_index.get(edfa_name, -1)  # -1 if not found
                features['edfa_index'] = edfa_index
                features['EDFA_type'] = extractionType_local

                # Put edfa_index and EDFA_type first, keeping the other
                # columns in order.
                remaining_cols = [col for col in features.columns if col not in leading_cols]
                features = features[leading_cols + remaining_cols]
                per_file_frames.append(features)
                print(f"Processed {fname}, extractionType: {extractionType_local}, channelType: {channelType}, edfa_name: {edfa_name}, edfa_index: {edfa_index}, rows: {len(features)}")

    if not per_file_frames:
        return pd.DataFrame()
    return pd.concat(per_file_frames, ignore_index=True)

combined_dfs = []
# Extract the 'train' features for every EDFA type and collect the tables.
# NOTE: the loop variable keeps its original name, aging_train_features, so
# the notebook's globals are unchanged; here it holds COSMOS features.
for edfaType in edfaTypes:
    aging_train_features = extract_features_from_dir(
        data_prepath, featureType='train', edfaType=edfaType
    )
    combined_dfs.append(aging_train_features)

# Stack the per-EDFA-type tables and write them as a single CSV.
combined = pd.concat(combined_dfs, ignore_index=True)
# DataFrame.to_csv does not create missing parent folders.
os.makedirs(os.path.dirname(output_csv_aging_train), exist_ok=True)
combined.to_csv(output_csv_aging_train, index=False)

Split each training CSV into a features file and a labels file

In [None]:
import os
import pandas as pd
import numpy as np

def split_csv_features_labels(input_folder, output_folder_features, output_folder_labels):
    """Split every CSV in a folder into a features CSV and a labels CSV.

    Label columns are those whose lower-cased name contains
    'calculated_gain_spectra'; every other column is a feature. Label values
    are replaced by NaN wherever the WSS column paired with them (columns
    whose lower-cased name contains 'dut_wss_activated_channel_index') is
    not equal to 1. Label and WSS columns are paired by position after
    sorting each group by name.

    Args:
        input_folder: Folder scanned (non-recursively) for ``.csv`` files.
        output_folder_features: Folder receiving ``<name>_features.csv``.
        output_folder_labels: Folder receiving ``<name>_labels.csv``.

    Raises:
        ValueError: If a file has different numbers of label and WSS columns.
    """
    for out_dir in (output_folder_features, output_folder_labels):
        os.makedirs(out_dir, exist_ok=True)

    for csv_name in os.listdir(input_folder):
        if not csv_name.endswith('.csv'):
            continue
        table = pd.read_csv(os.path.join(input_folder, csv_name))

        # Classify the columns in a single pass over the header.
        channel_cols, gain_cols = [], []
        for column in table.columns:
            lowered = column.lower()
            if 'dut_wss_activated_channel_index' in lowered:
                channel_cols.append(column)
            if 'calculated_gain_spectra' in lowered:
                gain_cols.append(column)
        other_cols = [column for column in table.columns if column not in gain_cols]

        # Sort both groups by name so the i-th label lines up with the i-th
        # WSS column.
        gain_cols_sorted = sorted(gain_cols)
        channel_cols_sorted = sorted(channel_cols)
        if len(gain_cols_sorted) != len(channel_cols_sorted):
            raise ValueError(f"Number of label columns ({len(gain_cols_sorted)}) does not match number of WSS columns ({len(channel_cols_sorted)}) in file {csv_name}")

        # Keep a label only where its WSS value is 1; NaN everywhere else.
        active = table[channel_cols_sorted].values == 1
        table[gain_cols_sorted] = np.where(active, table[gain_cols_sorted].values, np.nan)

        features_path = os.path.join(output_folder_features, csv_name.replace('.csv', '_features.csv'))
        labels_path = os.path.join(output_folder_labels, csv_name.replace('.csv', '_labels.csv'))
        table[other_cols].to_csv(features_path, index=False)
        table[gain_cols_sorted].to_csv(labels_path, index=False)
        print(f"Processed {csv_name}: {len(other_cols)} features, {len(gain_cols)} labels.")

# (input folder, features output folder, labels output folder) for each
# training category: shb, unseen, aging and COSMOS.
train_folders = [
    (base, base + '/features', base + '/labels')
    for base in (
        '../Features/Train/shb',
        '../Features/Train/unseen',
        '../Features/Train/aging',
        '../Features/Train/COSMOS',
    )
]

# Split every category whose input folder is present; warn about the rest.
for input_folder, output_features, output_labels in train_folders:
    if not os.path.exists(input_folder):
        print(f"[WARNING] Input folder does not exist: {input_folder}")
        continue
    split_csv_features_labels(input_folder, output_features, output_labels)


In [None]:
import os
import shutil
# Move every CSV from each source folder into its 'features' sub-folder,
# appending '_features' to the file name when it is not already there.

test_move_folder = [
    ('../Features/Test/shb', '../Features/Test/shb/features'),
    ('../Features/Test/unseen', '../Features/Test/unseen/features'),
    ('../Features/Test/aging', '../Features/Test/aging/features'),
]

for src_folder, dst_folder in test_move_folder:
    if not os.path.exists(src_folder):
        print(f"[WARNING] Input folder does not exist: {src_folder}")
        continue
    # shutil.move does not create the destination folder.
    os.makedirs(dst_folder, exist_ok=True)
    for fname in os.listdir(src_folder):
        if not fname.endswith('.csv'):
            continue
        # Rename to _features.csv if not already
        if fname.endswith('_features.csv'):
            new_fname = fname
        else:
            base, _ = os.path.splitext(fname)
            new_fname = base + '_features.csv'
        src_path = os.path.join(src_folder, fname)
        # Use the renamed file name; the computed name was previously
        # ignored, so files were moved without the '_features' suffix.
        dst_path = os.path.join(dst_folder, new_fname)
        shutil.move(src_path, dst_path)
        print(f"Moved {src_path} -> {dst_path}")



Combine all training and test sets and convert them into Kaggle format

In [None]:
import glob
import os
import pandas as pd

def combine_csvs_with_id_usage(input_dirs, output_csv, add_id_usage=False, add_category=True):
    """Combine the CSVs found in several folders into one CSV file.

    The ``*.csv`` files of each folder are read in sorted order, missing
    values are replaced by 0, and the tables are stacked. Nothing is written
    (a message is printed instead) when no CSV is found.

    Args:
        input_dirs: Folders to read, in order. The category of a folder is
            the name of its parent folder (e.g. ``.../aging/features`` gives
            ``"aging"``).
        output_csv: Path of the combined CSV to write.
        add_id_usage: When true, prepend an ``ID`` column (consecutive
            integers starting at 1 across all files of this call) and a
            ``Usage`` column (first half of each file's rows "Public", the
            rest "Private").
        add_category: When true, add a ``Category`` column, placed after
            ``ID``/``Usage`` when those are present and first otherwise.
    """
    combined = []
    # Per-call counter: a local cannot leak into the next call, even when
    # this call fails part-way through.
    next_id = 1
    for input_dir in input_dirs:
        # normpath drops a trailing separator, which would otherwise make
        # dirname() return the folder itself instead of its parent.
        category = os.path.basename(os.path.dirname(os.path.normpath(input_dir)))  # e.g. "aging", "shb", "unseen"
        for csv_file in sorted(glob.glob(os.path.join(input_dir, "*.csv"))):
            df = pd.read_csv(csv_file).fillna(0)
            n_rows = df.shape[0]

            # Add optional ID & Usage
            if add_id_usage:
                half = n_rows // 2
                df.insert(0, "Usage", ["Public"] * half + ["Private"] * (n_rows - half))
                df.insert(0, "ID", list(range(next_id, next_id + n_rows)))
                next_id += n_rows

            if add_category:
                df.insert(2 if add_id_usage else 0, "Category", [category] * n_rows)

            combined.append(df)

    if not combined:
        print(f"No CSVs found in {input_dirs}")
        return

    final_df = pd.concat(combined, ignore_index=True)
    final_df.to_csv(output_csv, index=False)
    print(f"Combined CSV saved to {output_csv} with {final_df.shape[0]} rows.")

# Per-category feature and label folders of the training set. The COSMOS
# folders are not part of these lists.
train_feature_dirs = [
    f'../Features/Train/{category}/features' for category in ('aging', 'shb', 'unseen')
]
train_label_dirs = [
    f'../Features/Train/{category}/labels' for category in ('aging', 'shb', 'unseen')
]

# Training features: no ID/Usage columns, but tagged with their category.
combine_csvs_with_id_usage(
    train_feature_dirs,
    "../Features/Train/train_features.csv",
    add_id_usage=False,
    add_category=True,
)

# Training labels: no extra columns at all.
combine_csvs_with_id_usage(
    train_label_dirs,
    "../Features/Train/train_labels.csv",
    add_id_usage=False,
    add_category=False,
)

# Per-category feature folders of the test set.
test_feature_dirs = [
    f'../Features/Test/{category}/features' for category in ('aging', 'shb', 'unseen')
]

# Test features: ID, Usage and Category columns are all added.
combine_csvs_with_id_usage(
    test_feature_dirs,
    "../Features/Test/test_features.csv",
    add_id_usage=True,
    add_category=True,
)


In [None]:
# Folders holding the split COSMOS features and labels.
COSMOS_feature_dirs = ['../Features/Train/COSMOS/features']
COSMOS_label_dirs = ['../Features/Train/COSMOS/labels']

# COSMOS features: no ID/Usage columns, but tagged with their category.
combine_csvs_with_id_usage(
    COSMOS_feature_dirs,
    "../Features/Train/COSMOS_features.csv",
    add_id_usage=False,
    add_category=True,
)

# COSMOS labels: no extra columns at all.
combine_csvs_with_id_usage(
    COSMOS_label_dirs,
    "../Features/Train/COSMOS_labels.csv",
    add_id_usage=False,
    add_category=False,
)