In [1]:
import pandas as pd
import operator
from pathlib import Path
import fiftyone as fo

IMAGE_HEIGHT = 224
IMAGE_WIDTH = 224


def create_diff_columns(merged_data) -> pd.DataFrame:
    # Compare the bounding box coordinates
    # x1_pred towards right => x1_train < x1_pred, implies smaller bbox, flip diff_x1 sign
    merged_data["diff_x1"] = round(merged_data["x1_pred"] - merged_data["x1_train"], 4)
    # y1_pred below => y1_train < y1_pred, implies smaller bbox, flip diff_y1 sign
    merged_data["diff_y1"] = round(merged_data["y1_pred"] - merged_data["y1_train"], 4)
    # x2_pred towards left => x2_train > x2_pred, implies smaller bbox
    merged_data["diff_x2"] = round(merged_data["x2_train"] - merged_data["x2_pred"], 4)
    # y2_pred above => y2_train > y2_pred, implies smaller bbox
    merged_data["diff_y2"] = round(merged_data["y2_pred"] - merged_data["y2_train"], 4)

    return merged_data


def compare_bounding_boxes(merged_data, op=operator.gt, threshold=1) -> pd.DataFrame:

    # Compare the bounding box coordinates
    merged_data = create_diff_columns(merged_data)

    # Filter the rows where the difference between coordinates is greater than 1
    filtered_data = merged_data[
        op(1 * (merged_data["diff_x1"]), threshold)
        | op(1 * (merged_data["diff_y1"]), threshold)
        | op(1 * (merged_data["diff_x2"]), threshold)
        | op(1 * (merged_data["diff_y2"]), threshold)
    ]

    return filtered_data


def col_names_parser(columns):
    # Extract the column names from the list of columns
    if isinstance(columns, list):
        literal_columns = ["x1", "y1", "x2", "y2", "id"]
        columns = [col for col in columns if any(literal in col for literal in literal_columns)]
    return columns


def convert_to_bbox_coco(data) -> pd.DataFrame:
    # Convert the dataframe to a COCO format
    idx = data.columns.get_loc("x2_train")
    # Update the bounding box coordinates to be positive
    data["x1_pred"] = data["x1_pred"].apply(lambda x: max(x, 0))
    data["y1_pred"] = data["y1_pred"].apply(lambda x: max(x, 0))
    # Insert the width and height columns (train)
    data.insert(idx, "height_train", round(data["y2_train"] - data["y1_train"], 3))
    data.insert(idx, "width_train", round(data["x2_train"] - data["x1_train"], 3))
    # Insert the width and height columns (pred)
    data["width_pred"] = round(data["x2_pred"] - data["x1_pred"], 3)
    data["height_pred"] = round(data["y2_pred"] - data["y1_pred"], 3)
    # Drop the columns that are not needed
    data.drop(["x2_train", "y2_train", "x2_pred", "y2_pred"], axis=1, inplace=True)
    data["id"] = data.index
    # data = data[["id", "file_name", "x1", "y1", "width", "height", "category_id", "payor_name", "planprovider_name"]]
    return data


def convert_to_fityone_dataset(data: pd.DataFrame):
    # Use dataframe after bbox coco conversion

    # Create a FiftyOne dataset
    dataset = fo.Dataset(name=data.name)

    # Modify the filepath to be the absolute path
    images_dir = Path("/Users/leonardpimentel/Downloads/kaggle/datasets/aft-vbi-pds/bin-images")

    # Function to create a FiftyOne sample from a row in the DataFrame
    def create_sample(row):
        sample = fo.Sample(filepath=str(images_dir / row["image_id"]))

        # Create a label, assume it's classification for this example
        sample["id"] = row["id"]
        sample["gt_caption"] = fo.Classification(label=row["caption"])

        """
        # Assigning a detection
        bbox = [
            row["x1_train"] / IMAGE_WIDTH,
            row["y1_train"] / IMAGE_HEIGHT,
            row["width_train"] / IMAGE_WIDTH,
            row["height_train"] / IMAGE_HEIGHT,
        ]
        detection = fo.Detection(label=row["member_id"], bounding_box=bbox)
        sample["gt_bbox"] = fo.Detections(detections=[detection])

        p_bbox = [
            row["x1_pred"] / IMAGE_WIDTH,
            row["y1_pred"] / IMAGE_HEIGHT,
            row["width_pred"] / IMAGE_WIDTH,
            row["height_pred"] / IMAGE_HEIGHT,
        ]
        detection = fo.Detection(
            label=row["member_id"], bounding_box=p_bbox, confidence=row["planprovider_confidence"]
        )
        sample["pred_bbox"] = fo.Detections(detections=[detection])

        return sample
        """

    # Iterate over DataFrame rows
    for _, row in data.iterrows():
        sample = create_sample(row)
        dataset.add_sample(sample)
    return dataset

In [2]:
# Reference the data directory
data_path = Path.cwd().parent.parent / "data"
print(data_path)

/Users/leopimentel/developer/ML/InsuranceTechV2.5/data


In [4]:
import pandas as pd

training_file = Path(data_path / "training_concat_payor_parent.csv")
gt_file = pd.read_csv(training_file)
print(f"Training set - {gt_file.shape[0]:,d} samples")
print(gt_file.columns)
gt_file.head(2)

Training set - 2,141,534 samples
Index(['filename', 'planprovider', 'x1', 'y1', 'x2', 'y2', 'memberId', 's3Key',
       'updatedOn', 'sourceFile', 'accuracy', 'pp_name', 'payor_cd'],
      dtype='object')


Unnamed: 0,filename,planprovider,x1,y1,x2,y2,memberId,s3Key,updatedOn,sourceFile,accuracy,pp_name,payor_cd
0,4931668071814610-1752299282-2022-11-10-14-16-5...,BS,10.0,92.0,53.0,101.0,YPI10129454601,4931668071814610/1752299282/2022-11-10-14-16-5...,2023-12-19 17:12:33.490994,unknownFile.txt,,BlueCrossBlueShield,BSNCA
1,7051490978155771-77267962-2019-09-24-21-46-43-...,UNVSAL,21.0,86.0,105.0,99.0,W235559835,7051490978155771/77267962/2019-09-24-21-46-43/...,2023-12-19 17:12:30.577154,Horizon_PaidClaims_20210929032723.txt,,BlueCrossBlueShield,ZPRME


In [3]:
# Read the test data file
infer_filename = Path(data_path / "test_concat_payor_parent.csv")
gt_inf_file = pd.read_csv(infer_filename)
gt_inf_file.head(2)

Unnamed: 0,filename,member_id,planprovider_name,payor_name,planprovider_encoded,payor_encoded,x1,y1,x2,y2
0,1121669203426123-1754512169-2022-11-23-16-37-0...,MKGAN3671969,BlueCrossBlueShield,BSNCA,0,0,12.0,91.0,62.0,100.0
1,5331638810196106-1557644079-2022-07-07-16-43-3...,8VR1CT0TN11,Medicare,MCWAL,2,261,8.0,123.0,83.0,143.0


In [8]:
cl_infer_2e4_path = Path(
    data_path
    / "CL_INFER_2103M_Model_CL_5_ONE_OPT_IOU_FIX_NO_WARM_ADAM_LR_0.0002_CR_0.5_BBLR_1.8e-05_B16_testrun4_Best_Model_1.csv"
)
cl_infer_2e4 = pd.read_csv(cl_infer_2e4_path)
cl_infer_2e4.name = (
    "2103M_Model_CL_5_ONE_OPT_IOU_FIX_NO_WARM_ADAM_LR_0.0002_CR_0.5_BBLR_1.8e-05_B16_testrun4_Best_Model_1"
)
print(f"Test set {cl_infer_2e4.shape[0]:,d} samples\n")
cl_infer_2e4.head(2)

Test set 428,307 samples



Unnamed: 0,filename,planprovider_name,planprovider_encoded,payor_name,payor_encoded,planprovider_confidence,payor_confidence,x1,y1,x2,y2
0,1121669203426123-1754512169-2022-11-23-16-37-0...,BlueCrossBlueShield,0,NVANT,16,0.999,0.151,12.48,90.87,61.87,100.01
1,5331638810196106-1557644079-2022-07-07-16-43-3...,Medicare,2,MCNJL,259,1.0,0.249,8.1,122.48,82.57,142.79


In [9]:
filtered_data_2e4_Best = compare_provider_and_payor(gt_inf_file, cl_infer_2e4)
filtered_data_2e4_Best.head(2)

Unnamed: 0,filename,member_id,pp_name,payor_cd,planprovider_encoded_x,payor_encoded_x,x1_train,y1_train,x2_train,y2_train,...,payor_name,payor_encoded_y,pp_match,payor_match,planprovider_confidence,payor_confidence,x1_pred,y1_pred,x2_pred,y2_pred
0,1121669203426123-1754512169-2022-11-23-16-37-0...,MKGAN3671969,BlueCrossBlueShield,BSNCA,0,0,12.0,91.0,62.0,100.0,...,NVANT,16,True,False,0.999,0.151,12.48,90.87,61.87,100.01
1,5331638810196106-1557644079-2022-07-07-16-43-3...,8VR1CT0TN11,Medicare,MCWAL,2,261,8.0,123.0,83.0,143.0,...,MCNJL,259,True,False,1.0,0.249,8.1,122.48,82.57,142.79


In [10]:
filtered_data_2e4_Best_coco = convert_to_bbox_coco(filtered_data_2e4_Best)
filtered_data_2e4_Best_coco.name = "2103M_Model_CL_5_0.0002_CR_0.5_BBLR_1.8e-05_B16_testrun4_Best_Model_1"
filtered_data_2e4_Best_coco.head(2)
filtered_data_2e4_Best_coco

Unnamed: 0,filename,member_id,pp_name,payor_cd,planprovider_encoded_x,payor_encoded_x,x1_train,y1_train,width_train,height_train,...,payor_encoded_y,pp_match,payor_match,planprovider_confidence,payor_confidence,x1_pred,y1_pred,width_pred,height_pred,id
0,1121669203426123-1754512169-2022-11-23-16-37-0...,MKGAN3671969,BlueCrossBlueShield,BSNCA,0,0,12.0,91.0,50.0,9.0,...,16,True,False,0.999,0.151,12.48,90.87,49.39,9.14,0
1,5331638810196106-1557644079-2022-07-07-16-43-3...,8VR1CT0TN11,Medicare,MCWAL,2,261,8.0,123.0,75.0,20.0,...,259,True,False,1.000,0.249,8.10,122.48,74.47,20.31,1
2,7481632831067900-603413821-2021-09-28-16-11-07...,2FN9GK8EM20,Medicare,MCCAL,2,258,7.0,150.0,89.0,19.0,...,259,True,False,1.000,0.214,7.34,149.65,91.17,20.43,2
3,1931671276987083-1756808390-2022-12-17-16-36-2...,XED909519757,BlueCrossBlueShield,BSPEX,0,36,30.0,98.0,38.0,9.0,...,36,True,True,0.993,0.802,30.56,97.37,38.02,9.41,3
4,2021652453963538-1699660572-2022-08-26-15-17-4...,3CK2MN5NX05,Medicare,MCFLL,2,260,16.0,120.0,81.0,15.0,...,260,True,True,1.000,0.454,16.19,119.55,81.10,16.05,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429804,2261654856409113-1340065617-2022-06-10-14-20-0...,QMF921353636,BlueCrossBlueShield,BCPRM,0,30,10.0,74.0,46.0,11.0,...,18,True,False,1.000,0.774,10.97,74.24,44.87,10.32,429804
429805,9821464959446967-100721206-2020-04-30-13-55-56...,2VA5VX9GY70,Medicare,MCNJL,2,259,14.0,137.0,54.0,11.0,...,259,True,True,1.000,0.357,13.36,135.99,54.51,11.21,429805
429806,9351573472089879-807079547-2022-02-11-17-37-29...,6NJ6QX0TQ60,Medicare,MCNC,2,262,9.0,132.0,82.0,21.0,...,262,True,True,1.000,0.223,9.17,131.99,81.81,20.96,429806
429807,7131682354964268-1787187069-2023-06-30-19-48-4...,2MJ7NR5MA36,Medicare,MCTXL,2,263,15.0,140.0,84.0,24.0,...,259,True,False,1.000,0.322,14.42,140.16,85.34,19.29,429807


In [2]:
fifty_datasets = fo.list_datasets()
for dataset in fifty_datasets:
    ds = fo.load_dataset(dataset)
    print(f"Deleting dataset {dataset}")
    ds.delete()

Deleting dataset 2103M_Model_CL_5_0.0002_CR_0.5_BBLR_1.8e-05_B16_testrun4_Best_Model_1


In [11]:
ds_filtered_data_2e4_Best_coco = convert_to_fityone_dataset(filtered_data_2e4_Best_coco)
print(ds_filtered_data_2e4_Best_coco.values("filepath")[0])

/Users/leopimentel/developer/ML/InsuranceTechV2.5/1300m_cl_images/1121669203426123-1754512169-2022-11-23-16-37-06-2bba16da-8f1f-432e-9c53-00db819d68a5-primaryInsuranceFrontImage.jpg


In [12]:
# filtered_data_2e4_Best_coco.name = "2103M_Model_CL_5_0.0002_CR_0.5_BBLR_1.8e-05_B16_testrun4_Best_Model_1"
dataset_name = str(filtered_data_2e4_Best_coco.name)
dataset = fo.load_dataset(dataset_name)
# Launch the App in a dedicated browser tab
session = fo.launch_app(dataset, auto=False)
session.open_tab()

Session launched. Run `session.show()` to open the App in a cell output.


<IPython.core.display.Javascript object>

In [None]:
pred_confs_flat = dataset.take(10).values("pred_bbox.detections.bounding_box")[0]
pred_confs_flat

<h2>Bounding Box for 2e4 Best</h2>


In [None]:
filtered_data_2e4_Best_bbox = filtered_data_2e4_Best.copy()
filtered_data_2e4_Best_bbox = compare_bounding_boxes(filtered_data_2e4_Best_bbox, op=operator.gt, threshold=1)
print(f"LR 2e-4_Best_Model - {filtered_data_2e4_Best_bbox.shape[0]:,d}, w/ bbox errors > 1")
filtered_data_2e4_Best_bbox.head(2)

In [None]:
cl_infer_2e4 = pd.read_csv(
    "CL_INFER_2103M_Model_CL_5_ONE_OPT_IOU_FIX_NO_WARM_ADAM_LR_0.0002_CR_0.5_BBLR_1.8e-05_B16_testrun4_Last_Model_1.csv"
)
filtered_data_2e4_Last = compare_provider_and_payor(gt_file, cl_infer_2e4)
filtered_data_2e4_Last_bbox = filtered_data_2e4_Last.copy()
filtered_data_2e4_Last.head(2)

In [None]:
filtered_data_2e4_Last_bbox = compare_bounding_boxes(filtered_data_2e4_Last_bbox, operator.gt, 0)
print(f"LR 2e-4_Last_Model - {filtered_data_2e4_Last_bbox.shape[0]:,d} w/ bbox errors > 1")
filtered_data_2e4_Last_bbox.head(2)

In [None]:
cl_infer_2e4 = pd.read_csv(
    "CL_INFER_2103M_Model_CL_6_ONE_OPT_IOU_FIX_WARM_L2_REG_ADAM_LR_0.0001_CR_0.5_B16_testrun4_Best_Model_1.csv"
)
filtered_data_1e4_Best = compare_provider_and_payor(gt_file, cl_infer_2e4)
filtered_data_1e4_Best.head(2)

In [None]:
filtered_data_1e4_Best_bbox = filtered_data_1e4_Best.copy()
filtered_data_1e4_Best_bbox = compare_bounding_boxes(filtered_data_1e4_Best_bbox, operator.gt, 1)
print(f"LR 1e-4_Best_Model - {filtered_data_1e4_Best_bbox.shape[0]:,d} w/ bbox errors > 1")
filtered_data_1e4_Best_bbox.head(2)

In [None]:
cl_infer_2e4 = pd.read_csv(
    "CL_INFER_2103M_Model_CL_6_ONE_OPT_IOU_FIX_WARM_L2_REG_ADAM_LR_0.0001_CR_0.5_B16_testrun4_Last_Model_1.csv"
)
filtered_data_1e4_Last = compare_provider_and_payor(gt_file, cl_infer_2e4)
filtered_data_1e4_Last.head(2)

In [None]:
filtered_data_1e4_Last_bbox = filtered_data_1e4_Last.copy()
filtered_data_1e4_Last_bbox = compare_bounding_boxes(filtered_data_1e4_Last_bbox, operator.gt, 1)
print(f"LR 1e-4_Last_Model - {filtered_data_1e4_Last_bbox.shape[0]:,d} bbox errors > 1")
filtered_data_1e4_Last_bbox.head(2)

In [None]:
cl_infer_2e4 = pd.read_csv("CL_INFER_model_7_Last_Model_1.csv")
filtered_data_Model_7 = compare_provider_and_payor(gt_file, cl_infer_2e4)
filtered_data_Model_7.head(2)

In [None]:
filtered_data_Model_7_bbox = filtered_data_Model_7.copy()
filtered_data_Model_7_bbox = compare_bounding_boxes(filtered_data_Model_7_bbox, operator.gt, 1)
print(f"Model 7 - {filtered_data_Model_7_bbox.shape[0]:,d} bbox errors > 1")
filtered_data_Model_7_bbox.head(2)

In [None]:
# 3 - way comparison
import pandas as pd

model_20_comp = pd.read_csv("./comps/cl-infer-model-comparison-DT-2024-04-18-19-14-44-452271.csv", engine="pyarrow")
train_data = pd.read_csv("training_concat_payor_parent.csv")
print(f"Training set - {train_data.shape[0]:,d} samples")
test_data = pd.read_csv("test_concat_payor_parent.csv")
print(f"Test set - {test_data.shape[0]:,d} samples")
full_data = pd.concat([train_data, test_data], axis=0)
full_data

In [None]:
# print(train_data.info())
# print(train_data.describe())
# print(train_data.dtypes)
# print(test_data.info())
# print(test_data.describe())
# print(test_data.dtypes)
# print(full_data.dtypes)
# print(model_20_comp.describe(include='all'))
model_20_comp

In [None]:
model_20_bbox = compare_provider_and_payor(full_data, model_20_comp, left_key="filename", right_key="pm_20_s3Key")
model_20_bbox = model_20_bbox.dropna(subset=["s3Key"])
# model_20_bbox.info()
model_20_bbox = create_diff_columns(model_20_bbox)

model_20_bbox

In [None]:
model_20_bbox = compare_bounding_boxes(model_20_bbox, operator.gt, -1)
# print(model_20_bbox.info())
model_20_bbox.head(2)

In [None]:
import albumentations as A
import cv2

# Define the augmentation pipeline
transform = A.Compose(
    [
        # A.HorizontalFlip(p=0.5),
        # A.RandomRotate90(p=0.5),
        # A.VerticalFlip(p=0.5),
        # A.ShiftScaleRotate(shift_limit=0.00625, scale_limit=0.1, rotate_limit=45, p=1),
        # A.MotionBlur(blur_limit=(3,7), p=1),
        # A.FancyPCA(alpha=0.5, p=1),
        # A.Emboss(p=1),
        # A.RandomBrightnessContrast(p=0.5),
        # Spatial Augmentation
        A.Perspective(p=1),
        # Add more transformations as needed
    ]
)

In [None]:
import os
from skimage.io import imread, imsave
from tqdm import tqdm

input_folder = "/Users/leopimentel/developer/ML/InsuranceTechV2.5/1300m_cl_images"
output_folder = "/Users/leopimentel/developer/ML/InsuranceTechV2.5/1300m_cl_images_augmented"
images = [
    "9641554225452170-835813674-2022-03-10-15-40-00-a32f6afc-2dff-4860-9976-56d849d8a547-primaryInsuranceFrontImage.jpg"
]
max_image_count = 2
# Loop over all images in the weaker classes
# for class_id in weaker_classes:  # Assume you have a list of weaker class IDs
# class_folder = os.path.join(input_folder, str(class_id))
augmented_images_count = 0
# images = os.listdir(class_folder)

while augmented_images_count < max_image_count:
    for image_name in images:
        image_path = os.path.join(input_folder, image_name)
        image = imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # Convert to BGR format which OpenCV uses
        augmented_image = transform(image=image)["image"]
        # Save the augmented image
        new_image_name = f"{image_name.split('.')[0]}_aug_{augmented_images_count}.jpg"
        new_image_path = os.path.join(output_folder, new_image_name)
        imsave(new_image_path, augmented_image)
        augmented_images_count += 1
        if augmented_images_count >= max_image_count:
            break

In [None]:
import boto3
import os
import math
import pandas as pd

TEST = False


session = boto3.Session(profile_name="dev21")
s3_client = session.client("s3")

s3_resource = boto3.resource("s3")
bucket = s3_resource.Bucket("my-bucket")

# list of payors
# payor_to_get = ["MUTOM", "AARPA", "CMGHU", "CAHBM"]

# filtered_df = (
#     model_comp3[model_comp3["PROD01_payorCode"].isin(payor_to_get)]
#     if TEST
#     else cl_data2
# )
# display(filtered_df)

# category_values = (
#     filtered_df.PROD01_payorCode.unique() if TEST else cl_data2.payor_cd.unique()
# )
max_iterations = math.inf
i = 0

home_path = "/Users/leopimentel/developer/ML/InsuranceTechV2.5"
image_home = f"{home_path}/1300m_cl_images/"

s3_target = None
# category_df = pd.read_csv(f"{home_path}/data/cl/12-Mar-2024-dataset.csv").query(
#     'payor_cd == "MA"'
category_df = pd.read_csv(f"{home_path}/data/cl/03-Apr-2024-dataset.csv")

num_of_rows = category_df.shape[0]
# s3 parameters
if TEST:
    s3_target = "insurancevision-deployme-insurancevisionsourcebuc-1qm4c1dhax0y4"
    s3_target_path = "resize/"
else:
    s3_target = "continuouslearningimagesdev"
    s3_target_path = "insurance_vision/224_images/"


error_lst = []

# for cat_val in category_values:
#     if TEST:
#         category_df = filtered_df[filtered_df.PROD01_payorCode == cat_val]
#     else:
#         category_df = filtered_df[filtered_df.payor_cd == cat_val]

for image in category_df["key"] if TEST else category_df["filename"]:
    if i < max_iterations:
        s3Key = s3_target_path + image
        # print(s3_target + s3Key)
        local_file = image_home + image
        # Check if key exists in bucket
        try:
            response = s3_client.head_object(Bucket=s3_target, Key=s3Key)
        except Exception as e:
            if e.response["Error"]["Code"] == "404":
                print(f"Object '{s3Key}' does not exist in '{s3_target}'.")
                error_lst.append(s3Key)
                continue
            else:
                print(f"An error occurred: {e}")
        if not os.path.exists(local_file):
            s3_client.download_file(s3_target, s3Key, local_file)
            # print(local_file)
            if not os.path.exists(local_file):
                raise FileNotFoundError(f"File does not exist. {local_file}")
            else:
                print(f"File downloaded: {image}")
        print(f"file_no: {i:,}/{num_of_rows:,}")
        i += 1
    else:
        break

df_err = pd.Series(error_lst)
df_err.to_csv("error_s3_log.csv", index=False)