# Script to Generate ML Annotations

Author: Nardiena A. Pratama

## Install and Import Libraries

In [None]:
!pip3 install opencv-python
!sudo apt-get update && sudo apt-get install ffmpeg libsm6 libxext6  -y

In [None]:
from helper_scripts.annotation_generation_func import *
from helper_scripts.utility_functions import *
from PIL import Image, ImageFont
import torchvision.transforms as transforms
import re
import sys
import os
import boto3
import pandas as pd
from io import StringIO, BytesIO

## Set Parameters

In [None]:
# os.path.abspath('') is the current working directory; dirname() steps up to
# its parent. (The notebook is presumably launched from its own folder --
# TODO confirm.)
notebook_dir = os.path.dirname(os.path.abspath(''))

# Two levels above that parent, locate the 'winter-research-2023' folder.
folder1_path = os.path.abspath(
    os.path.join(notebook_dir, os.pardir, os.pardir, 'winter-research-2023')
)

# Make modules inside that folder importable from this notebook.
sys.path.append(folder1_path)

In [None]:
## Confidence level for object detection model
# Threshold expressed as a fraction.
CONF_LEVEL_DEC = 0.5
# Same threshold as a whole-number percentage (used in output folder names).
# round() is used instead of a bare int(): int() truncates, so float error
# would shave a point off some values (0.29 * 100 == 28.999999999999996 -> 28).
CONF_LEVEL = round(CONF_LEVEL_DEC * 100)

In [None]:
# Per-model switches, consulted inside the inference cell.
eval_obj_det_model = True  # set to True when wanting to run the object detection model
eval_img_capt_model = True  # set to True when wanting to run the image captioning model

# Master switch: while True, model inference and the S3 uploads that check
# skip_code are bypassed. Set to False when wanting to run model inference
# in this notebook.
skip_code = True

# S3 key prefix under which this run's outputs are stored.
output_dir = f"/data/outputs_{CONF_LEVEL}"
print(output_dir)

## Set AWS Credentials

Do not put quotation marks around the values: `%env` stores everything after `=` literally, so the quotes would become part of the value.

In [None]:
%env BUCKET_NAME=aws_bucket_name
%env S3_OUTPUT_PREFIX=link_to_s3_bucket_directory

## Connect to AWS 

In [None]:
# boto3 picks up the default credentials (IAM role attached to the instance).
session = boto3.Session()
s3 = session.client('s3')

# Bucket name comes from the environment; objects are searched under this prefix.
bucket_name = os.getenv('BUCKET_NAME')
folder_path = '/data/'

# list_objects_v2 answers in pages, so walk them with a paginator.
paginator = s3.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=folder_path)

# Keys of the CSV files that sit directly in folder_path.
csv_files = []
folder_depth = folder_path.count('/')

for page in page_iterator:
    for obj in page.get('Contents', []):
        key = obj['Key']
        if not key.endswith('.csv'):
            continue
        # A different number of '/' than the prefix means the key is nested
        # in a sub-folder rather than directly in folder_path.
        if key.count('/') != folder_depth:
            continue
        if key.endswith('resultswithgoodworkeronly.csv'):
            continue
        csv_files.append(key)
print(csv_files)


# ============================
# Load every listed CSV into a pandas DataFrame, stored in a dict keyed by the
# file name without its folder prefix and '.csv' extension.
categories_dataframes = {}
for file_key in csv_files:
    csv_text = s3.get_object(Bucket=bucket_name, Key=file_key)['Body'].read().decode('utf-8')
    key_without_extension = file_key.split(".csv")[0]
    df_key = key_without_extension.split(folder_path)[-1]
    categories_dataframes[df_key] = pd.read_csv(StringIO(csv_text))

# Show which keys were loaded
print(categories_dataframes.keys())

In [None]:
# Category names are the keys under which the CSVs were loaded.
CATEGORIES = list(categories_dataframes)
CATEGORIES

## Check images in folder and Image IDs from CSVs

In [None]:
for cat in CATEGORIES:
    # Number of rows in this category's CSV.
    df_count = len(categories_dataframes[cat])

    # Images of a category live under their own key prefix.
    folder_path = f'/data/{cat}/'
    paginator = s3.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=folder_path)

    # Count every '.jpg' object found under the prefix, across all pages.
    count = sum(
        1
        for page in page_iterator
        for obj in page.get('Contents', [])
        if obj['Key'].endswith('.jpg')
    )

    if df_count != count:
        print(f"WARNING: {cat} DOES NOT HAVE equal number of image IDs and existing images")
    else:
        print(f"SUCCESS: {cat} DOES HAVE equal number of image IDs and existing images")

## Retrieve Models

In [None]:
# Settings handed to the object-detection model loader.
args = dict(
    labels='../data/coco-labels-paper.txt',
    model='frcnn-resnet',
    confidence=CONF_LEVEL,
)
obj_det_model_dict = getMLModel(model_type="object-detection", args=args)

# The image-captioning loader is called without extra settings.
img_capt_model_dict = getMLModel(model_type="image-captioning")


# CAUTION! 

#### Running the code below regenerates the ML labels. Because the ML models are non-deterministic, the new labels may differ from the existing ones, which will alter the results in the subsequent Jupyter notebooks.

#### Skip the rest of the cells below and move on to the next Jupyter notebook to avoid this.

In [None]:
%%time

# Inference pass; skipped entirely while skip_code is True.
# For every category it lists the '.jpg' objects under /data/<category>/,
# optionally runs the object-detection model (uploading an annotated copy of
# the image) and/or the image-captioning model, then uploads one labels pickle
# and one captions pickle for that category.
# NOTE(review): torch, draw_bounding_boxes, to_pil_image, datetime,
# check_s3_path_exists, upload_image_to_s3 and upload_pickle_to_s3 are not
# imported by name in this notebook; presumably they come from the
# helper_scripts star imports -- confirm.
if not skip_code:
    for category in CATEGORIES:
        # Result dicts are rebuilt for each category, so every uploaded pickle
        # has a single top-level key: the current category.
        obj_det_labels = {}
        img_capt_labels = {}
        obj_det_labels[category] = {}
        img_capt_labels[category] = {}

        # Key prefix holding this category's images
        folder_path = f'/data/{category}/'
        
        # Initialize variables for pagination
        paginator = s3.get_paginator('list_objects_v2')
        page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=folder_path)
    
        # Annotated images go under <output_dir>/annotated-<category>/; an
        # empty object is written at that key when the prefix does not exist.
        annotations_output_dir = f"{output_dir}/annotated-{category}/"
        if not check_s3_path_exists(bucket_name, annotations_output_dir):
            print(f"The directory {annotations_output_dir} does not exist in the bucket {bucket_name}. Creating directory...")
            s3.put_object(Bucket=bucket_name, Key=annotations_output_dir)
            
        # Iterate through each page of results
        for page in page_iterator:
            for obj in page.get('Contents', []):
                if obj['Key'].endswith('.jpg'):
                    # Image ID = file name without folder and '.jpg' extension
                    img_id = (obj['Key'].split('.jpg')[0]).split("/")[-1] # (file.split('_')[-1].split('.')[0])
                    file_path = obj['Key']
                    
                    # Everything for one image runs inside a single try, so a
                    # failure in the detection step also skips captioning for
                    # that image.
                    try:

                        # Download the image from S3
                        response = s3.get_object(Bucket=bucket_name, Key=file_path)
                        image_data = response['Body'].read()
                        
                        # Read the image using PIL
                        raw_image = Image.open(BytesIO(image_data)).convert('RGB')
                    
                        if eval_obj_det_model:
                            WEIGHTS = obj_det_model_dict["weights"]
                            model = obj_det_model_dict["model"]
                            DEVICE = obj_det_model_dict["device"]
                            # Rebinds the notebook-level `args` name
                            args = obj_det_model_dict["args"]
                            
                            # Initialise inference transforms
                            preprocess = WEIGHTS.transforms()

                            transform = transforms.Compose([
                                            transforms.ToTensor(),  # Convert PIL image to tensor
                                            transforms.ConvertImageDtype(torch.uint8)  # Ensure the tensor is in uint8 format
                                        ])
                            image_tensor = transform(raw_image)
                            img = image_tensor.to(DEVICE)
                            # img = (img * 255).byte() # make sure tensor is in uint8 format
                            
                            # Apply inference preprocessing transforms
                            batch = [preprocess(img)]
                            model.eval()
                            prediction = model(batch)[0]

                            color_blind_palette = [
                                "#FF3D00",  # neon red
                                "#FF6F00",  # neon orange
                                "#FFD600",  # neon yellow
                                "#00E676",  # neon green
                                "#00B0FF",  # neon blue
                                "#D5006D",  # neon pink
                                "#FF4081",  # neon magenta
                                "#F50057",  # neon rose
                                "#FF5252",  # neon coral
                                "#76FF03",  # neon lime
                                "#1B5E20",  # neon dark green
                                "#5E57FF",  # neon purple
                                "#02FEE4"   # neon cyan
                            ]
                            
                            # Map each predicted class ID to a palette colour
                            # (class ID modulo palette length, so colours
                            # repeat once the palette is exhausted)
                            class_color_map = {
                                int(class_id): color_blind_palette[int(class_id) % len(color_blind_palette)]
                                for class_id in set(prediction["labels"])
                            }
                            
                            # Keep only predictions whose score exceeds the threshold.
                            # NOTE(review): args["confidence"] is created in this
                            # notebook from CONF_LEVEL (a percentage, e.g. 50),
                            # whereas detection scores are presumably in [0, 1] --
                            # confirm that getMLModel rescales it, otherwise
                            # nothing passes this filter.
                            labels = []
                            img_labels = []
                            img_boxes = []
                            for pred_label, pred_conf, pred_box in zip(prediction["labels"], prediction["scores"], prediction["boxes"]):
                                if pred_conf > args["confidence"]:
                                    img_labels.append("{}: {:.2f}%".format(WEIGHTS.meta["categories"][pred_label], pred_conf * 100))
                                    img_boxes.append(pred_box)
                                    labels.append(WEIGHTS.meta["categories"][pred_label])

                            # NOTE(review): torch.stack raises on an empty list, so
                            # an image with no detection above the threshold lands
                            # in the except branch below, is reported as
                            # "corrupted", and gets neither labels nor a caption.
                            stacked_boxes = torch.stack(img_boxes)
                            
                            # Unique label names, comma-separated (set order is arbitrary)
                            labels_str = ','.join(set(labels))
                            obj_det_labels[category][img_id] = labels_str
                            # font_type = ImageFont.truetype("winter-research-2023/Arial.ttf", 50, encoding="unic")
                            font_path = '/tmp/Arial.ttf'
                            # font_type = ImageFont.truetype(font_path, 50)

                            # NOTE(review): `colors` has one entry per raw
                            # prediction, while `boxes`/`labels` hold only the
                            # filtered ones -- confirm the lengths and order are
                            # meant to line up.
                            box = draw_bounding_boxes(img, boxes=stacked_boxes,
                                                    labels=img_labels,
                                                    colors=[class_color_map[int(class_id)] for class_id in prediction["labels"]],
                                                    width=4, font_size=50, font=font_path)
                            im = to_pil_image(box.detach())
                            # im.show()   
                            # im.save(f"{annotations_output_dir}/annotated_{img_id}.jpg")
                            s3_file_path = f"{annotations_output_dir}annotated_{img_id}.jpg"
                            upload_image_to_s3(bucket_name, s3_file_path, im)
                        if eval_img_capt_model:
                            processor = img_capt_model_dict["processor"]
                            # `model` now refers to the captioning model
                            model = img_capt_model_dict["model"]
                            inputs = processor(raw_image, return_tensors="pt")
                            out = model.generate(**inputs, max_new_tokens=512)
                            caption_str = processor.decode(out[0], skip_special_tokens=True)
                            img_capt_labels[category][img_id] = caption_str

                    # Any failure for this image (download, decoding, inference,
                    # upload) is reported with the same "corrupted" message and
                    # the loop moves on to the next image.
                    except Exception as err:
                        print(f"error - image {file_path} is corrupted! Skipping image...")
                        print(f"Error: {err}")
                        # continue

        # Both dicts always contain the category key, so these checks are
        # always true and a pickle is uploaded even when no image produced a
        # result. The file name carries today's date (MM-DD-YYYY).
        if obj_det_labels:
            s3_pickle_path = f"{output_dir}/{category}_ml_labels_dict_{datetime.now().strftime('%m-%d-%Y')}.pickle"
            upload_pickle_to_s3(bucket_name, s3_pickle_path, obj_det_labels)

        if img_capt_labels:
            s3_pickle_path = f"{output_dir}/{category}_ml_captions_dict_{datetime.now().strftime('%m-%d-%Y')}.pickle"
            upload_pickle_to_s3(bucket_name, s3_pickle_path, img_capt_labels)
            
            

In [None]:
# Echo the active output prefix and confidence level, one per line.
print(output_dir, CONF_LEVEL, sep="\n")

In [None]:
# For every category, work out which image IDs listed in the CSV have no entry
# in BOTH stored ML result dicts (labels and captions), and record the ID
# counts before/after that filter.
missing_ids = dict()
id_count = dict()


def _clean_id(raw_id):
    """Return raw_id as lower-case text with every whitespace character removed."""
    return re.sub(r'\s+', '', str(raw_id).strip().lower())


for curr_category in CATEGORIES:
    id_count[curr_category] = dict()

    # 50%
    # NOTE(review): this overrides the notebook-level CONF_LEVEL / output_dir,
    # and the pickle names carry a fixed date (12-09-2024); later cells reuse
    # both overridden values.
    CONF_LEVEL = 50
    output_dir = f'/data/outputs_{CONF_LEVEL}'
    file_key_labels = f'{output_dir}/{curr_category}_ml_labels_dict_12-09-2024.pickle'
    file_key_captions = f'{output_dir}/{curr_category}_ml_captions_dict_12-09-2024.pickle'

    # Each variable now holds the pickle its name says (they were swapped
    # before; harmless for the intersection below, misleading for any reuse).
    # NOTE(review): read_pickle_from_s3 presumably unpickles the object --
    # only point it at trusted buckets.
    label_results = read_pickle_from_s3(s3, bucket_name, file_key_labels)
    caption_results = read_pickle_from_s3(s3, bucket_name, file_key_captions)

    df = categories_dataframes[curr_category]

    # Normalise the CSV IDs in place (this mutates the stored DataFrame, and
    # later cells compare against these normalised IDs).
    df['id'] = df['id'].astype(str).map(_clean_id)
    unique_all_ids = set(df['id'].tolist())

    # Normalise the result-dict keys the same way.
    unique_caption_ids = {_clean_id(key) for key in caption_results[curr_category].keys()}
    unique_label_ids = {_clean_id(key) for key in label_results[curr_category].keys()}

    # IDs that have both a caption and a label entry.
    intersection_unique_ml_ids = unique_caption_ids.intersection(unique_label_ids)

    missing_ids[curr_category] = list(unique_all_ids - intersection_unique_ml_ids)

    # Holds only when every ML ID also appears in the CSV.
    assert len(missing_ids[curr_category]) == len(unique_all_ids) - len(intersection_unique_ml_ids)
    id_count[curr_category]["before"] = len(unique_all_ids)
    id_count[curr_category]["after"] = len(intersection_unique_ml_ids)


In [None]:
# One column per category; wrapping each list in a Series lets columns of
# different lengths share a frame (shorter ones are padded with NaN).
missing_ids_df = pd.DataFrame(
    {category: pd.Series(ids) for category, ids in missing_ids.items()}
)
missing_ids_df

In [None]:
# Serialise the missing-ID table to CSV in memory.
csv_buffer = StringIO()
missing_ids_df.to_csv(csv_buffer, index=False)

s3_file_path = f'{output_dir}/missing_ids_all_{CONF_LEVEL}.csv'

# Upload the CSV to S3 unless uploads are disabled. The success message is
# printed only when the upload really happened.
if skip_code:
    print(f"skip_code is True: upload of {s3_file_path} skipped.")
else:
    s3.put_object(Bucket=bucket_name, Key=s3_file_path, Body=csv_buffer.getvalue())
    print(f"DataFrame saved as CSV and uploaded to {output_dir} successfully.")

In [None]:
# One row per category, with the 'before' and 'after' ID counts as columns.
id_count_df = pd.DataFrame(id_count).transpose()
id_count_df

In [None]:
# Serialise the ID-count table to CSV in memory (the index holds the category
# names, so it is kept).
csv_buffer = StringIO()
id_count_df.to_csv(csv_buffer, index=True)

s3_file_path = f'{output_dir}/id_count_{CONF_LEVEL}.csv'

# Upload the CSV to S3 unless uploads are disabled. The success message is
# printed only when the upload really happened.
if skip_code:
    print(f"skip_code is True: upload of {s3_file_path} skipped.")
else:
    s3.put_object(Bucket=bucket_name, Key=s3_file_path, Body=csv_buffer.getvalue())
    print(f"DataFrame saved as CSV and uploaded to {output_dir} successfully.")

# Process ML Labels 

In [None]:
def q25(x):
    """Return the 25th percentile of *x*.

    Passed to ``agg`` in this notebook; the resulting column is read back as
    'q25', so the function name must not change.
    """
    return x.quantile(q=0.25)

def q75(x):
    """Return the 75th percentile of *x*.

    Passed to ``agg`` in this notebook; the resulting column is read back as
    'q75', so the function name must not change.
    """
    return x.quantile(q=0.75)

# Define income groups
def income_group(row):
    """Return the income-bracket label for one row.

    *row* must provide 'income', 'min', 'q25', 'median' and 'q75'. Brackets are
    half-open: [min, q25), [q25, median), [median, q75). Any income outside
    those three ranges, including one at or above q75, is labelled 'q75-max'.
    """
    income = row['income']
    if row['min'] <= income < row['q25']:
        return 'min-q25'
    if row['q25'] <= income < row['median']:
        return 'q25-median'
    if row['median'] <= income < row['q75']:
        return 'median-q75'
    return 'q75-max'

In [None]:
def calc_quantiles_get_counts(df, curr_category):
    """Bucket rows into per-region income groups and count each bucket.

    Parameters:
        df: DataFrame with at least 'region' and 'income' columns.
        curr_category: category name, used only in the printed message.

    Returns:
        (group_counts, merged_df): group_counts has one row per
        (region, income_group) with a 'count' column; merged_df is df joined
        with its region's min/q25/median/q75/max income plus the assigned
        'income_group' column.
    """
    # Income statistics per region; the callables q25/q75 name their columns.
    region_stats = (
        df.groupby('region')['income']
        .agg(['min', q25, 'median', q75, 'max'])
        .reset_index()
    )

    # Attach each row's regional statistics to it.
    merged_df = df.merge(region_stats, on='region', suffixes=('', '_quantile'))

    # Label every row with its income bracket.
    merged_df['income_group'] = merged_df.apply(income_group, axis=1)

    # Number of rows in every (region, income_group) combination.
    group_counts = (
        merged_df.groupby(['region', 'income_group'])
        .size()
        .reset_index(name='count')
    )
    print(f"Count of samples per region and income level for {curr_category.capitalize()} category...")
    return group_counts, merged_df

## Check Number of Rows Grouped by Region and Income Group

In [None]:
# Keep, per category, only the rows whose image ID is not listed as missing,
# then print the region / income-group counts of what is left.
filtered_categories_dataframes = {}
for curr_category in CATEGORIES:
    # if curr_category == 'sleeping':
    df = categories_dataframes[curr_category]

    has_ml_output = ~df['id'].isin(missing_ids[curr_category])
    filtered_df = df.loc[has_ml_output].copy()
    filtered_categories_dataframes[curr_category] = filtered_df.copy()

    group_counts = calc_quantiles_get_counts(filtered_df, curr_category)[0]
    print(group_counts)
    

## Get Unique Image IDs From Every Category

In [None]:
# Build the set of image IDs for each category, printing its size as we go.
_id_sets = []
for _name in (
    'drinking-water',
    'drying-clothes',
    'front-doors',
    'hand-washing',
    'kitchens',
    'living-rooms',
    'places-for-dinner',
    'washing-clothes',
):
    _ids = set(filtered_categories_dataframes[_name]['id'].tolist())
    print(len(_ids))
    _id_sets.append(_ids)

# Short names used by the cells that follow.
dw, dc, fd, hw, kc, lr, pfd, wc = _id_sets

## Find Image IDs that Exist in More Than One Category

In [None]:
# Category abbreviation -> set of image IDs.
sets = {'dw': dw, 'dc' :dc, 'fd':fd,'hw':hw, 'kc':kc, 'lr':lr,'pfd':pfd, 'wc':wc}
sets_list = list(sets.values())

# all_ids remembers the first set each ID was seen in; overlapping_locations
# lists every set an ID that was seen more than once belongs to.
all_ids = {}
overlapping_ids = set()
overlapping_locations = {}

for set_name, id_set in sets.items():
    for image_id in id_set:
        if image_id not in all_ids:
            all_ids[image_id] = set_name
            continue
        overlapping_ids.add(image_id)
        # On the first repeat, start the list with the set that held the ID first.
        overlapping_locations.setdefault(image_id, [all_ids[image_id]]).append(set_name)

# Overlapping ID -> list of the sets it exists in
overlapping_dict = {image_id: overlapping_locations[image_id] for image_id in overlapping_ids}

# Print the result
if not overlapping_ids:
    print("No overlapping IDs found. All IDs are unique across the sets.")
else:
    print(f"Overlapping IDs found: {overlapping_ids}")
    for image_id in overlapping_ids:
        print(f"ID {image_id} found in sets: {overlapping_locations[image_id]}")

print(overlapping_dict)

## Convert Category Abbreviations to Original Category Names

In [None]:
# Abbreviation -> original category name.
set_name_mapping = {
    'dw': 'drinking-water',
    'dc': 'drying-clothes',
    'fd': 'front-doors',
    'hw': 'hand-washing',
    'kc': 'kitchens',
    'lr': 'living-rooms',
    'pfd': 'places-for-dinner',
    'sl': 'sleeping',
    'wc': 'washing-clothes'
}

# Names that are already expanded. overlapping_dict is rewritten in place, so
# without this check a second run of the cell would look up full names as
# abbreviations and raise KeyError. Unknown names still raise.
_full_category_names = set(set_name_mapping.values())

for key, value in overlapping_dict.items():
    overlapping_dict[key] = [
        abbr if abbr in _full_category_names else set_name_mapping[abbr]
        for abbr in value
    ]

print(overlapping_dict)

## Remove Overlapping Image IDs From One of the Categories (the set with fewer overall samples/the set with the smallest region total)

In [None]:
# For a pair of categories sharing an image ID, the category that gives the ID up.
cols_to_drop = {('living-rooms', 'places-for-dinner'): 'places-for-dinner',
                ('drying-clothes', 'washing-clothes'): 'washing-clothes',
                ('hand-washing', 'washing-clothes'): 'hand-washing',
                ('drinking-water', 'hand-washing'): 'hand-washing',
                ('kitchens', 'places-for-dinner'): 'places-for-dinner'}

print(len(filtered_categories_dataframes['places-for-dinner']))
# A shallow copy is enough: frames are replaced below, never mutated in place,
# so filtered_categories_dataframes keeps its original frames.
no_dups_dfs = filtered_categories_dataframes.copy()
print(len(no_dups_dfs['places-for-dinner']))
for image_id, value in overlapping_dict.items():
    # Only the first two categories of an overlap are looked up.
    pair = (value[0], value[1])
    drop_col = cols_to_drop.get(pair)
    if drop_col is None:
        # Previously skipped silently, leaving the duplicate in place.
        print(f"WARNING: no drop rule for {pair}; image ID {image_id} is left duplicated.")
        continue
    if len(value) > 2:
        print(f"WARNING: image ID {image_id} is in more than two categories {value}; "
              f"it is only removed from {drop_col}.")
    frame = no_dups_dfs[drop_col]
    no_dups_dfs[drop_col] = frame[frame['id'] != image_id]

print(len(filtered_categories_dataframes['places-for-dinner']))
print(len(no_dups_dfs['places-for-dinner']))


## Print Out Total Samples Per Category

In [None]:
# Row totals per category, before and after duplicate removal.
for df_name in filtered_categories_dataframes:
    print(f"{df_name}: {len(filtered_categories_dataframes[df_name])}")
print("======================== AFTER REMOVING DUPLICATES ========================")
for df_name in no_dups_dfs:
    print(f"{df_name}: {len(no_dups_dfs[df_name])}")


## Check Total Samples By Region and Income and Downsample

In [None]:
downsampled_categories_df = dict()
for curr_category in CATEGORIES:
    df = no_dups_dfs[curr_category]
    
    group_counts, merged_df_curr = calc_quantiles_get_counts(df, curr_category)
    print(group_counts)
    smallest_num_sample = group_counts.groupby('income_group')['count'].min().min()
    print(f"Downsampling to {smallest_num_sample}...")
    temp_downsampled_df = merged_df_curr.groupby(['region', 'income_group'], as_index=False).apply(lambda x: x.sample(n=smallest_num_sample, replace=False), include_groups=False).reset_index(drop=True)
    downsampled_categories_df[curr_category] = temp_downsampled_df.merge(no_dups_dfs[curr_category], 
                                                                         how="inner").drop(['min', 'q25', 
                                                                                            'median', 'q75', 'max'], axis=1)
    assert len(temp_downsampled_df) == len(downsampled_categories_df[curr_category]), f"Number of rows in downsampled DF ({len(temp_downsampled_df)}) does not match number of rows in merged downsampled DF ({len(downsampled_categories_df[curr_category])})!"
    

## Print out Total Samples Per Category (Again)

In [None]:
for df_name, each in filtered_categories_dataframes.items():
    print(f"{df_name}: {len(each)}")
print("======================== AFTER REMOVING DUPLICATES ========================")
for df_name, each in no_dups_dfs.items():
    print(f"{df_name}: {len(each)}")
print("======================== AFTER DOWNSAMPLING ========================")
for df_name, each in downsampled_categories_df.items():
    print(f"{df_name}: {len(each)}")

## Save Downsampled Dataframes into S3

In [None]:
# Display the S3 key prefix that the upload cells below write under.
output_dir

In [None]:
# Write each category's downsampled frame to S3 as a CSV.
for curr_category in CATEGORIES:

    print(curr_category)
    curr_df = downsampled_categories_df[curr_category].copy()

    # Serialise in memory; no local file is written.
    csv_buffer = StringIO()
    curr_df.to_csv(csv_buffer, index=False)

    s3_file_path = f'{output_dir}/downsampled_df_{CONF_LEVEL}_{curr_category}.csv'

    # Upload the CSV to S3 unless uploads are disabled. The success message is
    # printed only when the upload really happened.
    if skip_code:
        print(f"skip_code is True: upload of {s3_file_path} skipped.")
    else:
        s3.put_object(Bucket=bucket_name, Key=s3_file_path, Body=csv_buffer.getvalue())
        print(f"DataFrame saved as CSV and uploaded to {s3_file_path} successfully.")

In [None]:
# Rebuild the per-category ID sets from the downsampled frames, printing each size.
_id_sets = []
for _name in (
    'drinking-water',
    'drying-clothes',
    'front-doors',
    'hand-washing',
    'kitchens',
    'living-rooms',
    'places-for-dinner',
    'washing-clothes',
):
    _ids = set(downsampled_categories_df[_name]['id'].tolist())
    print(len(_ids))
    _id_sets.append(_ids)

# Short names used by the duplicate check that follows.
dw, dc, fd, hw, kc, lr, pfd, wc = _id_sets

In [None]:
# If no ID is shared between categories, the union of all sets is exactly as
# large as the sum of the individual set sizes.
final_id_sets = (dw, dc, fd, hw, kc, lr, pfd, wc)
if len(set().union(*final_id_sets)) == sum(len(ids) for ids in final_id_sets):
    print("SUCCESS: No duplicates found in final dataframe!")
else:
    print("FAIL: Duplicates found in final dataframe! Make sure that all cells are not run more than once as it can overwrite variables.")

## Add S3 Image Links to DF

In [None]:
def add_image_s3_link(row, curr_category):
    """Return the S3 link of the image described by *row*.

    The link is the S3_OUTPUT_PREFIX environment variable followed by
    '//data/<curr_category>/<row id>.jpg'. The double slash is kept as is;
    presumably it matches object keys that start with '/data/' -- confirm.
    """
    url_prefix = os.environ.get('S3_OUTPUT_PREFIX')
    return f"{url_prefix}//data/{curr_category}/{row['id']}.jpg"

# Add an 'image_s3_link' column to each downsampled frame and write the result
# to S3 as a CSV.
for curr_category in CATEGORIES:

    print(curr_category)
    curr_df = downsampled_categories_df[curr_category].copy()

    # Build one link per row from the row's image ID and the current category.
    curr_df['image_s3_link'] = curr_df.apply(lambda row: add_image_s3_link(row, curr_category), axis=1)

    # Serialise in memory; no local file is written.
    csv_buffer = StringIO()
    curr_df.to_csv(csv_buffer, index=False)

    s3_file_path = f'{output_dir}/downsampled_df_with_imagelink_{CONF_LEVEL}_{curr_category}.csv'
    print(s3_file_path)

    # Upload the CSV to S3 unless uploads are disabled. The success message is
    # printed only when the upload really happened.
    if skip_code:
        print(f"skip_code is True: upload of {s3_file_path} skipped.")
    else:
        s3.put_object(Bucket=bucket_name, Key=s3_file_path, Body=csv_buffer.getvalue())
        print(f"DataFrame with image s3 links saved as CSV and uploaded to {s3_file_path} successfully.")
