# Script to Preprocess Annotations

Author: Nardiena A. Pratama

In [None]:
!pip install wordsegment autocorrect 

!pip3 install opencv-python
!sudo apt-get update && sudo apt-get install ffmpeg libsm6 libxext6  -y

In [None]:
!pip install spacy==3.8.0

In [None]:
!python -m spacy download en_core_web_trf

In [None]:
import os
from io import StringIO

import boto3
import pandas as pd

from helper_scripts.preprocess import *
from helper_scripts.utility_functions import *


## Set AWS Credentials

Set `BUCKET_NAME` below to the name of your S3 bucket. Do not put quotation marks around the value.

In [None]:
%env BUCKET_NAME=aws_bucket_name

## Connect to AWS

In [None]:
# Create a session using the default credentials (IAM role attached to the instance)
session = boto3.Session()

# Create an S3 client
s3 = session.client('s3')

# Read the bucket name from the BUCKET_NAME environment variable set above
bucket_name = os.getenv('BUCKET_NAME')
if not bucket_name:
    # Fail here with a clear message rather than letting a later S3 call
    # fail because the Bucket argument is None.
    raise RuntimeError(
        "BUCKET_NAME environment variable is not set; run the %env cell above first."
    )


## Read files containing human labels

In [None]:
# Download the human-annotation CSV from S3 and parse it into a DataFrame
key = '/data/resultswithgoodworkeronly.csv'
response = s3.get_object(Bucket=bucket_name, Key=key)
payload = response['Body'].read()
csv_content = payload.decode('utf-8')
human_df = pd.read_csv(StringIO(csv_content))

In [None]:
human_df.head()

## Extract Categories and Video IDs

In [None]:
def extract_cat_VID(row):
    """Return the ``(category, VID)`` pair encoded in a row's image URL.

    The text before the first ".jpg" in ``row['Input.image_url']`` is split
    on "/"; the second-to-last segment is the category and the last segment
    is the video/image ID.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) that has an
            ``'Input.image_url'`` entry.

    Returns:
        Tuple ``(category, VID)`` of strings.

    Raises:
        IndexError: If the URL stem has fewer than two "/"-separated segments.
    """
    url_stem, _, _ = row['Input.image_url'].partition(".jpg")
    tail_segments = url_stem.split("/")[-2:]
    return tail_segments[0], tail_segments[1]

In [None]:
# Parse every image URL once and expand the (category, VID) tuple into two
# columns, instead of running a separate row-wise apply (and URL parse) for
# each new column.
cat_vid = human_df.apply(extract_cat_VID, axis=1, result_type='expand')
human_df['category'] = cat_vid[0]
human_df['VID'] = cat_vid[1]
human_df

In [None]:
human_df.columns

## Calculate Number of Images Per Category

In [None]:
# Count the distinct images seen for every category.
# cat_set remembers each "<category>/<image_id>" pair already counted;
# cat_dict maps a category to its number of distinct images.
cat_dict = {}
cat_set = set()
for image_url in human_df['Input.image_url']:
    category, image_id = image_url.split(".jpg")[0].split("/")[-2:]
    cat_img_id = f"{category}/{image_id}"
    if cat_img_id in cat_set:
        # This image was already counted for its category
        continue
    cat_set.add(cat_img_id)
    cat_dict[category] = cat_dict.get(category, 0) + 1


In [None]:
# Distinct category names found in the human annotations; later cells use
# this to select the per-category files to load.
CATEGORIES = human_df['category'].unique()
CATEGORIES

In [None]:
human_df.groupby("VID")

In [None]:
# human_df_cleaned = human_df.groupby("VID").apply(
#     lambda group: group[~group.duplicated(subset=columns_of_interest, keep="first")]
# ).reset_index(drop=True)
# human_df_cleaned

## Calculate Average Num of Tokens/Words per Tag and Average Num of Characters per Tag

Each worker provides 5-10 tags (words or phrases). Here we calculate the average number of tokens/words and the average number of characters per tag.

In [None]:
# Initialise the word and tag counters
total_words = 0
total_tags = 0

# Every worker-answer column, i.e. any column whose name contains "Answer.tag"
columns_of_interest = [name for name in human_df.columns if "Answer.tag" in name]
print(columns_of_interest)

# Missing tags become empty strings so that string methods work on every cell
human_df[columns_of_interest] = human_df[columns_of_interest].fillna("")


# Calculate words per row
def calculate_total_words_in_row(row):
    """Return the number of whitespace-separated words across a row's tags.

    Args:
        row: Iterable of strings (one entry per tag cell). Cells that are
            empty or contain only whitespace contribute nothing.

    Returns:
        Total word count over all non-empty cells.
    """
    return sum(len(tag.split()) for tag in row if tag.strip())


def replace_duplicates_with_empty(group):
    """Blank out tags that already occurred earlier in the same group.

    Rows are visited in index order and, within a row, the tag columns in
    ``columns_of_interest`` order. The first occurrence of a tag value is
    kept; every later occurrence anywhere in the group is replaced with an
    empty string. Comparison is exact (no case or whitespace normalisation).

    Args:
        group: DataFrame containing the ``columns_of_interest`` columns
            (at the call site: all rows sharing one VID).

    Returns:
        A modified copy of ``group``; the input itself is left untouched.
    """
    deduplicated = group.copy()
    already_seen = set()

    for row_label in group.index:
        row_tags = group.loc[row_label, columns_of_interest]
        for column in columns_of_interest:
            value = row_tags[column]
            if value not in already_seen:
                # First time this tag appears in the group: keep it
                already_seen.add(value)
            else:
                deduplicated.loc[row_label, column] = ""

    return deduplicated

# De-duplicate tags inside every VID group.
# The groups are iterated explicitly instead of using ``groupby(...).apply``:
# applying a function to groups that still contain the grouping column is
# deprecated since pandas 2.2, and without that column ``VID`` would be
# missing from the result.
deduplicated_groups = [
    replace_duplicates_with_empty(group) for _, group in human_df.groupby('VID')
]
if deduplicated_groups:
    human_df = pd.concat(deduplicated_groups).reset_index(drop=True)

# Row-level total: number of words a worker entered across all tag columns
human_df['total_words_in_row'] = human_df[columns_of_interest].apply(calculate_total_words_in_row, axis=1)

# Overall average words per row across all rows
average_words_per_row = human_df['total_words_in_row'].mean()

# Collect every non-empty tag once; the dataset-level statistics below are
# all derived from this single pass over the tag columns.
non_empty_tags = [
    cell
    for col in columns_of_interest
    for cell in human_df[col]
    if cell.strip() != ""
]

# Total words and tags for overall stats
total_words = sum(len(cell.split()) for cell in non_empty_tags)
total_non_empty_tags = len(non_empty_tags)
average_words_per_non_empty_tag = total_words / total_non_empty_tags if total_non_empty_tags > 0 else 0

# Total characters and average characters per non-empty tag
total_characters = sum(len(cell) for cell in non_empty_tags)
average_characters_per_non_empty_tag = total_characters / total_non_empty_tags if total_non_empty_tags > 0 else 0

# Results
print(f"Average Words per Non-Empty Tag: {average_words_per_non_empty_tag:.2f}")
print(f"Average Characters per Non-Empty Tag: {average_characters_per_non_empty_tag:.2f}")
print(f"Average Words per Worker Entry: {average_words_per_row:.2f}")

## Preprocess human labels

- remove noise to get rid of punctuation
- make lower case to make consistent
- don't remove stop words: words like "on" and "in" may be valuable for showing spatial relationships in the image annotations and are needed for contextual nuance

- don't lemmatize or stem, because valuable context could be stripped away, e.g., "wash" and "washing" could have different meanings
- use autocorrect to fix typos (not perfect)
- segment words in case there is no space between words

In [None]:
run_preprocessing = False

In [None]:
%%time
assert run_preprocessing == True, "run_preprocessing is set to False! Setting it to true will run preprocessing functions, which can take up to 50 mins!"

# The ten worker-answer columns: Answer.tag1 ... Answer.tag10
tag_columns = [f'Answer.tag{tag_number}' for tag_number in range(1, 11)]
human_preprocessed = preprocessing_human(human_df, 'human_labels', tag_columns)

# Serialise the result to CSV in memory and upload it to S3
file_path = "/data/outputs_50/human_labels_preprocessed.csv"
csv_buffer = StringIO()
human_preprocessed.to_csv(csv_buffer, index=False)
s3.put_object(Bucket=bucket_name, Key=file_path, Body=csv_buffer.getvalue())

print(f"DataFrame saved as CSV and uploaded to {file_path} successfully.")

Concatenate labels to one column

In [None]:
# Load the previously uploaded preprocessed human labels back from S3
file_path = "/data/outputs_50/human_labels_preprocessed.csv"
response = s3.get_object(Bucket=bucket_name, Key=file_path)
payload = response['Body'].read()
csv_content = payload.decode('utf-8')
human_preprocessed = pd.read_csv(StringIO(csv_content))
human_preprocessed['labels']

In [None]:
# Count the distinct images per category in the preprocessed annotations.
# cat_set remembers each "<category>/<image_id>" pair already counted;
# cat_dict maps a category to its number of distinct images.
cat_dict = {}
cat_set = set()
for image_url in human_preprocessed['Input.image_url']:
    category, image_id = image_url.split(".jpg")[0].split("/")[-2:]
    cat_img_id = f"{category}/{image_id}"
    if cat_img_id in cat_set:
        # This image was already counted for its category
        continue
    cat_set.add(cat_img_id)
    cat_dict[category] = cat_dict.get(category, 0) + 1
cat_dict

In [None]:
len(human_preprocessed['Input.image_url'].unique())

In [None]:
print(f"Unique IDs in Human Annotations: {len(set(human_preprocessed.VID))}")

### Group labels together for each ID 

In [None]:
# Rows without preprocessed labels cannot contribute to the joined label string
human_preprocessed.dropna(subset=['labels'], inplace=True)
original_order = human_preprocessed.columns.tolist()

# Modify Human DF so that Each VID has Only One Row:
# join all label strings of a VID with commas and keep the first value of
# every other column.
grouped = human_preprocessed.groupby('VID').agg({
    'labels': lambda x: ','.join(x),
    **{col: 'first' for col in original_order if col not in ['VID', 'labels']}
}).reset_index()

# .copy() turns the column subset into an independent frame, so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
human_preprocessed = grouped[original_order].copy()

# Single-column transform: apply directly on the Series instead of row-wise
human_preprocessed['labels'] = human_preprocessed['labels'].apply(remove_extra_commas)
print(f"Unique ids: {len(human_preprocessed.VID.unique())}")
# human_preprocessed.to_csv('data/outputs/human_filtered.csv', index=False)

# Serialise the grouped frame to CSV in memory and upload it to S3
csv_buffer = StringIO()
human_preprocessed.to_csv(csv_buffer, index=False)


file_path = "/data/outputs_50/human_labels_grouped_preprocessed.csv"
s3.put_object(Bucket=bucket_name, Key=file_path, Body=csv_buffer.getvalue())

print(f"DataFrame saved as CSV and uploaded to {file_path} successfully.")


human_preprocessed

## Retrieve All Image Data Pertaining to Region/Income

In [None]:
folder_path = '/data/outputs_50'

# Object listings come back in pages, so walk them with a paginator
paginator = s3.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=folder_path)

downsampled_prefix = f'{folder_path}/downsampled'

# Keep the CSV keys that start with "<folder_path>/downsampled", mention at
# least one known category and contain the substring "imagelink".
csv_files = []
for page in page_iterator:
    for obj in page.get('Contents', []):
        key = obj['Key']
        if not key.endswith('.csv'):
            continue
        if not key.startswith(downsampled_prefix):
            continue
        if not any(category in key for category in CATEGORIES):
            continue
        if "imagelink" in key:
            csv_files.append(key)
print(csv_files)


# Download each selected CSV into a DataFrame and store it in a dict keyed
# by the last "_"-separated token of the key (with ".csv" removed).
downsampled_categories_dataframes = {}
for file_key in csv_files:
    response = s3.get_object(Bucket=bucket_name, Key=file_key)
    payload = response['Body'].read()
    csv_content = payload.decode('utf-8')
    df = pd.read_csv(StringIO(csv_content))
    key_stem = file_key.split(".csv")[0]
    df_key = key_stem.split(folder_path)[-1].split("_")[-1]
    downsampled_categories_dataframes[df_key] = df


### Concatenate all dfs

In [None]:
# Tag every per-category frame with its dict key as the topic (this mutates
# the stored frames in place, as before).
for topic, topic_df in downsampled_categories_dataframes.items():
    topic_df['topics'] = topic

# Concatenate along rows in a single call. Growing a frame with repeated
# pd.concat is quadratic, and seeding it with an empty DataFrame triggers
# pandas' FutureWarning about empty entries in concatenation.
if downsampled_categories_dataframes:
    downsampled_dataframes = pd.concat(list(downsampled_categories_dataframes.values()), axis=0)
else:
    downsampled_dataframes = pd.DataFrame()

In [None]:
downsampled_dataframes.topics.value_counts()

In [None]:
human_preprocessed.category.value_counts()

In [None]:
set(downsampled_dataframes.id.unique())- set(human_preprocessed.VID.unique())

### Merge Human Labels DF with Downsampled DF

In [None]:
# Inner-join the downsampled image metadata with the grouped human labels
# (id == VID), keep the columns needed downstream, and rename the label
# column so it can be told apart from the ML labels later on.
downsampled_human_df = (
    downsampled_dataframes
    .merge(human_preprocessed, how='inner', left_on=['id'], right_on=['VID'])
    [['id', 'category', 'images', 'labels', 'region', 'country', 'income']]
    .rename(columns={'labels': 'human_labels'})
)
downsampled_human_df

In [None]:
downsampled_human_df.category.value_counts()

## Preprocess ML Annotations

In [None]:
CATEGORIES

### Read ML Captions and Labels For Each Image, Preprocess, Read into DF, and Concatenate into One Dataframe

In [None]:
run_preprocessing = False

In [None]:
assert run_preprocessing == True, "run_preprocessing is set to False! Setting it to true will run preprocessing functions, which can take up to 50 mins!"

CONF_LEVEL = 50
output_dir = f'/data/outputs_{CONF_LEVEL}'


def _load_ml_annotations(category, file_key, column_name):
    """Load one pickled ML annotation dict from S3 as a DataFrame.

    The entry stored under ``category`` is run through
    ``preprocessing_ml_labels`` and the result is returned with the columns
    ``id``, ``column_name`` and ``category``.

    Args:
        category: Category whose entry in the pickled dict is processed.
        file_key: S3 key of the pickle file.
        column_name: Name given to the column holding the processed text.

    Returns:
        DataFrame with one row per index entry of the loaded data.
    """
    # NOTE(review): assumes the pickle holds {category: {image_id: text}} and
    # that the bucket content is trusted (pickle must never be loaded from
    # untrusted sources) -- confirm.
    results = read_pickle_from_s3(s3, bucket_name, file_key)
    results[category] = preprocessing_ml_labels(results[category])

    frame = pd.DataFrame(results)
    frame['category'] = category
    frame.reset_index(inplace=True)
    frame.rename(columns={category: column_name, "index": "id"}, inplace=True)
    return frame


per_category_frames = []
for curr_category in CATEGORIES:
    print(curr_category)
    file_key_labels = f'{output_dir}/{curr_category}_ml_labels_dict_12-09-2024.pickle'
    file_key_captions = f'{output_dir}/{curr_category}_ml_captions_dict_12-09-2024.pickle'

    # NOTE(review): the previous version loaded the *labels* pickle into the
    # "ml_captions" column and the *captions* pickle into "ml_labels". The
    # pairing below follows the file names; verify against the actual pickle
    # contents before re-running.
    temp_caption_df = _load_ml_annotations(curr_category, file_key_captions, "ml_captions")
    temp_label_df = _load_ml_annotations(curr_category, file_key_labels, "ml_labels")

    temp_ml_df = pd.merge(temp_caption_df, temp_label_df, how='inner', on=['id', 'category'])
    per_category_frames.append(temp_ml_df)

# Concatenate once at the end instead of growing a frame inside the loop
all_ml_df = pd.concat(per_category_frames, axis=0) if per_category_frames else pd.DataFrame()

# Serialise the combined frame to CSV in memory and upload it to S3
csv_buffer = StringIO()
all_ml_df.to_csv(csv_buffer, index=False)


file_path = "/data/outputs_50/ml_labels_preprocessed.csv"
s3.put_object(Bucket=bucket_name, Key=file_path, Body=csv_buffer.getvalue())

print(f"DataFrame saved as CSV and uploaded to {file_path} successfully.")


In [None]:
# Load the previously uploaded preprocessed ML annotations back from S3
file_path = "/data/outputs_50/ml_labels_preprocessed.csv"
response = s3.get_object(Bucket=bucket_name, Key=file_path)
payload = response['Body'].read()
csv_content = payload.decode('utf-8')
all_ml_df = pd.read_csv(StringIO(csv_content))
all_ml_df

### Calculate Average Number of Words Per Caption and Average Number of Characters Per Caption

In [None]:
# Use the vectorised string accessors rather than ``apply`` with str methods:
# a caption that is missing (an empty caption is read back from CSV as NaN)
# then yields NaN, which ``mean`` skips, instead of raising AttributeError.
caption_text = all_ml_df['ml_captions']

# Calculate average words per caption
all_ml_df['word_count'] = caption_text.str.split().str.len()
average_words_per_caption = all_ml_df['word_count'].mean()

# Calculate average characters per caption
all_ml_df['char_count'] = caption_text.str.len()
average_characters_per_caption = all_ml_df['char_count'].mean()

# Results
print(f"Average Num of Words per Caption: {average_words_per_caption:.2f}")
print(f"Average Num of Characters per Caption: {average_characters_per_caption:.2f}")


### Calculate Average Number of Words Per Object Label, Average Number of Characters Per Object Label, and Average Num of Object Labels per Prediction

In [None]:
def avg_words_per_label(label_string):
    """Return the mean number of words per label in a comma-separated string.

    Labels are separated by commas; surrounding whitespace is ignored and
    empty entries (from an empty string or stray commas) are not counted as
    labels. ``str.split(',')`` never returns an empty list, so without this
    filtering the "no labels" case could never be detected.

    Args:
        label_string: Comma-separated labels, e.g. ``"red car,dog"``.

    Returns:
        Average word count per label, or 0 when there are no labels.
    """
    labels = [part.strip() for part in label_string.split(',')]
    labels = [label for label in labels if label]
    if not labels:
        return 0
    return sum(len(label.split()) for label in labels) / len(labels)

def avg_chars_per_label(label_string):
    """Return the mean number of characters per label in a comma-separated string.

    Labels are separated by commas. Whitespace around a label (e.g. the space
    after a comma) is not counted, and empty entries are not treated as
    labels. Spaces inside a multi-word label are counted.

    Args:
        label_string: Comma-separated labels, e.g. ``"cat, dog"``.

    Returns:
        Average character count per label, or 0 when there are no labels.
    """
    labels = [part.strip() for part in label_string.split(',')]
    labels = [label for label in labels if label]
    if not labels:
        return 0
    return sum(len(label) for label in labels) / len(labels)

def avg_words_per_row(label_string):
    """Return the number of labels in a comma-separated label string.

    Despite its name, this counts labels, not words: it is used to report
    the number of object labels per prediction. Empty entries (from an empty
    string or stray commas) are not counted.

    Args:
        label_string: Comma-separated labels, e.g. ``"cat,dog"``.

    Returns:
        Number of non-empty labels.
    """
    return sum(1 for part in label_string.split(',') if part.strip())

# Per-row metrics derived from the comma-separated object labels
label_text = all_ml_df['ml_labels']
for metric_name, metric_fn in (
    ('avg_words_per_label', avg_words_per_label),
    ('avg_chars_per_label', avg_chars_per_label),
    ('avg_words_per_row', avg_words_per_row),
):
    all_ml_df[metric_name] = label_text.apply(metric_fn)

# Dataset-level averages of the per-row metrics
average_words_per_label = all_ml_df['avg_words_per_label'].mean()
average_chars_per_label = all_ml_df['avg_chars_per_label'].mean()
average_words_per_row = all_ml_df['avg_words_per_row'].mean()

# Report
print(f"Average Num of Words per Object Label: {average_words_per_label:.2f}")
print(f"Average Num of Characters per Object Label: {average_chars_per_label:.2f}")
print(f"Average Num of Object Labels per Prediction: {average_words_per_row:.2f}")

In [None]:
all_ml_df['ml_labels'].iloc[0]

### Merge ML Captions/Labels DF with Downsampled DF

In [None]:
# Inner-join the ML annotations with the downsampled image metadata on the
# image id and on category (called "topics" in the downsampled frame)
downsampled_ml_df = all_ml_df.merge(
    downsampled_dataframes,
    how='inner',
    left_on=["id", "category"],
    right_on=["id", "topics"],
)
downsampled_ml_df

In [None]:
downsampled_ml_df.category.value_counts()

In [None]:
len(downsampled_ml_df.id.unique())

In [None]:
downsampled_human_df

### Merge Downsampled ML and Human Dataframes

In [None]:
# Inner-join the ML and human frames. No join keys are given, so pandas joins
# on every column name the two frames have in common.
final_columns = ['id', 'category', 'country', 'income', 'region', 'human_labels' ,'ml_captions', 'ml_labels']
downsamped_all_df = (
    downsampled_ml_df
    .merge(downsampled_human_df, how="inner")
    [final_columns]
    .copy()
)
downsamped_all_df

In [None]:
# Number of unique labels
# Size of the dict built by convert_series_to_dict for each ML column
ml_label_entries = convert_series_to_dict(downsamped_all_df, 'ml_labels')
ml_caption_entries = convert_series_to_dict(downsamped_all_df, 'ml_captions')
print(f"Number of unique ML object labels in dataset: {len(ml_label_entries)}")
print(f"Number of unique ML caption labels in dataset: {len(ml_caption_entries)}")

# Number of distinct image ids in the merged frame
print(f"Number of unique VIDs in ML dataset: {len(downsamped_all_df.id.unique())}")

In [None]:
# Number of unique labels in the human annotations (nonusa)
# Size of the dict built by convert_series_to_dict for the human labels
human_label_entries = convert_series_to_dict(downsamped_all_df, 'human_labels')
print(f"Number of unique labels in human dataset: {len(human_label_entries)}")

# Number of distinct image ids in the merged frame
print(f"Number of unique VIDs in human dataset: {len(downsamped_all_df.id.unique())}")


In [None]:
# Summary for the merged dataset. All three label sources live in the same
# frame, so the unique-VID count is computed once and reused.
unique_vid_count = len(downsamped_all_df.id.unique())

# The object-label and caption counts previously shared one message, which
# made the two output lines impossible to tell apart.
print(f"Number of unique ML object labels in dataset: {len(convert_series_to_dict(downsamped_all_df, 'ml_labels'))}")
print(f"Number of unique VIDs in ML dataset: {unique_vid_count}")

print(f"Number of unique ML caption labels in dataset: {len(convert_series_to_dict(downsamped_all_df, 'ml_captions'))}")
print(f"Number of unique VIDs in ML dataset: {unique_vid_count}")

print(f"Number of unique labels in human dataset: {len(convert_series_to_dict(downsamped_all_df, 'human_labels'))}")
print(f"Number of unique VIDs in human dataset: {unique_vid_count}")

In [None]:
# Serialise the combined ML + human frame to CSV in memory, then upload it
file_path = "/data/outputs_50/final_combined_ml_human.csv"
csv_buffer = StringIO()
downsamped_all_df.to_csv(csv_buffer, index=False)
s3.put_object(Bucket=bucket_name, Key=file_path, Body=csv_buffer.getvalue())

print(f"DataFrame saved as CSV and uploaded to {file_path} successfully.")

In [None]:
# Load the uploaded combined dataset back from S3
file_path = "/data/outputs_50/final_combined_ml_human.csv"
response = s3.get_object(Bucket=bucket_name, Key=file_path)
payload = response['Body'].read()
csv_content = payload.decode('utf-8')
final_combined_ml_human = pd.read_csv(StringIO(csv_content))
final_combined_ml_human

In [None]:
final_combined_ml_human.region.value_counts()

In [None]:
final_combined_ml_human.category.value_counts()