In [15]:
# Colab-only step: bring the annotators' workbooks into the runtime so the
# relative-path pd.read_excel calls in the next cell can find them.
# `uploaded` keeps the return value of files.upload(); it is not referenced
# again in the cells visible here.
from google.colab import files
uploaded = files.upload()

Saving 300ReportsToBeAnnotated-By Rezza.xlsx to 300ReportsToBeAnnotated-By Rezza.xlsx
Saving 300ReportsToBeAnnotated-Mathew.xlsx to 300ReportsToBeAnnotated-Mathew.xlsx
Saving Nicole-300ReportsToBeAnnotated.xlsx to Nicole-300ReportsToBeAnnotated.xlsx


In [18]:
import pandas as pd
import numpy as np

# Workbook for each annotator; only the first worksheet of each is read.
annotation_workbooks = {
    "mathew": "300ReportsToBeAnnotated-Mathew.xlsx",
    "rezza": "300ReportsToBeAnnotated-By Rezza.xlsx",
    "nicole": "Nicole-300ReportsToBeAnnotated.xlsx",
}
mathew_df = pd.read_excel(annotation_workbooks["mathew"], sheet_name=0)
rezza_df = pd.read_excel(annotation_workbooks["rezza"], sheet_name=0)
nicole_df = pd.read_excel(annotation_workbooks["nicole"], sheet_name=0)

# Spreadsheet header -> standardized column stem used in the output.
# The leading space in " Gender Bias" is deliberate: the key must equal the
# header exactly for the membership checks further down to succeed.
bias_column_mapping = dict([
    (" Gender Bias", "Gender_Bias"),
    ("Religion Bias", "Religion_Bias"),
    ("Age Bias", "Age_Bias"),
    ("Disability Bias", "Disability_Bias"),
    ("Sexuality Bias", "Sexuality_Bias"),
])

bias_types = [standardized for standardized in bias_column_mapping.values()]

# Mathew's sheet supplies the identifying columns for the combined table.
identifier_columns = ['ReportID', 'SentenceNum', 'Sentence_Text', 'Prompt_Type']
combined_df = mathew_df[identifier_columns].copy()

# Use the 'Company' column when the sheet has one; otherwise fall back to
# 'Prompt_Type'.
company_source = 'Company' if 'Company' in mathew_df.columns else 'Prompt_Type'
combined_df['Company'] = mathew_df[company_source]

# Function to calculate confidence based on number of annotations
def get_confidence_level(count):
    """Translate a count of positive annotations into a confidence score.

    Args:
        count: Number of annotators who flagged the sentence.

    Returns:
        0.0 for a count of 0, 0.33 for 1, 0.67 for 2, and 1.0 for any
        other value.
    """
    graded_levels = ((0, 0.0), (1, 0.33), (2, 0.67))
    for flagged, score in graded_levels:
        if count == flagged:
            return score
    # Anything that matched none of the entries above gets the top score.
    return 1.0

# Function to check if a value represents a positive bias
def is_positive_bias(value):
    """Return True when an annotation cell marks the sentence as biased.

    A cell counts as positive when it is either
      * numeric (or a numeric string) and strictly greater than zero, or
      * a string that, ignoring surrounding whitespace and letter case,
        is one of "true", "yes" or "y".

    Missing values (anything pd.isna reports as missing) and every other
    value are treated as not positive.

    Args:
        value: A single cell value from an annotation column.

    Returns:
        bool: whether the cell is a positive annotation.
    """
    if pd.isna(value):
        return False

    try:
        return float(value) > 0
    except (TypeError, ValueError, OverflowError):
        # Not convertible to a number; fall through to the textual check.
        # Only conversion failures are caught so that unrelated problems
        # (e.g. KeyboardInterrupt) are no longer swallowed.
        pass

    if isinstance(value, str):
        # Case-insensitive so that "YES" / "TRUE" are recognised as well.
        return value.strip().lower() in {'true', 'yes', 'y'}
    return False

# Process each bias type
#
# For every bias column, count how many of the three annotators marked each
# sentence as positive, then store a 0/1 flag (any annotator positive) and a
# confidence value derived from that count.
#
# Rows are matched across the three files by index label.
# NOTE(review): this assumes all three workbooks list the sentences in the
# same order -- TODO confirm (e.g. by comparing ReportID/SentenceNum).
annotator_frames = {"Mathew": mathew_df, "Rezza": rezza_df, "Nicole": nicole_df}

for annotator_name, annotator_df in annotator_frames.items():
    if not annotator_df.index.equals(combined_df.index):
        print(f"Warning: row index of {annotator_name}'s file ({len(annotator_df)} rows) "
              f"does not match the combined table ({len(combined_df)} rows); "
              f"unmatched rows count as not annotated")

for original_col, standardized_col in bias_column_mapping.items():
    final_col = f'Final_{standardized_col}'
    confidence_col = f'{standardized_col}_Confidence_Value'

    # Initialize the output columns so they exist even when the bias type is skipped
    combined_df[final_col] = 0
    combined_df[confidence_col] = 0.0

    # Skip (loudly) if any annotator's file lacks this column; otherwise the
    # all-zero column would be indistinguishable from "no bias found".
    missing_in = [annotator_name for annotator_name, annotator_df in annotator_frames.items()
                  if original_col not in annotator_df.columns]
    if missing_in:
        print(f"Skipping '{original_col}': column not found for {', '.join(missing_in)}")
        continue

    print(f"Processing '{original_col}' -> '{standardized_col}'")

    # Per-sentence number of annotators with a positive mark. reindex() lines
    # each file up with combined_df; labels absent from a file become NaN,
    # which is_positive_bias treats as not positive.
    positive_count = sum(
        annotator_df[original_col]
        .reindex(combined_df.index)
        .map(is_positive_bias)
        .astype(int)
        for annotator_df in annotator_frames.values()
    )

    # Set bias flag and confidence
    combined_df[final_col] = (positive_count > 0).astype(int)
    combined_df[confidence_col] = positive_count.map(get_confidence_level).astype(float)

    bias_counter = int(combined_df[final_col].sum())
    print(f"Total sentences with {standardized_col}: {bias_counter}")

# Export layout: identifying columns first, then every Final_* flag,
# then every *_Confidence_Value column.
identifier_names = ['ReportID', 'SentenceNum', 'Sentence_Text', 'Prompt_Type']
flag_names = [f'Final_{bias}' for bias in bias_types]
confidence_names = [f'{bias}_Confidence_Value' for bias in bias_types]

# Keep only the names that are actually present in combined_df.
output_columns = [
    name
    for name in identifier_names + flag_names + confidence_names
    if name in combined_df.columns
]

# Write the selected columns to CSV without the index.
output_file = "final_combined_annotations_with_confidencevalues.csv"
export_df = combined_df[output_columns]
export_df.to_csv(output_file, index=False)
print(f"\nCombined annotations saved to {output_file}")

# Print summary statistics
#
# For each bias type: how many sentences were flagged, what share of all
# sentences that is, and how the flagged sentences split by confidence tier.
print("\nAnnotation Statistics:")

# (annotators who flagged the sentence, stored confidence value, tier label).
# The annotator count is listed explicitly rather than derived from the
# confidence value: int(0.33 * 3) truncates 0.99 to 0, which made the Low
# tier print as "0/3 annotators".
confidence_tiers = [(1, 0.33, "Low"), (2, 0.67, "Medium"), (3, 1.0, "High")]

for standardized_col in bias_types:
    col_name = f'Final_{standardized_col}'
    if col_name in combined_df.columns:
        bias_count = combined_df[col_name].sum()
        total = len(combined_df)
        # Guard against an empty table, where the ratio is undefined
        percent = (bias_count / total) * 100 if total else 0.0

        print(f"\n{standardized_col}:")
        print(f"  Total biased sentences: {bias_count} out of {total} ({percent:.2f}%)")

        # Count by confidence level
        for annotator_count, level, level_name in confidence_tiers:
            count = ((combined_df[col_name] == 1) &
                    (combined_df[f'{standardized_col}_Confidence_Value'] == level)).sum()

            print(f"  {level_name} confidence ({annotator_count}/3 annotators): {count} sentences")

Processing ' Gender Bias' -> 'Gender_Bias'
Total sentences with Gender_Bias: 165
Processing 'Religion Bias' -> 'Religion_Bias'
Total sentences with Religion_Bias: 200
Processing 'Age Bias' -> 'Age_Bias'
Total sentences with Age_Bias: 183
Processing 'Disability Bias' -> 'Disability_Bias'
Total sentences with Disability_Bias: 158
Processing 'Sexuality Bias' -> 'Sexuality_Bias'
Total sentences with Sexuality_Bias: 36

Combined annotations saved to final_combined_annotations_with_confidencevalues.csv

Annotation Statistics:

Gender_Bias:
  Total biased sentences: 165 out of 3299 (5.00%)
  Low confidence (0/3 annotators): 104 sentences
  Medium confidence (2/3 annotators): 54 sentences
  High confidence (3/3 annotators): 7 sentences

Religion_Bias:
  Total biased sentences: 200 out of 3299 (6.06%)
  Low confidence (0/3 annotators): 125 sentences
  Medium confidence (2/3 annotators): 63 sentences
  High confidence (3/3 annotators): 12 sentences

Age_Bias:
  Total biased sentences: 183 out of