In [None]:
# Mount Google Drive at /content/drive so the project CSVs read below are reachable (Colab-only module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Calculate Readability Metrics for Components of selfMA_Generated Dataset (Generated Non-Micros vs. Original selfMA Micros)

In [None]:
import pandas as pd


# Read the train / validation / test splits of the combined selfMA-generated dataset from Drive.
generated_micro_train, generated_micro_validation, generated_micro_test = [
    pd.read_csv(split_path)
    for split_path in (
        "/content/drive/MyDrive/266_project/selfma_generated_combined_train_split.csv",
        "/content/drive/MyDrive/266_project/selfma_generated_combined_validation_split.csv",
        "/content/drive/MyDrive/266_project/selfma_generated_combined_test_split.csv",
    )
]


In [None]:
# Stack the train, validation, and test splits into one frame;
# ignore_index=True discards the per-split row labels and renumbers from 0.
generated_combined = pd.concat(
    (generated_micro_train, generated_micro_validation, generated_micro_test),
    ignore_index=True,
)


In [None]:
# Sanity check: preview the first rows of the combined frame (text + label columns).

generated_combined.head()


Unnamed: 0,text,label
0,Women and men are equally capable and intellig...,0
1,People should be free to express themselves an...,0
2,"At least, this time, Muslims are not privileged.",1
3,Which country are you from?,0
4,"You don't want to buy a house, it's too much t...",1


In [None]:
# Partition the combined data by label so each group's reading level can be measured separately:
# label 0 rows go to df_generated, label 1 rows go to df_original.
# .copy() detaches each subset from generated_combined.
df_generated = generated_combined.loc[generated_combined["label"] == 0].copy()
df_original = generated_combined.loc[generated_combined["label"] == 1].copy()

# Report (rows, columns) for each subset.
print("Generated Text Dataset Shape:", df_generated.shape)
print("Original Text Dataset Shape:", df_original.shape)

Generated Text Dataset Shape: (1300, 2)
Original Text Dataset Shape: (1300, 2)


In [None]:
# Sanity check: preview the label-1 subset (row labels are still those of generated_combined here).
df_original.head()

Unnamed: 0,text,label
2,"At least, this time, Muslims are not privileged.",1
4,"You don't want to buy a house, it's too much t...",1
7,HI! Could you please follow us and help us get...,1
9,"I'm male, and am asking this question to non-b...",1
10,You smell different. I have always dated Cauca...,1


In [None]:
# Renumber both subsets from 0, dropping the row labels inherited from generated_combined.
df_generated, df_original = (
    subset.reset_index(drop=True) for subset in (df_generated, df_original)
)


In [None]:
# Sanity check: after the index reset, the row labels should start again at 0.
df_original.head()

Unnamed: 0,text,label
0,"At least, this time, Muslims are not privileged.",1
1,"You don't want to buy a house, it's too much t...",1
2,HI! Could you please follow us and help us get...,1
3,"I'm male, and am asking this question to non-b...",1
4,You smell different. I have always dated Cauca...,1


In [None]:
# Install and import the dependencies for the readability tests (versions are not pinned).

!pip install -q textstat tqdm

import numpy as np
from tqdm import tqdm
import textstat
import math  # NOTE(review): not referenced in the cells visible here — confirm it is still needed

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.4/176.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.1/2.1 MB[0m [31m88.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:


def safe_metric(func, text):
    """Call ``func(text)`` and return its result, or NaN if the call raises.

    Any ``Exception`` raised by ``func`` is swallowed and replaced with
    ``np.nan``; the error itself is not logged or re-raised.
    """
    try:
        value = func(text)
    except Exception:
        value = np.nan
    return value

def readability_metrics(text: str):
    """Score one text with a fixed set of textstat metrics plus simple counts.

    The input is converted with ``str()`` and stripped of surrounding
    whitespace first, so non-string values are scored as their string form.
    Each textstat call goes through ``safe_metric``, so a metric that raises
    is reported as NaN instead of aborting the whole row.

    Returns:
        dict mapping metric name to value, in a fixed key order, ending with
        ``word_count`` (whitespace-separated tokens) and ``char_count``
        (length of the stripped text).
    """
    cleaned = str(text).strip()

    # (output key, scorer) pairs; order here is the column order of the result.
    scorers = (
        ("flesch_reading_ease", textstat.flesch_reading_ease),
        ("flesch_kincaid_grade", textstat.flesch_kincaid_grade),
        ("gunning_fog", textstat.gunning_fog),
        ("smog_index", textstat.smog_index),
        ("coleman_liau_index", textstat.coleman_liau_index),
        ("automated_readability_index", textstat.automated_readability_index),
        ("dale_chall_readability", textstat.dale_chall_readability_score),
        ("linsear_write", textstat.linsear_write_formula),
        ("difficult_words", textstat.difficult_words),
        ("avg_sentence_length", textstat.avg_sentence_length),
        ("avg_syllables_per_word", textstat.avg_syllables_per_word),
        ("text_standard_grade", lambda t: textstat.text_standard(t, float_output=False)),
    )

    metrics = {key: safe_metric(scorer, cleaned) for key, scorer in scorers}
    metrics["word_count"] = len(cleaned.split()) if cleaned else 0
    metrics["char_count"] = len(cleaned)
    return metrics

def compute_readability_df(df_in: pd.DataFrame, text_col="text") -> pd.DataFrame:
    """Compute readability metrics for every value in ``df_in[text_col]``.

    Args:
        df_in: Frame holding the texts to score.
        text_col: Name of the column containing the text.

    Returns:
        A new DataFrame with one row per input row (same order, fresh
        0..n-1 index) and one column per key from ``readability_metrics``.
    """
    texts = df_in[text_col].tolist()
    progress = tqdm(texts, desc=f"Computing readability for '{text_col}'")
    return pd.DataFrame([readability_metrics(entry) for entry in progress])


# Score both subsets; each metrics frame has one row per input row, in the same order.
read_orig = compute_readability_df(df_original, text_col="text")
read_gen = compute_readability_df(df_generated, text_col="text")

# Put the metric columns next to the source rows. The per-group suffix keeps
# metric column names distinct between the two frames.
df_orig_metrics = pd.concat(
    [df_original.reset_index(drop=True), read_orig.add_suffix("_orig")],
    axis=1,
)
df_gen_metrics = pd.concat(
    [df_generated.reset_index(drop=True), read_gen.add_suffix("_gen")],
    axis=1,
)

# =========================
# Summary comparison
# =========================
# Metrics averaged in the summary ("text_standard_grade" is not included).
NUMERIC_COLS = [
    "flesch_reading_ease",
    "flesch_kincaid_grade",
    "gunning_fog",
    "smog_index",
    "coleman_liau_index",
    "automated_readability_index",
    "dale_chall_readability",
    "linsear_write",
    "difficult_words",
    "avg_sentence_length",
    "avg_syllables_per_word",
    "word_count",
    "char_count",
]

# One row per metric: mean over each group, then the difference Generated - Original.
summary_gen = pd.DataFrame({"metric": NUMERIC_COLS})
summary_gen["Original_mean"] = [read_orig[col].mean() for col in NUMERIC_COLS]
summary_gen["Generated_mean"] = [read_gen[col].mean() for col in NUMERIC_COLS]
summary_gen["Delta(Gen-Orig)"] = (
    summary_gen["Generated_mean"] - summary_gen["Original_mean"]
)

print("\n=== Readability Comparison: Original vs Generated ===")
print(summary_gen)



  return func(text)
Computing readability for 'text': 100%|██████████| 1300/1300 [00:02<00:00, 535.78it/s] 
Computing readability for 'text': 100%|██████████| 1300/1300 [00:01<00:00, 1293.77it/s]


=== Readability Comparison: Original vs Generated ===
                         metric  Original_mean  Generated_mean  \
0           flesch_reading_ease      85.381980       72.480116   
1          flesch_kincaid_grade       3.751233        5.543826   
2                   gunning_fog       6.065988        7.482917   
3                    smog_index       6.312119        7.768108   
4            coleman_liau_index       4.094519        6.246904   
5   automated_readability_index       4.284074        6.020216   
6        dale_chall_readability       6.465403        7.657453   
7                 linsear_write       4.537381        5.141462   
8               difficult_words       1.852308        2.214615   
9           avg_sentence_length       9.664685        9.636679   
10       avg_syllables_per_word       1.319662        1.472502   
11                   word_count      15.467692       11.022308   
12                   char_count      82.409231       63.056923   

    Delta(Gen-Orig) 




# Workplace MA Readability Scores: Difference between Microaggressions and Non-Microaggressions

In [None]:
micro_agg_url = "https://huggingface.co/spaces/khanak27/microaggressionsdetector/resolve/main/micro_agg.csv"

# Candidate encodings, strictest first. 'latin-1' maps every possible byte to a
# character and therefore never raises UnicodeDecodeError, so it has to be the
# last entry: anything listed after it could never be reached on a decode
# failure. 'cp1252' is tried before it because it rejects a few undefined bytes
# and decodes Windows punctuation (curly quotes, dashes) to real characters.
# 'iso-8859-1' is an alias of 'latin-1' and is therefore not listed separately.
encodings_to_try = ['utf-8', 'utf-8-sig', 'cp1252', 'latin-1']

df_micro = None
for encoding in encodings_to_try:
    try:
        print(f"Trying encoding: {encoding}")
        df_micro = pd.read_csv(micro_agg_url, encoding=encoding)
    except UnicodeDecodeError as e:
        # Wrong encoding guess: report it and move on to the next candidate.
        print(f"❌ Failed with {encoding}: {str(e)[:100]}...")
    except Exception as e:
        # Best-effort: non-decoding errors are reported and the next candidate is still tried.
        print(f"❌ Other error with {encoding}: {str(e)[:100]}...")
    else:
        print(f"✅ Successfully loaded with {encoding} encoding")
        break

# Last resort when every attempt above failed: decode as UTF-8 and substitute
# undecodable bytes instead of raising.
if df_micro is None:
    print("❌ Failed to load dataset with any encoding. Trying with error handling...")
    try:
        df_micro = pd.read_csv(micro_agg_url, encoding='utf-8', encoding_errors='replace')
        print("✅ Loaded with UTF-8 and error replacement")
    except Exception as e:
        print(f"❌ Final attempt failed: {e}")
        raise

Trying encoding: utf-8
❌ Failed with utf-8: 'utf-8' codec can't decode byte 0xe2 in position 17: invalid continuation byte...
Trying encoding: latin-1
✅ Successfully loaded with latin-1 encoding


In [None]:
# EDA: preview the first rows of the loaded workplace dataset (speech + label columns).
df_micro.head()

Unnamed: 0,speech,label
0,You're very articulate for someone like you.,1
1,Where are you really from?,1
2,You're not like other girls.,1
3,You must be good at math since you're Asian.,1
4,You're too pretty to be a software engineer.,1


In [None]:
# Save a copy of the loaded dataset as df_micro.csv in the current working directory, without the index column.
df_micro.to_csv("df_micro.csv", index=False)

In [None]:
# Partition the workplace data by label so each group's reading level can be measured separately:
# label 1 rows go to workplace_micros, label 0 rows go to workplace_non_micros.
# .copy() detaches each subset from df_micro.
workplace_micros = df_micro.loc[df_micro["label"] == 1].copy()
workplace_non_micros = df_micro.loc[df_micro["label"] == 0].copy()

# Report (rows, columns) for each subset.
print("Micros Dataset Shape:", workplace_micros.shape)
print("Non-Micros Dataset Shape:", workplace_non_micros.shape)

Micros Dataset Shape: (84, 2)
Non-Micros Dataset Shape: (87, 2)


In [None]:
# Sanity check: preview the label-0 subset (row labels are still those of df_micro here).
workplace_non_micros.head()

Unnamed: 0,speech,label
84,Feel free to suggest any edits.,0
85,Let me know your availability for review.,0
86,I'll push the changes to GitHub.,0
87,Please add your comments by EOD.,0
88,We'll walk through the changes tomorrow.,0


In [None]:
# Renumber both subsets from 0, dropping the row labels inherited from df_micro.
workplace_non_micros, workplace_micros = (
    subset.reset_index(drop=True)
    for subset in (workplace_non_micros, workplace_micros)
)


In [None]:
# Sanity check: after the index reset, the non-micro row labels should start again at 0.
workplace_non_micros.head()

Unnamed: 0,speech,label
0,Feel free to suggest any edits.,0
1,Let me know your availability for review.,0
2,I'll push the changes to GitHub.,0
3,Please add your comments by EOD.,0
4,We'll walk through the changes tomorrow.,0


In [None]:
# Sanity check: preview the label-1 (microaggression) subset after the index reset.
workplace_micros.head()

Unnamed: 0,speech,label
0,You're very articulate for someone like you.,1
1,Where are you really from?,1
2,You're not like other girls.,1
3,You must be good at math since you're Asian.,1
4,You're too pretty to be a software engineer.,1


In [None]:
#compute readability metrics for WorkplaceMA
def safe_metric(func, text):
    """Call ``func(text)`` and return its result, or NaN if the call raises.

    Any ``Exception`` raised by ``func`` is swallowed and replaced with
    ``np.nan``; the error itself is not logged or re-raised.

    NOTE(review): a function with this name is also defined in an earlier
    cell of this notebook; this later definition is the one in effect from
    here on.
    """
    try:
        value = func(text)
    except Exception:
        value = np.nan
    return value

def readability_metrics(text: str):
    """Score one text with a fixed set of textstat metrics plus simple counts.

    The input is converted with ``str()`` and stripped of surrounding
    whitespace first, so non-string values are scored as their string form.
    Each textstat call goes through ``safe_metric``, so a metric that raises
    is reported as NaN instead of aborting the whole row.

    NOTE(review): a function with this name is also defined in an earlier
    cell of this notebook; this later definition is the one in effect from
    here on.

    Returns:
        dict mapping metric name to value, in a fixed key order, ending with
        ``word_count`` (whitespace-separated tokens) and ``char_count``
        (length of the stripped text).
    """
    cleaned = str(text).strip()

    # (output key, scorer) pairs; order here is the column order of the result.
    scorers = (
        ("flesch_reading_ease", textstat.flesch_reading_ease),
        ("flesch_kincaid_grade", textstat.flesch_kincaid_grade),
        ("gunning_fog", textstat.gunning_fog),
        ("smog_index", textstat.smog_index),
        ("coleman_liau_index", textstat.coleman_liau_index),
        ("automated_readability_index", textstat.automated_readability_index),
        ("dale_chall_readability", textstat.dale_chall_readability_score),
        ("linsear_write", textstat.linsear_write_formula),
        ("difficult_words", textstat.difficult_words),
        ("avg_sentence_length", textstat.avg_sentence_length),
        ("avg_syllables_per_word", textstat.avg_syllables_per_word),
        ("text_standard_grade", lambda t: textstat.text_standard(t, float_output=False)),
    )

    metrics = {key: safe_metric(scorer, cleaned) for key, scorer in scorers}
    metrics["word_count"] = len(cleaned.split()) if cleaned else 0
    metrics["char_count"] = len(cleaned)
    return metrics

def compute_readability_df(df_in: pd.DataFrame, text_col="speech") -> pd.DataFrame:
    """Compute readability metrics for every value in ``df_in[text_col]``.

    NOTE(review): a function with this name is also defined in an earlier
    cell of this notebook with a different ``text_col`` default; this later
    definition is the one in effect from here on.

    Args:
        df_in: Frame holding the texts to score.
        text_col: Name of the column containing the text.

    Returns:
        A new DataFrame with one row per input row (same order, fresh
        0..n-1 index) and one column per key from ``readability_metrics``.
    """
    texts = df_in[text_col].tolist()
    progress = tqdm(texts, desc=f"Computing readability for '{text_col}'")
    return pd.DataFrame([readability_metrics(entry) for entry in progress])


# Score both workplace subsets; each metrics frame has one row per input row, in the same order.
read_micros = compute_readability_df(workplace_micros, text_col="speech")
read_non_micros  = compute_readability_df(workplace_non_micros, text_col="speech")

# Attach metrics back to the original rows (suffix to avoid name collisions).
# NOTE(review): the "_orig" / "_gen" suffixes match the ones used in the selfMA cell;
# here they tag the microaggression and non-microaggression rows respectively.
# They are left unchanged so the existing column names stay stable.
workplace_micros_metrics = pd.concat([workplace_micros.reset_index(drop=True), read_micros.add_suffix("_orig")], axis=1)
workplace_non_micros_metrics  = pd.concat([workplace_non_micros.reset_index(drop=True),  read_non_micros.add_suffix("_gen")],  axis=1)

# =========================
# Summary comparison
# =========================
# Metrics averaged in the summary ("text_standard_grade" is not included).
NUMERIC_COLS = [
    "flesch_reading_ease",
    "flesch_kincaid_grade",
    "gunning_fog",
    "smog_index",
    "coleman_liau_index",
    "automated_readability_index",
    "dale_chall_readability",
    "linsear_write",
    "difficult_words",
    "avg_sentence_length",
    "avg_syllables_per_word",
    "word_count",
    "char_count",
]

# One row per metric: the mean over each group.
summary = pd.DataFrame({
    "metric": NUMERIC_COLS,
    "Workplace Microaggressions Mean":  [read_micros[c].mean() for c in NUMERIC_COLS],
    "Workplace Non-Micros Mean": [read_non_micros[c].mean()  for c in NUMERIC_COLS],
})
# The difference is non-micro mean minus micro mean, so the column label states that
# direction (the previous label, "Delta(Micro-Nonmicro)", named the opposite sign).
summary["Delta(Nonmicro-Micro)"] = summary["Workplace Non-Micros Mean"] - summary["Workplace Microaggressions Mean"]

print("\n=== Readability Comparison: Microaggressions vs Non-Microaggressions ===")
print(summary)



  return func(text)
Computing readability for 'speech': 100%|██████████| 84/84 [00:00<00:00, 3305.48it/s]
Computing readability for 'speech': 100%|██████████| 87/87 [00:00<00:00, 2812.56it/s]


=== Readability Comparison: Microaggressions vs Non-Microaggressions ===
                         metric  Workplace Microaggressions Mean  \
0           flesch_reading_ease                        80.335916   
1          flesch_kincaid_grade                         3.662947   
2                   gunning_fog                         6.566289   
3                    smog_index                         6.572495   
4            coleman_liau_index                         5.528563   
5   automated_readability_index                         4.674707   
6        dale_chall_readability                         5.954461   
7                 linsear_write                         2.904762   
8               difficult_words                         0.880952   
9           avg_sentence_length                         6.476190   
10       avg_syllables_per_word                         1.417562   
11                   word_count                         6.523810   
12                   char_count           




# Compare Differences in Readability Scores between Components (Micro/Non-Micro) of WorkplaceMA and Components (Generated/Original Text) of selfMA_Generated

In [None]:
# Pull the per-group means out of the two summary tables as Series keyed by metric name,
# so the divisions below line up metric-by-metric.
workplace_micro_means = summary.set_index("metric")["Workplace Microaggressions Mean"]
workplace_non_micro_means = summary.set_index("metric")["Workplace Non-Micros Mean"]
orig_means = summary_gen.set_index("metric")["Original_mean"]
gen_means = summary_gen.set_index("metric")["Generated_mean"]

# Ratio of group means per metric; a zero denominator would give +/-inf, which is turned into NaN.
work_ratio = (
    (workplace_non_micro_means / workplace_micro_means)
    .replace([np.inf, -np.inf], np.nan)
    .rename("Workplace_Ratio (Nonmicro / Micro)")
)
origgen_ratio = (
    (gen_means / orig_means)
    .replace([np.inf, -np.inf], np.nan)
    .rename("Original_Generated_Ratio (Generated / Original)")
)

# Place both ratio columns side by side and move the metric names back into a regular column.
# The rename only has an effect if the restored column comes back as "index".
ratio_table = (
    pd.concat([work_ratio, origgen_ratio], axis=1)
    .reset_index()
    .rename(columns={"index": "metric"})
)

# Widen pandas' display limits (these options stay set for later cells too).
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 120)

print("\n======================= Ratio Table: WorkplaceMA & Generated Microaggressions Dataset =======================")
print(ratio_table.to_string(index=False))


                     metric  Workplace_Ratio (Nonmicro / Micro)  Original_Generated_Ratio (Generated / Original)
        flesch_reading_ease                            0.928707                                         0.848892
       flesch_kincaid_grade                            1.243482                                         1.477868
                gunning_fog                            0.997121                                         1.233586
                 smog_index                            1.011155                                         1.230666
         coleman_liau_index                            1.365443                                         1.525675
automated_readability_index                            1.184941                                         1.405255
     dale_chall_readability                            1.445074                                         1.184374
              linsear_write                            1.072357                                