### Toxigen SelfMA ###

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

from datasets import load_from_disk
# Load the saved SelfMA + ToxiGen dataset from Drive and expose its 'text'
# column under the name 'cleaned_text'.
balanced_selfMA_ds = load_from_disk(
    "/content/drive/MyDrive/266_project/balanced_selfMA_ds_toxigen_quote"
).rename_column('text', 'cleaned_text')

# Stack the train / validation / test splits into one frame
# (each split keeps its own row index).
all_data = pd.concat(
    [balanced_selfMA_ds[split].to_pandas() for split in ('train', 'validation', 'test')]
)

# Label 1 rows are treated as SelfMA (microaggressions);
# labels 0 and 2 are treated as ToxiGen (benign + toxic).
selfma_texts = all_data.loc[all_data['label'] == 1, 'cleaned_text'].tolist()
toxigen_texts = all_data.loc[all_data['label'].isin([0, 2]), 'cleaned_text'].tolist()

def _mean_cosine_similarity(a, b=None):
    """Return the mean pairwise cosine similarity between rows of `a` and `b`.

    When `b` is None the rows of `a` are compared with each other and the
    self-pairs on the diagonal are excluded. A self-pair scores 1.0 for any
    non-zero vector, so leaving the diagonal in inflates a within-corpus mean
    (more strongly for smaller corpora) and biases the cross/within ratio
    downward. Returns NaN when fewer than two rows are available for a
    within-corpus comparison.
    """
    if b is not None:
        return np.mean(cosine_similarity(a, b))

    n_rows = a.shape[0]
    if n_rows < 2:
        return np.float64(np.nan)
    sims = cosine_similarity(a)
    # Subtract the actual diagonal (all-zero rows contribute 0, not 1).
    return (sims.sum() - np.trace(sims)) / (n_rows * (n_rows - 1))


# Fit one shared TF-IDF vocabulary on both corpora so their vectors are comparable.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
all_texts = selfma_texts + toxigen_texts
vectors = vectorizer.fit_transform(all_texts)

# Rows are ordered SelfMA first, then ToxiGen (same order as `all_texts`).
selfma_vectors = vectors[:len(selfma_texts)]
toxigen_vectors = vectors[len(selfma_texts):]

selfma_sim = _mean_cosine_similarity(selfma_vectors)
toxigen_sim = _mean_cosine_similarity(toxigen_vectors)
cross_sim = _mean_cosine_similarity(selfma_vectors, toxigen_vectors)

# Ratio of cross-corpus similarity to the average within-corpus similarity.
avg_within = (selfma_sim + toxigen_sim) / 2
# A zero or NaN denominator means no usable within-corpus signal; report NaN.
ratio = cross_sim / avg_within if avg_within > 0 else float('nan')

# Interpret the cross/within similarity ratio with two fixed cut-offs:
# above 0.85 -> very similar, above 0.70 -> moderately similar, otherwise different.
# The status glyphs were mis-encoded (UTF-8 bytes read as Mac Roman); they are
# restored to the intended characters here.
if ratio > 0.85:
    print(f"✅ Ratio = {ratio:.2f} > 0.85")
    print("   Datasets are VERY similar stylistically.")
    print("   Strong evidence that model is learning semantic patterns, not style.")
elif ratio > 0.70:
    print(f"⚠️  Ratio = {ratio:.2f} (0.70-0.85)")
    print("   Datasets have moderate stylistic similarity.")
    print("   Model could be learning both semantics and some style.")
else:
    print(f"❌ Ratio = {ratio:.2f} < 0.70")
    print("   Datasets are quite different stylistically.")
    print("   Risk of model learning dataset artifacts rather than semantics.")

Mounted at /content/drive
❌ Ratio = 0.59 < 0.70
   Datasets are quite different stylistically.
   Risk of model learning dataset artifacts rather than semantics.


In [None]:
def text_stats(texts, name):
    """Print word- and character-length statistics for a collection of texts.

    Args:
        texts: Iterable of strings. Words are counted by whitespace splitting.
        name: Label printed in the section header.

    Returns:
        None. The statistics are only printed.

    An empty `texts` prints the header and a short notice instead of raising
    from `min()` / `max()` on an empty sequence.
    """
    word_counts = []
    char_counts = []
    for text in texts:
        word_counts.append(len(text.split()))
        char_counts.append(len(text))

    print(f"\n{name} Statistics:")
    if not word_counts:
        # Nothing to summarise; min()/max() would raise on an empty list.
        print("  (no texts to summarize)")
        return

    print(f"  Average words per text:      {np.mean(word_counts):.1f}")
    print(f"  Average characters per text: {np.mean(char_counts):.1f}")
    print(f"  Median words:                {np.median(word_counts):.1f}")
    print(f"  Min words:                   {min(word_counts)}")
    print(f"  Max words:                   {max(word_counts)}")

# Section banner, then the basic length statistics for each corpus.
print("=" * 70, "METHOD 1: BASIC TEXT STATISTICS", "=" * 70, sep="\n")
text_stats(texts=selfma_texts, name="SelfMA")
text_stats(texts=toxigen_texts, name="ToxiGen")

METHOD 1: BASIC TEXT STATISTICS

SelfMA Statistics:
  Average words per text:      15.5
  Average characters per text: 82.4
  Median words:                12.0
  Min words:                   1
  Max words:                   416

ToxiGen Statistics:
  Average words per text:      18.1
  Average characters per text: 94.9
  Median words:                18.0
  Min words:                   1
  Max words:                   102


In [None]:
def get_vocab(texts):
    """Return the set of distinct lowercased, whitespace-delimited tokens in `texts`.

    Tokens are not stripped of punctuation, so "you" and "you." count as
    different vocabulary entries.
    """
    return {token for text in texts for token in text.lower().split()}

# Build each corpus' vocabulary and measure how much of it is shared.
selfma_vocab = get_vocab(selfma_texts)
toxigen_vocab = get_vocab(toxigen_texts)

vocab_overlap = len(selfma_vocab & toxigen_vocab)  # Intersection
vocab_union = len(selfma_vocab | toxigen_vocab)    # Union
# Jaccard = |A ∩ B| / |A ∪ B|; report 0.0 instead of dividing by zero when
# both vocabularies are empty.
jaccard = vocab_overlap / vocab_union if vocab_union else 0.0

print("\n" + "="*70)
print("METHOD 2: VOCABULARY OVERLAP")
print("="*70)
print(f"SelfMA unique words:    {len(selfma_vocab):,}")
print(f"ToxiGen unique words:   {len(toxigen_vocab):,}")
print(f"Shared words:           {vocab_overlap:,}")
print(f"Jaccard similarity:     {jaccard:.3f}")
# The hint glyph was mis-encoded; restored to the intended light-bulb character.
print("\n💡 Jaccard > 0.5 = high overlap (similar vocabularies)")


METHOD 2: VOCABULARY OVERLAP
SelfMA unique words:    4,223
ToxiGen unique words:   6,193
Shared words:           1,673
Jaccard similarity:     0.191

💡 Jaccard > 0.5 = high overlap (similar vocabularies)


In [None]:
from collections import Counter

def top_words(texts, n=20):
    """Return the `n` most frequent tokens across `texts`.

    Texts are lowercased and split on whitespace; a small fixed list of
    English stop words is dropped before counting.

    Args:
        texts: Iterable of strings.
        n: Number of (token, count) pairs to return.

    Returns:
        List of (token, count) tuples ordered by descending count; ties keep
        first-seen order (as produced by `Counter.most_common`).
    """
    # Built once per call and stored as a set for constant-time membership
    # tests, instead of re-creating and scanning a list for every text.
    stop_words = frozenset((
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'is', 'are', 'was', 'were', 'be', 'been',
    ))

    counter = Counter()
    for text in texts:
        # Count tokens directly rather than buffering every token in a list.
        counter.update(w for w in text.lower().split() if w not in stop_words)
    return counter.most_common(n)

# Number of top words compared per corpus; used for the query, the headings
# and the overlap denominator so they cannot drift apart.
TOP_N = 15

selfma_top = top_words(selfma_texts, TOP_N)
toxigen_top = top_words(toxigen_texts, TOP_N)

print("\n" + "="*70)
print("METHOD 3: MOST COMMON WORDS")
print("="*70)
print(f"\nSelfMA Top {TOP_N}:")
for word, count in selfma_top:
    print(f"  {word:15s} {count:5d}")

print(f"\nToxiGen Top {TOP_N}:")
for word, count in toxigen_top:
    print(f"  {word:15s} {count:5d}")

# Check overlap in top words
selfma_top_words = {w for w, _ in selfma_top}
toxigen_top_words = {w for w, _ in toxigen_top}
top_overlap = len(selfma_top_words & toxigen_top_words)

print(f"\nTop words overlap: {top_overlap}/{TOP_N}")
# The hint glyph was mis-encoded; restored to the intended light-bulb character.
print("💡 More overlap = more similar language use")



METHOD 3: MOST COMMON WORDS

SelfMA Top 15:
  you               765
  i                 514
  that              231
  like              216
  you're            214
  don't             213
  so                205
  your              199
  have              185
  just              183
  not               151
  it                147
  it's              134
  they              133
  do                107

ToxiGen Top 15:
  i                 945
  you               790
  not               740
  they              708
  have              553
  that              502
  it                447
  if                391
  as                347
  people            332
  should            310
  there             253
  who               232
  all               223
  because           219

Top words overlap: 7/15
💡 More overlap = more similar language use


In [None]:
def punctuation_stats(texts, name):
    """Print and return the average number of `!`, `?`, `.` and `,` per text.

    Args:
        texts: Sequence of strings (must support `len()`).
        name: Label printed in the section header.

    Returns:
        Tuple `(exclamation, question, period, comma)` of per-text averages.
        All four are 0.0 when `texts` is empty, instead of raising
        ZeroDivisionError.
    """
    n_texts = len(texts)

    def _avg_count(mark):
        # Average occurrences of `mark` per text; 0.0 for an empty corpus.
        if n_texts == 0:
            return 0.0
        return sum(text.count(mark) for text in texts) / n_texts

    exclamation = _avg_count('!')
    question = _avg_count('?')
    period = _avg_count('.')
    comma = _avg_count(',')

    print(f"\n{name} Punctuation (avg per text):")
    print(f"  ! (exclamation): {exclamation:.2f}")
    print(f"  ? (question):    {question:.2f}")
    print(f"  . (period):      {period:.2f}")
    print(f"  , (comma):       {comma:.2f}")

    return exclamation, question, period, comma

# Section banner, then the punctuation averages for each corpus.
print("\n" + "=" * 70, "METHOD 4: PUNCTUATION PATTERNS", "=" * 70, sep="\n")
selfma_punct = punctuation_stats(texts=selfma_texts, name="SelfMA")
toxigen_punct = punctuation_stats(texts=toxigen_texts, name="ToxiGen")


METHOD 4: PUNCTUATION PATTERNS

SelfMA Punctuation (avg per text):
  ! (exclamation): 0.30
  ? (question):    0.37
  . (period):      1.24
  , (comma):       0.71

ToxiGen Punctuation (avg per text):
  ! (exclamation): 0.02
  ? (question):    0.04
  . (period):      0.29
  , (comma):       0.74


In [None]:
!pip install textstat
import textstat

Collecting textstat
  Downloading textstat-0.7.11-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.11-py3-none-any.whl (176 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m176.4/176.4 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.1/2.1 MB[0m [31m72.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.2 textstat-0.7.11


In [None]:
def calculate_readability(texts, name):
    """Print and return corpus-level averages of several textstat readability scores.

    Each metric is evaluated on every text individually and then averaged
    with `np.mean`.

    Args:
        texts: Iterable of strings (iterated once per metric).
        name: Label printed in the section header.

    Returns:
        Dict mapping metric key to its mean score, with keys
        'flesch_reading_ease', 'flesch_kincaid_grade',
        'dale_chall_readability_score', 'gunning_fog', 'smog', 'ari',
        'coleman_liau'.
    """
    # (result key, printed label, printed note, per-text scorer)
    metrics = (
        ('flesch_reading_ease', 'Flesch Reading Ease:', 'higher = easier',
         textstat.flesch_reading_ease),
        ('flesch_kincaid_grade', 'Flesch-Kincaid Grade:', 'grade level',
         textstat.flesch_kincaid_grade),
        ('dale_chall_readability_score', 'Dale-Chall Readability Score:', 'readability score',
         textstat.dale_chall_readability_score),
        ('gunning_fog', 'Gunning Fog Index:', 'grade level',
         textstat.gunning_fog),
        ('smog', 'SMOG Index:', 'grade level',
         textstat.smog_index),
        ('ari', 'Automated Readability:', 'grade level',
         textstat.automated_readability_index),
        ('coleman_liau', 'Coleman-Liau Index:', 'grade level',
         textstat.coleman_liau_index),
    )

    # Score every text under each metric, keeping only the corpus mean.
    averages = {}
    for key, _label, _note, scorer in metrics:
        averages[key] = np.mean([scorer(text) for text in texts])

    print(f"\n{name} Readability Scores:")
    for key, label, note, _scorer in metrics:
        # Labels are left-padded to a fixed width so the values line up.
        print(f"  {label:<33}{averages[key]:.1f} ({note})")

    return averages

# Section banner, then readability averages for the two corpora.
print("=" * 70, "READABILITY ANALYSIS", "=" * 70, sep="\n")

selfma_readability = calculate_readability(texts=selfma_texts, name="SelfMA")
toxigen_readability = calculate_readability(texts=toxigen_texts, name="ToxiGen")


READABILITY ANALYSIS

SelfMA Readability Scores:
  Flesch Reading Ease:             85.4 (higher = easier)
  Flesch-Kincaid Grade:            3.8 (grade level)
  Dale-Chall Readability Score:    6.5 (readability score)
  Gunning Fog Index:               6.1 (grade level)
  SMOG Index:                      6.3 (grade level)
  Automated Readability:           4.3 (grade level)
  Coleman-Liau Index:              4.1 (grade level)

ToxiGen Readability Scores:
  Flesch Reading Ease:             67.1 (higher = easier)
  Flesch-Kincaid Grade:            8.0 (grade level)
  Dale-Chall Readability Score:    7.7 (readability score)
  Gunning Fog Index:               10.2 (grade level)
  SMOG Index:                      9.1 (grade level)
  Automated Readability:           7.7 (grade level)
  Coleman-Liau Index:              6.9 (grade level)


### SBIC SelfMA Readability ###

In [None]:
print("=" * 70, "READABILITY ANALYSIS", "=" * 70, sep="\n")

# NOTE: this cell rebinds `balanced_selfMA_ds`, `all_data`, `selfma_texts` and
# `selfma_readability`; from here on they refer to the SelfMA + SBIC dataset.
balanced_selfMA_ds = load_from_disk(
    "/content/drive/MyDrive/266_project/balanced_selfMA_ds"
).rename_column('text', 'cleaned_text')

# Stack the train / validation / test splits into one frame.
all_data = pd.concat(
    [balanced_selfMA_ds[split].to_pandas() for split in ('train', 'validation', 'test')]
)

# Label 1 rows are treated as SelfMA (microaggressions); label 0 rows as SBIC (benign).
selfma_texts = all_data.loc[all_data['label'] == 1, 'cleaned_text'].tolist()
sbic_texts = all_data.loc[all_data['label'] == 0, 'cleaned_text'].tolist()

selfma_readability = calculate_readability(texts=selfma_texts, name="SelfMA")
sbic_readability = calculate_readability(texts=sbic_texts, name="SBIC")


READABILITY ANALYSIS

SelfMA Readability Scores:
  Flesch Reading Ease:             70.9 (higher = easier)
  Flesch-Kincaid Grade:            7.0 (grade level)
  Dale-Chall Readability Score:    8.1 (readability score)
  Gunning Fog Index:               9.2 (grade level)
  SMOG Index:                      9.5 (grade level)
  Automated Readability:           7.5 (grade level)
  Coleman-Liau Index:              7.0 (grade level)

SBIC Readability Scores:
  Flesch Reading Ease:             69.4 (higher = easier)
  Flesch-Kincaid Grade:            7.4 (grade level)
  Dale-Chall Readability Score:    9.5 (readability score)
  Gunning Fog Index:               9.5 (grade level)
  SMOG Index:                      8.7 (grade level)
  Automated Readability:           8.7 (grade level)
  Coleman-Liau Index:              8.1 (grade level)


### Workplace MA Readability ###

In [None]:
micro_agg_url = "https://huggingface.co/spaces/khanak27/microaggressionsdetector/resolve/main/micro_agg.csv"
# Candidate encodings, strictest first:
#   - 'utf-8-sig' reads UTF-8 with or without a BOM, so a separate 'utf-8' try is redundant;
#   - 'cp1252' can still reject a few undefined bytes;
#   - 'latin-1' maps every byte to a character and therefore never raises a
#     decode error, so it must come last (anything listed after it is unreachable).
encodings_to_try = ['utf-8-sig', 'cp1252', 'latin-1']

df_micro = None
for encoding in encodings_to_try:
    try:
        print(f"Trying encoding: {encoding}")
        df_micro = pd.read_csv(micro_agg_url, encoding=encoding)
    except UnicodeDecodeError as e:
        # Wrong encoding guess; move on to the next candidate.
        print(f"❌ Failed with {encoding}: {str(e)[:100]}...")
    except Exception as e:
        # Best-effort: report non-decoding failures and keep trying.
        print(f"❌ Other error with {encoding}: {str(e)[:100]}...")
    else:
        print(f"✅ Successfully loaded with {encoding} encoding")
        break

if df_micro is None:
    # Every candidate failed; last resort is UTF-8 with undecodable bytes replaced.
    print("❌ Failed to load dataset with any encoding. Trying with error handling...")
    try:
        df_micro = pd.read_csv(micro_agg_url, encoding='utf-8', encoding_errors='replace')
        print("✅ Loaded with UTF-8 and error replacement")
    except Exception as e:
        print(f"❌ Final attempt failed: {e}")
        raise

Trying encoding: utf-8
❌ Failed with utf-8: 'utf-8' codec can't decode byte 0xe2 in position 17: invalid continuation byte...
Trying encoding: latin-1
✅ Successfully loaded with latin-1 encoding


In [None]:
# Split the workplace dataset's 'speech' column by label:
# label 1 = microaggressions, label 0 = benign.
df_micro_positive_labels = df_micro.loc[df_micro['label'] == 1, 'speech'].tolist()
df_micro_negative_labels = df_micro.loc[df_micro['label'] == 0, 'speech'].tolist()

df_micro_positive_labels_readability = calculate_readability(
    texts=df_micro_positive_labels, name="Workplace MA - microaggressions"
)
df_micro_negative_labels_readability = calculate_readability(
    texts=df_micro_negative_labels, name="Workplace MA - nonmicroaggressions"
)



Workplace MA - microaggressions Readability Scores:
  Flesch Reading Ease:             80.3 (higher = easier)
  Flesch-Kincaid Grade:            3.7 (grade level)
  Dale-Chall Readability Score:    6.0 (readability score)
  Gunning Fog Index:               6.6 (grade level)
  SMOG Index:                      6.6 (grade level)
  Automated Readability:           4.7 (grade level)
  Coleman-Liau Index:              5.5 (grade level)

Workplace MA - nonmicroaggressions Readability Scores:
  Flesch Reading Ease:             74.6 (higher = easier)
  Flesch-Kincaid Grade:            4.6 (grade level)
  Dale-Chall Readability Score:    8.6 (readability score)
  Gunning Fog Index:               6.5 (grade level)
  SMOG Index:                      6.6 (grade level)
  Automated Readability:           5.5 (grade level)
  Coleman-Liau Index:              7.5 (grade level)
