In [None]:
import pandas as pd

In [None]:
!pip install playwright pandas
!python -m playwright install --with-deps chromium


Collecting playwright
  Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl (45.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.55.0 pyee-13.0.0
Installing dependencies...
Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:4 https://cli.github.com/packages stable InRelease
Hit:5 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:6 https://ppa.launchpadc

In [None]:
import re
from playwright.async_api import async_playwright

# Tag listing page that the scraper opens.
TAG_URL = "https://www.microaggressions.com/tagged/race"
# Navigation timeout passed to page.goto, in milliseconds.
TIMEOUT_MS = 30_000
# Pause after each scroll step, in milliseconds (tune up/down).
SCROLL_PAUSE_MS = 800
# Hard ceiling on the number of scroll steps (safety cap).
MAX_SCROLLS = 600

# ---------- color helpers ----------
def parse_rgb(s):
    """Extract the (r, g, b) integers from an ``rgb(...)`` / ``rgba(...)`` string.

    The first occurrence anywhere in ``s`` is used and any alpha component is
    ignored. ``None``, an empty string, or text without a match yields the
    sentinel ``(999, 999, 999)``.
    """
    no_match = (999, 999, 999)
    found = re.search(r'rgba?\((\d+),\s*(\d+),\s*(\d+)', s or "")
    if found is None:
        return no_match
    red, green, blue = (int(part) for part in found.groups())
    return (red, green, blue)

def is_black(rgb):
    """Return True when every channel of the (r, g, b) triple is below 22."""
    red, green, blue = rgb
    return all(channel < 22 for channel in (red, green, blue))

def is_pink(rgb):
    """Return True when the (r, g, b) triple falls inside either pink band.

    Band 1: r >= 200, g <= 170, b >= 110 and r >= b.
    Band 2: r >= 235, g <= 120 and b >= 150.
    """
    red, green, blue = rgb
    in_first_band = red >= 200 and green <= 170 and blue >= 110 and red >= blue
    if in_first_band:
        return True
    return red >= 235 and green <= 120 and blue >= 150

# JavaScript run in the page against one element. It walks from the element up
# through its ancestors and returns [r, g, b] for the first background colour
# that is not fully transparent. Any colour with alpha 0 counts as transparent
# (not only transparent black), so e.g. rgba(255, 255, 255, 0) no longer stops
# the walk. Colours that cannot be parsed yield the [999, 999, 999] sentinel.
GET_EFFECTIVE_BG_JS = """
(node) => {
  // Returns [r, g, b, alpha] (alpha defaults to 1) or null when s is not rgb()/rgba().
  function parseColor(s){
    const m = /rgba?\\((\\d+),\\s*(\\d+),\\s*(\\d+)(?:,\\s*([\\d.]+))?/.exec(s||"");
    if (!m) return null;
    const alpha = m[4] === undefined ? 1 : parseFloat(m[4]);
    return [parseInt(m[1], 10), parseInt(m[2], 10), parseInt(m[3], 10), alpha];
  }
  // RGB part only; unparseable input keeps the sentinel so callers see "not black".
  function rgbOf(s){
    const c = parseColor(s);
    return c ? c.slice(0, 3) : [999,999,999];
  }
  let cur = node;
  while (cur) {
    const bg = window.getComputedStyle(cur).backgroundColor;
    if (bg && bg !== 'transparent') {
      const c = parseColor(bg);
      // Keep climbing only when the colour is fully transparent (alpha 0).
      if (!c || c[3] !== 0) return rgbOf(bg);
    }
    cur = cur.parentElement;
  }
  const bodyBG = window.getComputedStyle(document.body).backgroundColor;
  if (bodyBG) return rgbOf(bodyBG);
  return [0,0,0];
}
"""

# ---------- #2: auto-scroll until stable ----------
async def infinite_scroll(page, max_scrolls=MAX_SCROLLS, pause_ms=SCROLL_PAUSE_MS,
                          stable_rounds=2, settle_ms=400):
    """Scroll to the bottom of the page until its height stops growing.

    Args:
        page: Playwright page to scroll.
        max_scrolls: Maximum number of scroll steps before giving up.
        pause_ms: Milliseconds to wait after each scroll step before the
            height is measured again.
        stable_rounds: Number of consecutive measurements with an unchanged
            ``document.body.scrollHeight`` that ends the loop. Raise this for
            pages whose loader is slower than ``pause_ms``.
        settle_ms: Milliseconds to wait after jumping back to the top.

    Returns:
        None. On return the page is scrolled back to the top.
    """
    # small nudge first (helps some loaders)
    await page.evaluate("""
      window.scrollTo(0, 0);
      window.dispatchEvent(new Event('scroll'));
      window.dispatchEvent(new Event('resize'));
    """)
    last_height = await page.evaluate("() => document.body.scrollHeight")
    stable = 0
    for _ in range(max_scrolls):
        await page.evaluate("() => window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(pause_ms)
        new_height = await page.evaluate("() => document.body.scrollHeight")
        # Count consecutive no-growth measurements; any growth resets the count.
        if new_height == last_height:
            stable += 1
        else:
            stable = 0
        last_height = new_height
        if stable >= stable_rounds:
            break
    # return to top for extraction
    await page.evaluate("() => window.scrollTo(0, 0)")
    await page.wait_for_timeout(settle_ms)

# ---------- scraping ----------
async def extract_pink_on_black_quotes(page):
    """Collect visible text that is rendered in a pink colour on a black background.

    Args:
        page: Playwright page to inspect.

    Returns:
        A list of dicts with keys ``"quote"`` (whitespace-normalised text) and
        ``"permalink"`` (the container's on-domain link, or ``TAG_URL`` when
        none was found). Entries are de-duplicated on the pair
        (lower-cased quote, permalink).
    """
    rows, seen = [], set()

    # Fallback search roots: the first match of each generic layout selector.
    # These are only used when no post-like container is found below.
    roots = []
    for sel in ["main", "#content", ".content", ".posts", "article", "body"]:
        root = await page.query_selector(sel)
        if root:
            roots.append(root)
    if not roots:
        roots = [page]

    post_containers = await page.query_selector_all(
        "article, .post, .entry, .post-container, .blog-post, .postContainer"
    )
    container_nodes = post_containers if post_containers else roots

    for container in container_nodes:
        # try to associate a permalink: selectors are tried in order and the
        # first one whose first match has an href containing the site's domain wins
        permalink = ""
        for a_sel in ['a[rel="bookmark"]', 'h1 a', 'h2 a', '.permalink a', 'a.permalink', 'a[href*="/post/"]']:
            a = await container.query_selector(a_sel)
            if a:
                href = await a.get_attribute("href")
                if href and "microaggressions.com" in href:
                    permalink = href
                    break

        # traverse descendants: every element under the container is inspected
        # with its own evaluate call (and a second one if its colour is pink)
        # NOTE(review): a pink wrapper and its pink child are both visited; they
        # only collapse via `seen` when their normalised text is identical — confirm
        # that this is the intended granularity.
        for el in await container.query_selector_all("*"):
            try:
                # Returns null for elements hidden via display/visibility,
                # otherwise the computed text colour and the trimmed innerText.
                styles = await el.evaluate("""(node) => {
                    const cs = window.getComputedStyle(node);
                    if (cs.display === 'none' || cs.visibility === 'hidden') return null;
                    const text = (node.innerText || "").trim();
                    return {color: cs.color, text};
                }""")
                if not styles:
                    continue
                text = styles.get("text") or ""
                # Keep only texts of 3 to 120 whitespace-separated words.
                wc = len(text.split())
                if wc < 3 or wc > 120:
                    continue

                color_rgb = parse_rgb(styles.get("color", ""))
                if not is_pink(color_rgb):
                    continue

                # Background check runs last, after the colour filter has passed.
                bg_rgb = tuple(await el.evaluate(GET_EFFECTIVE_BG_JS))
                if not is_black(bg_rgb):
                    continue

                # Collapse runs of whitespace, then de-duplicate case-insensitively
                # per permalink ("PAGE" stands in when no permalink was found).
                q_norm = re.sub(r"\s+", " ", text).strip()
                key = (q_norm.lower(), permalink or "PAGE")
                if key in seen:
                    continue
                seen.add(key)
                rows.append({"quote": q_norm, "permalink": permalink or TAG_URL})
            except Exception:
                # Best-effort: any error while inspecting one element silently
                # skips that element.
                # NOTE(review): this also hides programming errors in the
                # helpers called above — consider narrowing or logging.
                continue

    return rows

async def main():
    """Scrape the tag page and write the collected quotes to a CSV file.

    Launches headless Chromium, opens ``TAG_URL``, scrolls until the page
    stops growing, extracts the pink-on-black quotes, and saves them (columns
    ``quote`` and ``permalink``, exact duplicates removed) to
    ``microaggressions_race_pink_on_black.csv`` in the working directory.

    Returns:
        None. A one-line summary is printed after the file is written.
    """
    out_path = "microaggressions_race_pink_on_black.csv"

    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(TAG_URL, wait_until="domcontentloaded", timeout=TIMEOUT_MS)

            # #2 auto-scroll to load everything
            await infinite_scroll(page)

            rows = await extract_pink_on_black_quotes(page)
        finally:
            # Close the browser even when navigation or extraction raises.
            await browser.close()

    df = pd.DataFrame(rows, columns=["quote","permalink"]).drop_duplicates(subset=["quote","permalink"])
    df.to_csv(out_path, index=False)
    print(f"Saved {len(df)} quotes to {out_path}")

# In a notebook, run:
# await main()


In [None]:
await main()


Saved 610 quotes to microaggressions_race_pink_on_black.csv


In [None]:
!ls /content


drive  microaggressions_race_pink_on_black.csv	sample_data


In [None]:
df_microagg_quotes = pd.read_csv("microaggressions_race_pink_on_black.csv")
df_microagg_quotes.head()


Unnamed: 0,quote,permalink
0,"“ So every week, do the Guatemalans come and m...",https://www.microaggressions.com/post/62553519...
1,“ Why would you wear your hair like that today...,https://www.microaggressions.com/post/62526339...
2,"“ You’re Asian, so you must be good at math, r...",https://www.microaggressions.com/post/62508220...
3,"“ Oh, I don’t know how to pronounce those name...",https://www.microaggressions.com/post/62490097...
4,“ You’re very exotic-looking. ”,https://www.microaggressions.com/post/62417630...


In [None]:
# Drop the permalink column. errors="ignore" keeps this cell safe to re-run:
# without it a second execution raises KeyError because the column is already gone.
df_microagg_quotes = df_microagg_quotes.drop(columns=["permalink"], errors="ignore")

In [None]:
# Save the quotes in the (speech, label) layout. The rename and label steps are
# applied here at save time so the file's columns do not depend on whether the
# formatting cell further down has already been executed; on a frame that is
# already in that layout both steps leave the data unchanged.
(
    df_microagg_quotes
    .rename(columns={"quote": "speech"})
    .assign(label=1)
    .to_csv("microaggressions_race_pink_on_black_final.csv", index=False)
)

In [None]:
df_microagg_quotes.head()

Unnamed: 0,speech,label
0,"“ So every week, do the Guatemalans come and m...",1
1,“ Why would you wear your hair like that today...,1
2,"“ You’re Asian, so you must be good at math, r...",1
3,"“ Oh, I don’t know how to pronounce those name...",1
4,“ You’re very exotic-looking. ”,1


In [None]:
# Give the datasets the same format: the "quote" column is renamed to "speech"
# and a "label" column filled with 1 is added.
df_microagg_quotes = (
    df_microagg_quotes
    .rename(columns={"quote": "speech"})
    .assign(label=1)
)

# Preview the first rows of the result.
df_microagg_quotes.head()

Unnamed: 0,speech,label
0,"“ So every week, do the Guatemalans come and m...",1
1,“ Why would you wear your hair like that today...,1
2,"“ You’re Asian, so you must be good at math, r...",1
3,"“ Oh, I don’t know how to pronounce those name...",1
4,“ You’re very exotic-looking. ”,1


In [None]:
!ls /content


drive
microaggressions_race_pink_on_black.csv
microaggressions_race_pink_on_black_final.csv
microaggressions_race_pink_on_black_no_permalink.csv
sample_data


In [None]:
from google.colab import files
files.download("microaggressions_race_pink_on_black_final.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
path = '/content/drive/MyDrive/266_project/microaggressions_workplace.csv'

# Decoding this file as cp1252 turns its curly apostrophes into "Õ"
# (e.g. "IÕll", "YouÕve"): byte 0xD5 is "Õ" in cp1252 but "’" in Mac OS Roman,
# so the file is read as mac_roman instead.
df_microagg_workplace = pd.read_csv(path, encoding='mac_roman')

In [None]:
df_microagg_workplace.head()

Unnamed: 0,speech,label
0,You're very articulate for someone like you.,1
1,Where are you really from?,1
2,You're not like other girls.,1
3,You must be good at math since you're Asian.,1
4,You're too pretty to be a software engineer.,1


In [None]:
df_microagg_workplace["label"].unique()


array([1, 0])

In [None]:
#combine datasets

# basic union (rows from both, index reset)
df_microagg_combined = pd.concat([df_microagg_quotes, df_microagg_workplace], ignore_index=True)

In [None]:
df_microagg_combined


Unnamed: 0,speech,label
0,"“ So every week, do the Guatemalans come and m...",1
1,“ Why would you wear your hair like that today...,1
2,"“ You’re Asian, so you must be good at math, r...",1
3,"“ Oh, I don’t know how to pronounce those name...",1
4,“ You’re very exotic-looking. ”,1
...,...,...
776,IÕll check in again by the end of the day.,0
777,Please go ahead and take the lead on this.,0
778,YouÕve handled the updates efficiently.,0
779,LetÕs review the roadmap together tomorrow.,0


In [None]:
# Save the combined dataframe as a CSV file
df_microagg_combined.to_csv("microaggressions_combined.csv", index=False)


In [None]:
from google.colab import files
files.download("microaggressions_combined.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
df_microagg_combined["label"].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,694
0,87
