<a href="https://colab.research.google.com/github/mjoshyam/applications-portfolio/blob/main/emotion_dataset_persona_builder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan  2 21:17:52 2026

In [None]:
@author: Daku
"""
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
from scipy.sparse import hstack, csr_matrix

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

--------------------------Loading and cleaning the original Kaggle dataset----------------------#

In [None]:
# Machine-specific absolute path to the emotion dataset CSV.
DATA_PATH = "/Users/Daku/Desktop/OpenAI_Residency/Datasets/emotion_sentimen_dataset.csv"
# Load the whole CSV into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [None]:
# Drop the exported index column if the CSV carries one.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
# Remove rows without text or label, then renumber rows 0..N-1.
# Later steps build row-aligned arrays/frames (cue matrix, similarity matrix)
# that are positional, so df must not keep index gaps left by dropna.
df = df.dropna(subset=["text", "Emotion"]).reset_index(drop=True)
# Normalize the text column to stripped strings.
df["text"] = df["text"].astype(str).str.strip()

In [None]:
# Sanity check: frame dimensions and the ten most frequent emotion labels.
print(df.shape)
print(df["Emotion"].value_counts().head(10))

---------------Loading the csv containing digitized flash cards from "Box of Emotions"-----------# <br>
------------------------The cards will be treated as theory anchors------------------------------#

In [None]:
# Machine-specific absolute path to the digitized flash-card CSV.
CARDS_PATH = "/Users/Daku/Desktop/OpenAI_Residency/Datasets/cards.csv"

# Keep only cards that have both a name and a definition.
cards = pd.read_csv(CARDS_PATH)
cards = cards.dropna(subset=["card_name", "definition"]).copy()

# Coerce the three text columns to stripped strings.
for _col in ("card_name", "system", "definition"):
    cards[_col] = cards[_col].astype(str).str.strip()

# Sanity check: frame dimensions and number of cards per system.
print(cards.shape)
print(cards["system"].value_counts())

------------------Vectorize the card definitions into a card embedding space--------------------#

In [None]:
# TF-IDF over the card definitions only: unigrams + bigrams, English stop
# words removed, min_df=1 so every remaining term is kept.
# NOTE: `C` is rebound further down using `card_space_vec`, so the matrix
# built here is not the one the similarity step uses.
card_vec = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1,2),
    min_df=1
)
C = card_vec.fit_transform(cards["definition"])

------------Create psychological and behavioral cue features from text-----------------------# <br>
Simple lexicons (TODO: expand this)

In [None]:
# Token-level lexicons, used for set-membership tests.
NEGATIONS = {"not", "no", "never", "none", "nothing", "n't"}
MODALS = {
    "should", "must", "need", "have to", "can't",
    "cannot", "could", "might", "may",
}

# Tense markers, matched case-insensitively on word boundaries.
TIME_PAST = re.compile(
    r"\b(was|were|had|did|ago|yesterday|before)\b", re.IGNORECASE
)
TIME_FUTURE = re.compile(
    r"\b(will|gonna|going to|tomorrow|next|soon|might)\b", re.IGNORECASE
)

In [None]:
# Appraisal-theme patterns; each is a case-insensitive whole-word search.
# Moral judgement / responsibility vocabulary.
MORAL = re.compile(
    r"\b(deserve|fault|blame|wrong|should|ought|fair|unfair|shame|guilt)\b",
    re.IGNORECASE,
)
# Threat / fear vocabulary.
THREAT = re.compile(
    r"\b(threat|danger|scared|fear|panic|terrified|unsafe)\b",
    re.IGNORECASE,
)
# Loss / sadness vocabulary.
LOSS = re.compile(
    r"\b(miss|lost|gone|grief|sad|lonely|heartbroken)\b",
    re.IGNORECASE,
)
# Social-comparison vocabulary.
COMPARE = re.compile(
    r"\b(better|worse|than|others|they have|why them)\b",
    re.IGNORECASE,
)

In [None]:
def cue_features(text: str) -> np.ndarray:
    """Compute a fixed-length vector of surface cues for one text.

    The text is lower-cased and split into tokens made of ASCII letters and
    apostrophes. The returned float array has 14 entries, in this order:

        0  token count
        1  first-person pronoun ratio  (i, me, my, mine)
        2  second-person pronoun ratio (you, your, yours)
        3  third-person pronoun ratio  (they, them, their, theirs)
        4  negation ratio
        5  number of "!" characters
        6  number of "?" characters
        7  share of alphabetic characters that are upper case
        8  1 if TIME_PAST matches, else 0
        9  1 if TIME_FUTURE matches, else 0
        10 1 if MORAL matches, else 0
        11 1 if THREAT matches, else 0
        12 1 if LOSS matches, else 0
        13 1 if COMPARE matches, else 0

    Ratios are relative to the token count (treated as 1 for empty text).
    """
    lowered = text.lower()
    tokens = re.findall(r"[a-z']+", lowered)
    n = max(len(tokens), 1)  # guards the ratios against division by zero

    # pronoun focus
    i_cnt = sum(tok in ("i", "me", "my", "mine") for tok in tokens)
    you_cnt = sum(tok in ("you", "your", "yours") for tok in tokens)
    they_cnt = sum(tok in ("they", "them", "their", "theirs") for tok in tokens)

    # Negations. The tokenizer keeps contractions such as "don't" as a single
    # token, so the bare "n't" lexicon entry alone would never match them;
    # the suffix check counts those contractions as negations too.
    neg = sum(tok in NEGATIONS or tok.endswith("n't") for tok in tokens)

    # punctuation and capitalization are measured on the original text
    exclam = text.count("!")
    ques = text.count("?")
    caps = sum(1 for ch in text if ch.isalpha() and ch.isupper())
    alpha = sum(1 for ch in text if ch.isalpha())
    caps_ratio = caps / max(alpha, 1)

    # pattern hits (the patterns are compiled case-insensitively)
    past = 1 if TIME_PAST.search(text) else 0
    future = 1 if TIME_FUTURE.search(text) else 0
    moral = 1 if MORAL.search(text) else 0
    threat = 1 if THREAT.search(text) else 0
    loss = 1 if LOSS.search(text) else 0
    compare = 1 if COMPARE.search(text) else 0

    return np.array([
        len(tokens),                 # length
        i_cnt / n, you_cnt / n, they_cnt / n,
        neg / n,
        exclam, ques,
        caps_ratio,
        past, future,
        moral, threat, loss, compare,
    ], dtype=float)

In [None]:
def build_cue_matrix(texts: pd.Series, batch_size=50000) -> csr_matrix:
    """Stack `cue_features` of every text into one sparse matrix.

    Args:
        texts: Series of strings; rows are processed in their positional order.
        batch_size: Number of texts handled per progress-bar step.

    Returns:
        A CSR matrix with one row per text and one column per cue feature.
        An empty `texts` yields a matrix with zero rows and the usual number
        of columns instead of raising.
    """
    if len(texts) == 0:
        # np.vstack cannot stack an empty list; derive the column count from
        # the feature function itself so the two never drift apart.
        return csr_matrix((0, cue_features("").shape[0]), dtype=float)

    blocks = []
    for start in tqdm(range(0, len(texts), batch_size)):
        chunk = texts.iloc[start:start + batch_size]
        blocks.append(np.vstack([cue_features(x) for x in chunk]))
    # Convert to sparse once at the end rather than per batch and back.
    return csr_matrix(np.vstack(blocks))

For a first run, work on a sample (scale up after debugging):<br>
`sample = df.sample(n=min(200000, len(df)), random_state=42).reset_index(drop=True)`

In [None]:
# Cue features for every row of df, one matrix row per text.
X_cues = build_cue_matrix(df["text"])
print(X_cues.shape)

-----Learning expression of emotions (Linguistic Expression Model)------------#

In [None]:
# Hashed bag of unigrams + bigrams into 2**20 columns. The vectorizer is
# used without a fit step; transform() is called directly.
# The token pattern keeps apostrophes inside tokens.
text_vec = HashingVectorizer(
    n_features=2**20,
    alternate_sign=False,
    ngram_range=(1,2),
    token_pattern=r"(?u)\b[\w']+\b"
)
X_text = text_vec.transform(df["text"])
# Target labels for the classifier.
y = df["Emotion"].values

In [None]:
# Column-wise concatenation of hashed text features and cue features, as CSR.
X = hstack([X_text, X_cues]).tocsr()

In [None]:
# 80/20 train/test split, stratified on the emotion label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Linear classifier trained with SGD on logistic loss.
# random_state is fixed so that training is reproducible, consistent with the
# seeded train/test split, SVD and k-means used elsewhere in this file.
clf = SGDClassifier(
    loss="log_loss",
    alpha=1e-6,
    max_iter=200,
    n_jobs=-1,
    random_state=42,
)
clf.fit(X_train, y_train)

In [None]:
# Report mean accuracy of the baseline on the held-out split.
print("Baseline emotion model trained.")
print("Test accuracy (rough):", clf.score(X_test, y_test))

----------------Interrogate dataset against card theory---------------------#<br>
a) Build card similarity score on each text <br>
Card-space vectorizer: fit on (cards + data sample) so vocabulary covers both

In [None]:
# TF-IDF vectorizer for the shared card/text space (unigrams + bigrams,
# English stop words removed); min_df=5 drops terms seen in fewer than
# five documents.
card_space_vec = TfidfVectorizer(stop_words="english", ngram_range=(1,2), min_df=5)

In [None]:
# Fit on card definitions and dataset texts together so that one vocabulary
# covers both.
combined = pd.concat([cards["definition"], df["text"]], ignore_index=True)
card_space_vec.fit(combined)

In [None]:
# Project cards and texts into the shared TF-IDF space.
# This rebinds `C`, replacing the card-only matrix computed earlier.
C = card_space_vec.transform(cards["definition"])
T = card_space_vec.transform(df["text"])

Similarity: each text gets similarity to each card definition

In [None]:
# Cosine similarity of every text (rows of T) against every card (rows of C).
# NOTE: the result is a dense texts-by-cards array. On the full dataset that
# is ~63 million floats, which breaks; it needs to be computed in row chunks.
S = cosine_similarity(T, C)
print("Cosine similarity completed!")
# --------Extract top-k card lenses per text (the theory projection)-----#

In [None]:
# Card names in the row order of `cards`; used to label card indices.
card_names = cards["card_name"].tolist()

In [None]:
# Number of best-matching cards kept per text.
# NOTE: the df column "theory_cards_top3" written later hard-codes this value
# in its name.
TOPK = 3
# Sort each row by descending similarity and keep the first TOPK card indices.
topk_idx = np.argsort(-S, axis=1)[:, :TOPK]
# Translate card indices into card names, one list per text.
topk_cards = [[card_names[j] for j in row] for row in topk_idx]
# Similarity scores at the selected indices (same shape as topk_idx).
topk_scores = np.take_along_axis(S, topk_idx, axis=1)

In [None]:
# Store the theory projection on df: all selected cards joined by ";",
# plus the best-matching card and its similarity score.
df["theory_cards_top3"] = list(map(";".join, topk_cards))
df["theory_top1"] = [names[0] for names in topk_cards]
df["theory_top1_score"] = topk_scores[:, 0]

In [None]:
# Preview: dataset label next to the best-matching card and its score.
df[["Emotion", "theory_top1", "theory_top1_score"]].head(10)

------------Decompose expressions into psychological and behavioral cues (structured inference)------------#

In [None]:
# Column names for the cue matrix. The order follows the array returned by
# cue_features and must be kept in sync with it.
cue_cols = [
    "len_tokens","i_ratio","you_ratio","they_ratio","neg_ratio",
    "exclam","ques","caps_ratio","past","future","moral","threat","loss","compare"
]

In [None]:
# Dense, named view of the cue matrix. Rows of X_cues follow the row order of
# df, so the frame is given df's index: both the axis=1 concat below and any
# later label-based lookup via df's index then stay row-aligned even when
# df's index is not 0..N-1.
cue_df = pd.DataFrame(X_cues.toarray(), columns=cue_cols, index=df.index)
# Labels, best-matching card and cue features side by side.
tmp = pd.concat([df[["Emotion","theory_top1"]], cue_df], axis=1)

In [None]:
# Mean cue profile per (emotion label, best-matching card) pair, ordered by
# the mean of the "threat" flag, highest first.
summary = tmp.groupby(["Emotion","theory_top1"])[cue_cols].mean().sort_values("threat", ascending=False)
summary.head(20)

--------Build personas from mechanisms-------------------#

---- a) Create a “persona feature matrix”-----------# <br>
Reduce the 79-d theory similarity into smaller dimensions for clustering stability

In [None]:
# Compress the text-by-card similarity matrix to 25 components (seeded).
svd = TruncatedSVD(n_components=25, random_state=42)
S_reduced = svd.fit_transform(S)  # [N, 25]

In [None]:
# Persona feature matrix: cue features followed by the reduced similarities.
# NOTE(review): the first cue column is a raw token count while the others
# are ratios, counts or 0/1 flags; nothing here rescales them before
# clustering. Confirm this is intended.
X_persona = hstack([X_cues, csr_matrix(S_reduced)]).tocsr()
print(X_persona.shape)

---- b) Cluster into Personas -------------------#

In [None]:
# Number of personas (clusters).
k = 8  # start with 6–10; tune later via stability + interpretability
# Seeded mini-batch k-means over the persona feature matrix.
km = MiniBatchKMeans(n_clusters=k, random_state=42, batch_size=4096, n_init="auto")
# Cluster label for each row of X_persona.
persona_id = km.fit_predict(X_persona)

In [None]:
# Attach the cluster label to each text and show the persona sizes.
df["persona_id"] = persona_id
df["persona_id"].value_counts()

-------Make Personas Interpretable---------------#

-------naming personas using top theory cards, cue profiles and texts----#

In [None]:
def top_items(series, n=5):
    """Return the `n` most frequent values of `series` mapped to their counts."""
    counts = series.value_counts()
    most_common = counts.iloc[:n]
    return most_common.to_dict()

In [None]:
# One summary dict per persona: size, dominant theory cards, mean cue
# profile, and a few representative texts.
persona_reports = []
for pid in sorted(df["persona_id"].unique()):
    # Boolean mask as a NumPy array so that rows are selected by position.
    # cue_df is row-aligned with df but need not share df's index labels,
    # so a label lookup with df's index could fail or pick the wrong rows.
    in_persona = (df["persona_id"] == pid).to_numpy()
    sub = df[in_persona]

    # Top theory lenses
    top_theory = top_items(sub["theory_top1"], n=7)

    # Cue means
    cue_means = cue_df[in_persona].mean().to_dict()

    # Representative examples (highest top1 score)
    ex = sub.sort_values("theory_top1_score", ascending=False).head(3)["text"].tolist()

    persona_reports.append({
        "persona_id": pid,
        "size": len(sub),
        "top_theory_cards": top_theory,
        "cue_means": cue_means,
        "examples": ex
    })

In [None]:
# Preview the first persona: its dominant cards and one example text.
persona_reports[0]["top_theory_cards"], persona_reports[0]["examples"][:1]