<a href="https://colab.research.google.com/github/mjoshyam/applications-portfolio/blob/main/emotion_dataset_persona_builder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan  2 21:17:52 2026"""
# Author: Manasa Joshyam

'\nCreated on Fri Jan  2 21:17:52 2026'

# 1. Imports

In [35]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# 2. Load and clean the original dataset

In [36]:
DATA_PATH = "/content/emotion_sentimen_dataset.csv"
emotion_ds = pd.read_csv(DATA_PATH)
# Drop the "Unnamed: 0" column if the CSV has one (errors="ignore" makes this a no-op otherwise).
emotion_ds = emotion_ds.drop(columns=["Unnamed: 0"], errors="ignore")
# Drop rows with a missing text or emotion label, then renumber rows 0..N-1.
# Later cells build row-aligned arrays positionally (cue matrix, similarity matrix)
# and join / look them up against emotion_ds by index label, so the index must be
# gap-free or those rows would silently misalign.
emotion_ds = emotion_ds.dropna(subset=["text", "Emotion"]).reset_index(drop=True)
# Force string dtype and strip leading/trailing whitespace (spaces, newlines, tabs).
emotion_ds["text"] = emotion_ds["text"].astype(str).str.strip()
print(emotion_ds.shape)
print(emotion_ds["Emotion"].value_counts().head(10))

(839555, 2)
Emotion
neutral       674538
love           39553
happiness      27175
sadness        17481
relief         16729
hate           15267
anger          12336
fun            10075
enthusiasm      9304
surprise        6954
Name: count, dtype: int64


# 3. Theory Anchors

Load the CSV containing the digitized flash cards from "Box of Emotions".


In [37]:
CARDS_PATH = "/content/cards_v1_cleaned_full.csv"
# Keep only cards that have both a name and a definition.
cards = (
    pd.read_csv(CARDS_PATH)
    .dropna(subset=["card_name", "definition"])
    .copy()
)
# Force string dtype and strip leading/trailing whitespace on the text columns.
for _col in ("card_name", "system", "definition"):
    cards[_col] = cards[_col].astype(str).str.strip()
print(cards.shape)
print(cards["system"].value_counts())

(68, 18)
system
Heartache      8
Angst          7
Heat           7
Enjoyment      6
Loathing       6
Bitterness     6
Ego            6
Zen            6
Bliss          6
Emptiness      5
Possibility    5
Name: count, dtype: int64


# 4. Vectorize the card definitions into a card embeddings space

In [38]:
# Build a sparse TF-IDF matrix over the card definitions: one row per card,
# one column per unigram/bigram, each entry the product TF x IDF.
#   TF  (term frequency): how often the term occurs in this definition.
#   IDF (inverse document frequency): how rare the term is across the other definitions.
# A high TF x IDF score marks a "signature" term: frequent here, rare elsewhere.
card_vec = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), min_df=1)
card_vec.fit(cards["definition"])
C = card_vec.transform(cards["definition"])


# 5. Psychological/Behavioral Cue Features from Text

In [39]:
# Small hand-built lexicons; extend as needed.

# Token-level word sets.
NEGATIONS = {"not", "no", "never", "none", "nothing", "n't"}
MODALS = {"should", "must", "need", "have to", "can't", "cannot", "could", "might", "may"}

# Whole-word, case-insensitive patterns for temporal orientation.
TIME_PAST = re.compile(r"\b(was|were|had|did|ago|yesterday|before)\b", re.IGNORECASE)
TIME_FUTURE = re.compile(r"\b(will|gonna|going to|tomorrow|next|soon|might)\b", re.IGNORECASE)

# Whole-word, case-insensitive patterns for appraisal themes.
MORAL = re.compile(r"\b(deserve|fault|blame|wrong|should|ought|fair|unfair|shame|guilt)\b", re.IGNORECASE)
THREAT = re.compile(r"\b(threat|danger|scared|fear|panic|terrified|unsafe)\b", re.IGNORECASE)
LOSS = re.compile(r"\b(miss|lost|gone|grief|sad|lonely|heartbroken)\b", re.IGNORECASE)
COMPARE = re.compile(r"\b(better|worse|than|others|they have|why them)\b", re.IGNORECASE)

In [40]:
def cue_features(text: str) -> np.ndarray:
    """Turn one text into a fixed-length vector of 14 hand-crafted cue features.

    Parameters
    ----------
    text : str
        Raw text. Token features use a lower-cased copy; punctuation,
        capitalisation and the regex cues use the text as given.

    Returns
    -------
    np.ndarray
        Float vector of length 14, in this order: token count; share of tokens
        that are first-person, second-person and third-person-plural pronouns;
        share of tokens that are negations; "!" count; "?" count; share of
        alphabetic characters that are upper-case; then 0/1 flags for the
        TIME_PAST, TIME_FUTURE, MORAL, THREAT, LOSS and COMPARE patterns.
    """
    tokens = re.findall(r"[a-z']+", text.lower())
    n = max(len(tokens), 1)  # avoid dividing by zero on texts with no tokens

    # pronoun focus
    i_cnt = sum(tok in ("i", "me", "my", "mine") for tok in tokens)
    you_cnt = sum(tok in ("you", "your", "yours") for tok in tokens)
    they_cnt = sum(tok in ("they", "them", "their", "theirs") for tok in tokens)

    # Negations. The tokenizer above keeps contractions whole ("don't", "can't"),
    # so the bare "n't" entry in NEGATIONS can never equal a token; check the
    # suffix explicitly so contracted negations are counted too.
    neg = sum(tok in NEGATIONS or tok.endswith("n't") for tok in tokens)

    # punctuation and capitalisation, measured on the original (non-lowercased) text
    exclam = text.count("!")
    ques = text.count("?")
    caps = sum(1 for ch in text if ch.isalpha() and ch.isupper())
    alpha = sum(1 for ch in text if ch.isalpha())
    caps_ratio = caps / max(alpha, 1)

    # pattern hits: 1 if the pattern occurs anywhere in the text, else 0
    past = 1 if TIME_PAST.search(text) else 0
    future = 1 if TIME_FUTURE.search(text) else 0
    moral = 1 if MORAL.search(text) else 0
    threat = 1 if THREAT.search(text) else 0
    loss = 1 if LOSS.search(text) else 0
    compare = 1 if COMPARE.search(text) else 0

    return np.array([
        len(tokens),                 # length
        i_cnt / n, you_cnt / n, they_cnt / n,
        neg / n,
        exclam, ques,
        caps_ratio,
        past, future,
        moral, threat, loss, compare
    ], dtype=float)

In [41]:
def build_cue_matrix(texts: pd.Series, batch_size=50000) -> csr_matrix:
    """Compute cue features for every text and stack them into one sparse matrix.

    Parameters
    ----------
    texts : pd.Series
        Texts to featurize; rows are processed in positional order.
    batch_size : int, default 50000
        Number of texts featurized per chunk (one progress-bar step per chunk).

    Returns
    -------
    csr_matrix
        One row per text, one column per value returned by ``cue_features``.
        For empty input the result has zero rows and the same column count.
    """
    # Local import: the notebook's import cell only brings in hstack and csr_matrix.
    from scipy.sparse import vstack as sparse_vstack

    feats = []
    for start in tqdm(range(0, len(texts), batch_size)):
        chunk = texts.iloc[start:start + batch_size]
        arr = np.vstack([cue_features(x) for x in chunk])
        feats.append(csr_matrix(arr))

    if not feats:
        # Stacking an empty list raises; probe cue_features for the column count instead.
        return csr_matrix((0, cue_features("").shape[0]), dtype=float)

    # Stack the sparse chunks directly rather than densifying each one and
    # re-sparsifying the full matrix.
    return sparse_vstack(feats, format="csr")

# Featurize the full dataset. For a quick first run, swap in a sample instead, e.g.:
#sample = emotion_ds.sample(n=min(200000, len(emotion_ds)), random_state=42).reset_index(drop=True)
X_cues = build_cue_matrix(texts=emotion_ds["text"])
print(X_cues.shape)

100%|██████████| 17/17 [01:06<00:00,  3.91s/it]


(839555, 14)


# 6. Linguistic Expression Model
Learning how emotions are expressed

In [43]:
# Hashed unigram + bigram features. The token pattern keeps apostrophes inside
# tokens, and alternate_sign=False keeps all feature values non-negative.
text_vec = HashingVectorizer(
    n_features=2**20,
    alternate_sign=False,
    ngram_range=(1,2),
    token_pattern=r"(?u)\b[\w']+\b"
)
X_text = text_vec.transform(emotion_ds["text"])
y = emotion_ds["Emotion"].values

# Append the hand-crafted cue columns to the hashed text features.
X = hstack([X_text, X_cues]).tocsr()

# Stratified 80/20 split so every emotion keeps its share in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# random_state pins SGD's data shuffling so the fitted model and the score
# below are reproducible across runs (it was previously unseeded).
clf = SGDClassifier(
    loss="log_loss", alpha=1e-6, max_iter=1000, n_jobs=-1, random_state=42
)
clf.fit(X_train, y_train)

print("Baseline emotion model trained.")
# NOTE(review): the label counts printed when the data is loaded are dominated
# by "neutral", so plain accuracy is an optimistic summary - consider per-class metrics.
print("Test accuracy (rough):", clf.score(X_test, y_test))

Baseline emotion model trained.
Test accuracy (rough): 0.9652434920880705


# 7. Use cards as a theory lens to interpret texts and errors

7A) Build a card similarity score for each text

In [44]:
# Card-space vectorizer: fit on the card definitions plus every dataset text so
# the vocabulary covers both. min_df=5 drops terms seen in fewer than 5 documents.
card_space_vec = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=5,
)

combined = pd.concat([cards["definition"], emotion_ds["text"]], ignore_index=True)
card_space_vec.fit(combined)

# Project both corpora into the shared vocabulary of size V.
# NOTE: this rebinds C, replacing the card matrix built in section 4.
C = card_space_vec.transform(cards["definition"])  # [n_cards, V]
T = card_space_vec.transform(emotion_ds["text"])       # [N, V]

# Similarity of every text to every card definition.
S = cosine_similarity(T, C)  # [N, n_cards]

In [45]:
# Shape check on the text-by-card similarity matrix.
S.shape

(839555, 68)

7B) Extract top-k card lenses per text (the “theory projection”)

In [46]:
card_names = cards["card_name"].tolist()

# For each text, take the column indices of the TOPK most similar cards
# (argsort on the negated scores gives descending similarity).
TOPK = 3
topk_idx = np.argsort(-S, axis=1)[:, :TOPK]
topk_scores = np.take_along_axis(S, topk_idx, axis=1)
topk_cards = [[card_names[col] for col in idx_row] for idx_row in topk_idx]

# Attach the theory projection to the dataset: all top cards joined by ";",
# plus the single best card and its similarity score.
emotion_ds["theory_cards_top3"] = list(map(";".join, topk_cards))
emotion_ds["theory_top1"] = [names[0] for names in topk_cards]
emotion_ds["theory_top1_score"] = topk_scores[:, 0]

print(emotion_ds[["Emotion", "theory_top1", "theory_top1_score"]].head(10))



   Emotion theory_top1  theory_top1_score
0     hate      Hatred           0.046123
1  neutral        Fago           0.024254
2  neutral      Regret           0.045607
3    anger     Disgust           0.020991
4  neutral   Self-Pity           0.044900
5     love        Love           0.040288
6  neutral         Zal           0.058141
7    worry     Anxiety           0.106512
8  neutral  Impatience           0.054342
9  neutral        Fago           0.041550


In [47]:
# Shape check on the top-k index array: one row per text, TOPK columns.
topk_idx.shape

(839555, 3)

# 8) Decompose expressions into psychological / behavioral cues

(structured inference)

In [48]:
# Column names for the 14 cue features, in the order cue_features emits them.
cue_cols = [
    "len_tokens","i_ratio","you_ratio","they_ratio","neg_ratio",
    "exclam","ques","caps_ratio","past","future","moral","threat","loss","compare"
]

# X_cues was built row-by-row from emotion_ds["text"], so its rows match
# emotion_ds positionally. Give cue_df the same index labels so the label-aligned
# concat below (and any later cue_df.loc[<emotion_ds index>] lookup) pairs the
# right rows even if emotion_ds's index is not 0..N-1.
cue_df = pd.DataFrame(X_cues.toarray(), columns=cue_cols, index=emotion_ds.index)
tmp = pd.concat([emotion_ds[["Emotion","theory_top1"]], cue_df], axis=1)

# Mean cue profile per (emotion label, best-matching card), strongest moral cue first.
summary = tmp.groupby(["Emotion","theory_top1"])[cue_cols].mean().sort_values("moral", ascending=False)
summary.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,len_tokens,i_ratio,you_ratio,they_ratio,neg_ratio,exclam,ques,caps_ratio,past,future,moral,threat,loss,compare
Emotion,theory_top1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
boredom,Melancholy,38.0,0.131579,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
worry,Guilt,30.076923,0.072215,0.003205,0.003846,0.014251,0.0,0.0,0.0,0.230769,0.153846,0.615385,0.0,0.0,0.0
love,Guilt,26.885135,0.111495,0.006719,0.004642,0.00982,0.0,0.0,0.0,0.189189,0.060811,0.310811,0.006757,0.027027,0.067568
sadness,Guilt,22.397959,0.11208,0.003088,0.003618,0.010459,0.0,0.0,0.0,0.183673,0.061224,0.27551,0.020408,0.530612,0.061224
anger,Guilt,22.467532,0.109618,0.008466,0.00456,0.008041,0.0,0.0,0.0,0.168831,0.038961,0.272727,0.012987,0.025974,0.090909
happiness,Guilt,25.900585,0.118138,0.002961,0.003329,0.010116,0.0,0.0,0.0,0.172515,0.093567,0.269006,0.0,0.040936,0.099415
worry,Shame,28.933333,0.107037,0.007255,0.012605,0.012093,0.0,0.0,0.0,0.066667,0.066667,0.266667,0.133333,0.0,0.066667
surprise,Guilt,27.85,0.110769,0.00359,0.007568,0.009208,0.0,0.0,0.0,0.3,0.05,0.25,0.0,0.05,0.0
hate,Guilt,25.264901,0.11259,0.008743,0.00377,0.009084,0.0,0.0,0.0,0.145695,0.092715,0.238411,0.0,0.0,0.066225
empty,Panic,22.555556,0.097131,0.010582,0.003584,0.0,0.0,0.0,0.0,0.0,0.111111,0.222222,0.0,0.0,0.0


# 9) Build personas from mechanisms, not from emotion labels
9A) Create a “persona feature matrix”

In [49]:
# Compress the per-card similarity columns of S (one column per card) into
# 25 components for more stable clustering.
svd = TruncatedSVD(n_components=25, random_state=42)
S_reduced = svd.fit_transform(S)  # [N, 25]

# Persona features = cue columns followed by the reduced theory-similarity columns.
# NOTE(review): len_tokens is a raw count while most other columns are ratios or
# 0/1 flags, so it may dominate distance-based clustering - consider standardizing.
X_persona = hstack([X_cues, csr_matrix(S_reduced)], format="csr")
print(X_persona.shape)


(839555, 39)


9B) Cluster into personas (MiniBatchKMeans scales well)

In [50]:
# Number of personas: start with 6-10; tune later via stability + interpretability.
k = 8
km = MiniBatchKMeans(
    n_clusters=k,
    batch_size=4096,
    n_init="auto",
    random_state=42,
)
# Fit on the persona feature matrix and store each text's cluster id.
persona_id = km.fit_predict(X_persona)
emotion_ds["persona_id"] = persona_id
emotion_ds["persona_id"].value_counts()

Unnamed: 0_level_0,count
persona_id,Unnamed: 1_level_1
2,170087
4,135059
1,128232
6,123827
5,110562
3,102614
0,47930
7,21244


# 10) Make personas interpretable

Name each persona using:

a) top theory cards

b) cue profile (agency/threat/loss/comparison)

c) representative texts

In [53]:
def top_items(series, n=5):
    """Return the n most frequent values of `series` as a {value: count} dict."""
    counts = series.value_counts()
    return counts.iloc[:n].to_dict()

# One summary record per persona, in ascending persona-id order.
persona_reports = []
for pid in sorted(emotion_ds["persona_id"].unique()):
    in_persona = emotion_ds["persona_id"] == pid
    sub = emotion_ds[in_persona]

    # Most frequent best-matching theory cards in this persona.
    top_theory = top_items(sub["theory_top1"], n=7)

    # Mean of every cue column over this persona's rows (looked up by index label).
    cue_means = cue_df.loc[sub.index].mean().to_dict()

    # Representative texts: the three with the highest top-1 similarity score.
    ex = sub.sort_values("theory_top1_score", ascending=False).head(3)["text"].tolist()

    persona_reports.append(
        dict(
            persona_id=pid,
            size=len(sub),
            top_theory_cards=top_theory,
            cue_means=cue_means,
            examples=ex,
        )
    )

persona_reports[0]["top_theory_cards"], persona_reports[0]["examples"][:1]


({'Feeling like a Fruad': 1894,
  'Hopefulness': 1532,
  'Awumbuk': 1422,
  'Desire': 1378,
  'Confidence': 1324,
  'Comfort': 1248,
  'Fago': 1230},
 ['i may go through different stages of grief it may take a while to really feel a strong sense of healing and i know that there will never be a time when i dont miss this person'])

# 11. Visualization of Personas

In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def persona_card(pid, n_examples=2):
    """Print a plain-text summary card for one persona cluster.

    The card shows the persona's size and share of the dataset, its five most
    common top-1 theory cards, its four highest and four lowest cue z-scores,
    and the highest-scoring example texts (deduplicated by text).

    Parameters
    ----------
    pid :
        Value of ``emotion_ds["persona_id"]`` selecting the persona; it is
        displayed 1-based (``pid + 1``).
    n_examples : int, default 2
        Number of distinct example texts to print.

    Notes
    -----
    Reads the module-level ``emotion_ds`` and ``cue_z``.
    NOTE(review): ``cue_z`` is not defined in any cell above this one; presumably
    a table of cue z-scores indexed by persona id with one column per cue -
    confirm it is created before this cell so Restart & Run All succeeds.
    """
    sub = emotion_ds[emotion_ds["persona_id"] == pid]
    size = len(sub)
    pct = size / len(emotion_ds)

    top_cards = sub["theory_top1"].value_counts().head(5)
    top_cards_str = ", ".join(f"{k} ({v})" for k, v in top_cards.items())

    # Sort this persona's cue z-scores from highest to lowest, then report both ends.
    z = cue_z.loc[pid].sort_values(ascending=False)
    top_pos = ", ".join(f"{k}:{v:.2f}" for k, v in z.head(4).items())
    top_neg = ", ".join(f"{k}:{v:.2f}" for k, v in z.tail(4).items())

    # Best-scoring texts first; drop repeated texts so the examples are distinct.
    ex = (
        sub.sort_values("theory_top1_score", ascending=False)
           .drop_duplicates(subset=["text"])
           .head(n_examples)["text"]
           .tolist()
    )

    print(f"\n=== Persona {pid+1} ===")
    print(f"Size: {size} ({pct:.1%})")
    print(f"Top theory cards: {top_cards_str}")
    print(f"Cue highs (z): {top_pos}")
    print(f"Cue lows  (z): {top_neg}")
    print("Examples:")
    for i, t in enumerate(ex, 1):
        # Truncate long texts to 250 characters for display.
        print(f"  {i}) {t[:250]}{'...' if len(t) > 250 else ''}")

# Print one card per persona, in ascending persona-id order.
for pid in np.sort(emotion_ds["persona_id"].unique()):
    persona_card(pid, n_examples=2)



=== Persona 1 ===
Size: 47930 (5.7%)
Top theory cards: Feeling like a Fruad (1894), Hopefulness (1532), Awumbuk (1422), Desire (1378), Confidence (1324)
Cue highs (z): len_tokens:1.95, past:0.41, future:0.38, compare:0.26
Cue lows  (z): ques:0.00, exclam:0.00, caps_ratio:0.00, i_ratio:-0.48
Examples:
  1) i may go through different stages of grief it may take a while to really feel a strong sense of healing and i know that there will never be a time when i dont miss this person
  2) i said when thanking you last fall i feel a curious combination of pride and humility pride at having started this but humility at the constant reminder that this project is after all staffed by self motivated volunteers

=== Persona 2 ===
Size: 128232 (15.3%)
Top theory cards: Feeling like a Fruad (7210), Comfort (5099), Hopefulness (4075), Awumbuk (3794), Confidence (3655)
Cue highs (z): neg_ratio:0.02, exclam:0.00, caps_ratio:0.00, ques:0.00
Cue lows  (z): compare:-0.04, past:-0.05, future:-0.06, len_to