In [1]:
import pandas as pd
import numpy as np
from gensim.downloader import load as gensim_load
from sklearn.metrics.pairwise import cosine_similarity

# 0. Read the Pieman segment table (path is relative to the working directory).
SEGMENTS_CSV = "pieman_segments.csv"
pieman_segments = pd.read_csv(SEGMENTS_CSV)

# 1. Fetch pretrained word embeddings through the gensim downloader.
#    NOTE: the identifier below names GloVe vectors (300-d), not Word2Vec.
EMBEDDING_NAME = "glove-wiki-gigaword-300"
model = gensim_load(EMBEDDING_NAME)

# 2. Seed vocabulary: six prototype words for each of the six emotion labels.
#    Key order is preserved, so it also fixes the order of emotions downstream.
emotion_words = dict(
    happiness=["happy", "joy", "delighted", "pleased", "content", "excited"],
    sadness=["sad", "unhappy", "miserable", "depressed", "gloomy", "tearful"],
    anger=["angry", "mad", "furious", "irritated", "annoyed", "enraged"],
    fear=["afraid", "scared", "fearful", "terrified", "nervous", "anxious"],
    surprise=["surprised", "shocked", "astonished", "amazed", "startled", "stunned"],
    disgust=["disgusted", "repulsed", "gross", "nauseated", "revolted", "sickened"],
)

# 3. Build one prototype vector per emotion: the element-wise mean of the
#    embeddings of that emotion's prototype words.
emotion_prototypes = {}
for emo, words in emotion_words.items():
    # Prototype words missing from the embedding vocabulary are skipped.
    known_vectors = [model[word] for word in words if word in model]
    # An emotion with no in-vocabulary word cannot be represented: fail loudly.
    if not known_vectors:
        raise ValueError(f"No valid words found in model for emotion: {emo}")
    emotion_prototypes[emo] = np.mean(known_vectors, axis=0)

# 4. Process your TR data
# Drop missing words BEFORE casting to str: astype(str) turns NaN into the
# literal string "nan", which a later notna() check can no longer detect and
# which would then be treated as an ordinary token.
tmp = pieman_segments.dropna(subset=["word_raw"]).copy()
tmp["word_raw"] = tmp["word_raw"].astype(str).str.lower()
# Discard empty strings so they never reach the embedding lookup.
tmp = tmp[tmp["word_raw"].str.len().gt(0)]

# One list of lowercase tokens per TR, ordered by TR.
words_by_tr = tmp.groupby("TR", as_index=True)["word_raw"].apply(list).sort_index()

# 5. Get average vector per TR
def get_avg_vector(tokens, keyed_vectors=None):
    """Return the mean embedding of the in-vocabulary tokens.

    Parameters
    ----------
    tokens : iterable of str
        Words to look up. Words not present in the embedding table are ignored.
    keyed_vectors : optional
        Embedding table supporting ``word in kv``, ``kv[word]`` and
        ``kv.vector_size``. Defaults to the module-level ``model`` so existing
        one-argument calls behave as before.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the found vectors, or an all-zero vector of
        length ``vector_size`` when no token is in the vocabulary.
    """
    # Resolve the global lazily so passing an explicit table never touches it.
    kv = model if keyed_vectors is None else keyed_vectors
    vecs = [kv[w] for w in tokens if w in kv]
    if not vecs:
        # No usable token: fall back to a zero vector of the right length.
        return np.zeros(kv.vector_size)
    return np.mean(vecs, axis=0)

# Average embedding for every TR (a Series of vectors indexed by TR).
tr_embeddings = words_by_tr.apply(get_avg_vector)

# Expand the vectors into a table: one row per TR, one "dim_<i>" column per
# embedding dimension, with the TR index restored as an ordinary column.
tr_embed_df = (
    pd.DataFrame(tr_embeddings.tolist(), index=tr_embeddings.index)
    .rename(columns=lambda col: f"dim_{col}")
    .reset_index()
)

# 6. Compute cosine similarities with each emotion prototype
def cosine_sim(v1, v2):
    """Return the cosine similarity between two vectors as a single value.

    Each input is reshaped to a one-row 2-D array before being handed to
    ``cosine_similarity``; the only entry of the resulting matrix is returned.
    """
    row_a = v1.reshape(1, -1)
    row_b = v2.reshape(1, -1)
    sim_matrix = cosine_similarity(row_a, row_b)
    return sim_matrix[0][0]

# Score every TR against every emotion prototype with a single pairwise call,
# instead of one scikit-learn call per row per emotion.
dim_cols = [f"dim_{i}" for i in range(model.vector_size)]  # loop-invariant, built once
emotion_names = list(emotion_prototypes)
# One prototype per row, in the same order as emotion_names.
proto_matrix = np.vstack([emotion_prototypes[emo] for emo in emotion_names])
# sims[r, c] is the cosine similarity between TR row r and prototype c.
sims = cosine_similarity(tr_embed_df[dim_cols].to_numpy(), proto_matrix)
for col_idx, emo in enumerate(emotion_names):
    tr_embed_df[emo] = sims[:, col_idx]

# 7. Rolling 2-TR trailing mean
emotion_cols = list(emotion_prototypes.keys())

# Dense TR grid from the smallest to the largest TR, so gaps in the TR
# sequence become explicit rows; TRs with no score are filled with 0.0.
all_tr = pd.DataFrame({"TR": np.arange(pieman_segments["TR"].min(), pieman_segments["TR"].max() + 1)})
tr_scores = all_tr.merge(tr_embed_df[["TR"] + emotion_cols], on="TR", how="left").fillna(0.0)

# Trailing mean over the current and the previous TR, stored next to the raw
# score as "<emotion>_w2". min_periods=1 keeps the first TR defined (it is
# averaged over itself only) rather than NaN.
for emo in emotion_cols:
    tr_scores[f"{emo}_w2"] = tr_scores[emo].rolling(window=2, min_periods=1).mean()

NameError: name 'df' is not defined

In [None]:
# Pull the per-TR values of "happiness_w2", "sadness_w2", "anger_w2", "fear_w2", "surprise_w2", and "disgust_w2" from tr_scores.
# Also insert a row for every TR missing from the sequence (e.g. if the TR column jumps from 16 to 18), forward-filling the previous row's scores for the 6 emotion variables.
# Train an HMM for each of 2, 3, 4, 5, 6, 7, 8, 9, and 10 states (using the 6 emotion variables as features) and compute the log-likelihood for each number of states.
# Save a TR-by-state CSV for the best-fitting model (selected by BIC).