We're going to merge the new stimuli (generated by 'additional_stimuli.ipynb') and the existing stimuli, as well as an additional set of invertible phrase/non-phrase stimuli in 'stimuli/handpicked_phrase_new.csv`, to generate a final common set of stimuli for all subjects plus additional even/odd sub specific stimuli. The goal of the stimuli set is as follows:

- 180 common **non-word pair** stimuli for even/odd
- 90 common **phrase pair** stimuli for even/odd
- 90 common **non-invertible non-phrase pair** stimuli for even/odd
- 90 **phrase pair** stimuli for **even subjects**, which will also be **non-phrase pair** stimuli for the **odd subjects**
- 90 **non-phrase pair** stimuli for **even subjects**, which will also be **phrase pair** stimuli for the **odd subjects**

The 360 common stimuli will always be shown to all subjects, but the remaining 180 pairs will be invertible phrases that fill the role of non-phrase for half of subjects and phrase for the other half.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display

# Set seeds for reproducibility
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Load in the old and new stimuli. NB: The new stimuli are generated by 'additional_stimuli.ipynb'
datapath = Path("../../") / "stimuli"
newodd = pd.read_csv(datapath / "new_odd_two_word_stimuli.csv").set_index([
    "condition",
    "w1",
    "w2",
])
oldodd = (
    pd.read_csv(datapath / "odd_two_word_stimuli.csv", index_col=0)
    .set_index([
        "condition",
        "w1",
        "w2",
    ])
    .loc[["phrase", "non-word"], :, :]
)
neweven = pd.read_csv(datapath / "new_even_two_word_stimuli.csv").set_index([
    "condition",
    "w1",
    "w2",
])
oldeven = (
    pd.read_csv(datapath / "even_two_word_stimuli.csv", index_col=0)
    .set_index([
        "condition",
        "w1",
        "w2",
    ])
    .loc[["phrase", "non-word"], :, :]
)

# The new handpicked phrase stimuli
handpicked = pd.read_csv(datapath.parent / "word_ngrams/handpicked_phrase_new.csv")
handpicked["condition"] = "phrase"
handpicked["subtype"] = np.nan
handpicked["w1_type"] = "word"
handpicked["w2_type"] = "word"
handpicked["invertible"] = True
handpicked = handpicked.set_index(["condition", "w1", "w2"])
allidx = (
    oldeven.index.append(oldodd.index).append(neweven.index).append(newodd.index).drop_duplicates()
)
# display(handpicked.index.intersection(allidx))

# Mark which stimuli contain adjective-noun pairs which are "invertible", i.e. can be made into a
# phrase/non-phrase by swapping word order
newodd["invertible"] = False
neweven["invertible"] = False
oldodd["invertible"] = False
oldeven["invertible"] = False
neweven.loc["phrase", ["invertible"]] = True
newodd.loc["phrase", ["invertible"]] = True
oldodd.loc["phrase", ["invertible"]] = True
oldeven.loc["phrase", ["invertible"]] = True


allstim = (
    pd.concat([handpicked, newodd, neweven, oldodd, oldeven])
    .sort_index(level=["w1", "w2"])
    .sort_index(level="condition", ascending=False)
)
allstim.replace({"cons": "non-word", "noun": "word"}, inplace=True)
allstim = allstim.iloc[~allstim.index.duplicated()]

display(allstim.reset_index().value_counts("condition"))


# For the non-word stimuli, get the non-word part of the pair for balancing stimuli
def get_nw(row):
    match (row["condition"], row["w1_type"], row["w2_type"]):
        case ("non-word", "non-word", "word"):
            return row["w1"]
        case ("non-word", "word", "non-word"):
            return row["w2"]
        case ("phrase" | "non-phrase", _, _):
            return ""
        case (_, _, _):
            raise ValueError(f"Invalid stimulus: {row}")


nw_vals = allstim.reset_index().apply(get_nw, axis=1)
nw_vals.index = allstim.index
allstim["nw_ident"] = nw_vals

# Generate a list of all used words for generating an additional ~30 pairs of phrase stimuli. Also
# append a new column indicating how many times w1/w2 appears in the stimulus set.
meltdf = allstim.reset_index().melt(
    id_vars=["condition", "invertible", "w1_type", "w2_type"], value_vars=["w1", "w2"]
)
wordcounts = (
    meltdf.groupby("condition")
    .agg({"value": "value_counts"})
    .rename(columns={"value": "occurrences"})
)
nonword_mask = ~(
    ((meltdf["w1_type"] == "non-word") & (meltdf["variable"] == "w1"))
    | ((meltdf["w2_type"] == "non-word") & (meltdf["variable"] == "w2"))
)
# pnp_mask = meltdf["condition"].isin(["phrase", "non-phrase"])
# gooditems = meltdf.loc[nonword_mask & pnp_mask, "value"].unique()
gooditems = meltdf.loc[nonword_mask, "value"].unique()
baditems = wordcounts.index.get_level_values(1).difference(gooditems)
wordcounts.loc[(slice(None), baditems), "occurrences"] = 1


# For the non-word stimuli, get the non-word part of the pair for balancing stimuli
def get_count(row, word):
    match (row["condition"], word):
        case ("non-word", _):
            return 1
        case (_, "w1"):
            return wordcounts.at[(row["condition"], row["w1"]), "occurrences"]
        case (_, "w2"):
            return wordcounts.at[(row["condition"], row["w2"]), "occurrences"]


display(wordcounts.query("occurrences > 1"))
allstim["w1_count"] = allstim.reset_index().apply(lambda row: get_count(row, "w1"), axis=1).values
allstim["w2_count"] = allstim.reset_index().apply(lambda row: get_count(row, "w2"), axis=1).values

display(allstim.query("w1 == 'blue' or w2 == 'blue'"))
print(len(allstim) - len(allstim.reset_index().drop_duplicates()), "duplicates")
allstim.to_csv(datapath / "unified_stimulus_list.csv")


condition
phrase        266
non-word      207
non-phrase    197
Name: count, dtype: int64

Unnamed: 0_level_0,Unnamed: 1_level_0,occurrences
condition,value,Unnamed: 2_level_1
non-phrase,girl,3
non-phrase,sky,3
non-phrase,ahead,2
non-phrase,areas,2
non-phrase,best,2
...,...,...
phrase,walk,2
phrase,waves,2
phrase,weeks,2
phrase,wise,2


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,subtype,w1_type,w2_type,invertible,Unnamed: 0.1,index,Unnamed: 0,nw_ident,w1_count,w2_count
condition,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
phrase,deep,blue,,word,word,True,,,,,2,3
phrase,dark,blue,,word,word,True,,,,,2,3
phrase,blue,role,even,word,word,True,,,,,3,1
non-phrase,blue,deep,odd,word,word,False,495.0,45.0,,,2,2
non-phrase,blue,dark,even,word,word,False,495.0,45.0,,,2,2


0 duplicates


## Common stimulus selection 

#### Phrase (90 elements, odd and even subs, necessarily invertible)

We want a set of 90 phrase stimuli which are never inverted and always shown as syntactically valid phrases to all subjects. These will be sampled without replacement from the stimulus set.

In [140]:
candidates = allstim.copy()
nodup_cand = candidates.query("w1_count < 2 and w2_count < 2").copy()
dup_pairs = [
    candidates.reset_index()
    .query("condition == 'phrase'")
    .query("w1 == @dupword or w2 == @dupword")
    .set_index(["condition", "w1", "w2"])
    .copy()
    for cond, dupword in wordcounts[wordcounts["occurrences"] > 1].index
]
display(nodup_cand.groupby("condition").size())
display(len(dup_pairs))

common_phrase = nodup_cand.xs("phrase", level="condition", drop_level=False).sample(
    90, random_state=rng
)
display(common_phrase)
nodup_cand.drop(common_phrase.index, inplace=True)
display(common_phrase.reset_index().value_counts(["w1", "w2"]))

condition
non-phrase    149
non-word      216
phrase        162
dtype: int64

144

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,subtype,w1_type,w2_type,invertible,Unnamed: 0,nw_ident,w1_count,w2_count
condition,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
phrase,light,bulb,,word,word,True,,,1,1
phrase,wild,doll,odd,word,word,True,1.0,,1,1
phrase,ill,drum,odd,word,word,True,2.0,,1,1
phrase,bad,moon,odd,word,word,True,3.0,,1,1
phrase,get,help,,word,word,True,,,1,1
phrase,...,...,...,...,...,...,...,...,...,...
phrase,brain,stem,,word,word,True,,,1,1
phrase,red,eyes,,word,word,True,,,1,1
phrase,calm,omen,odd,word,word,True,87.0,,1,1
phrase,thin,word,odd,word,word,True,88.0,,1,1


w1     w2   
after  lunch    1
sick   bat      1
same   chin     1
sad    clue     1
rosy   sofa     1
               ..
full   egg      1
fresh  fruit    1
firm   mug      1
fine   arts     1
work   ethic    1
Name: count, Length: 90, dtype: int64

#### Non-invertible non-phrase stimuli (90 elements, even and odd subs)

Same logic as above.

In [141]:
common_ni_np = (
    nodup_cand.xs("non-phrase", level="condition", drop_level=False)
    .query("invertible == False")
    .sample(90, random_state=rng)
)
display(common_ni_np)
nodup_cand.drop(common_ni_np.index, inplace=True)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,subtype,w1_type,w2_type,invertible,Unnamed: 0,nw_ident,w1_count,w2_count
condition,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
non-phrase,fewer,poked,odd,word,word,False,102.0,,1,1
non-phrase,stony,wrote,odd,word,word,False,170.0,,1,1
non-phrase,boy,small,even,word,word,False,465.0,,1,1
non-phrase,shiny,prays,odd,word,word,False,176.0,,1,1
non-phrase,cow,hard,odd,word,word,False,483.0,,1,1
non-phrase,...,...,...,...,...,...,...,...,...,...
non-phrase,bowl,lame,odd,word,word,False,464.0,,1,1
non-phrase,lowly,waged,odd,word,word,False,172.0,,1,1
non-phrase,toad,apt,odd,word,word,False,460.0,,1,1
non-phrase,smile,faint,even,word,word,False,471.0,,1,1


#### Non-word stimuli (180 elements, odd and even subs)

Now we will select a set of 180 word/non-word pairs for the non-word condition. These will necessarily be non-invertible.

Note that we simply don't have enough non-word consonant clusters to use 180 unique non-words, so we will sample them evenly per consonant cluster (up to the limit possible), then backfill with randomly chosen remaining pairs.

In [142]:
common_nw = (
    nodup_cand.xs("non-word", level="condition", drop_level=False)
    .groupby("nw_ident")
    .sample(2, random_state=rng)
)
nodup_cand.drop(common_nw.index, inplace=True)
additional_nw = nodup_cand.xs("non-word", level="condition", drop_level=False).sample(
    180 - len(common_nw), random_state=rng
)
nodup_cand.drop(additional_nw.index, inplace=True)
common_nw = pd.concat([common_nw, additional_nw])
display(common_nw)
display(common_nw["nw_ident"].value_counts())
print(len(nodup_cand.xs("phrase", level="condition", drop_level=False)), "candidates left")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,subtype,w1_type,w2_type,invertible,Unnamed: 0,nw_ident,w1_count,w2_count
condition,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
non-word,sofa,bksj,odd,word,non-word,False,180.0,bksj,1,1
non-word,bksj,drum,even,non-word,word,False,,bksj,1,1
non-word,bkvj,fox,odd,non-word,word,False,182.0,bkvj,1,1
non-word,bkvj,fun,odd,non-word,word,False,183.0,bkvj,1,1
non-word,fkmv,mold,odd,non-word,word,False,184.0,fkmv,1,1
non-word,...,...,...,...,...,...,...,...,...,...
non-word,wjgk,oak,odd,non-word,word,False,,wjgk,1,1
non-word,sick,qvzk,odd,word,non-word,False,359.0,qvzk,1,1
non-word,fin,qkzm,odd,word,non-word,False,,qkzm,1,1
non-word,cell,sjkv,odd,word,non-word,False,,sjkv,1,1


nw_ident
wjgk    3
mkqv    3
qvtj    3
qvzk    3
sjkv    3
       ..
kvtj    2
kvjk    2
kkzj    2
kkwj    2
zmtk    2
Name: count, Length: 87, dtype: int64

72 candidates left


#### Phrase/Non-phrase swap pairs for even/odd subjects

Here we will select 90 **invertible phrase pairs** to be used as phrase for even subs, and non-phrase for odd subs, and an additional 90 pairs which will be used **in the opposite way**, i.e. phrase for odd subs and non-phrase for even subs.

In [143]:
print(nodup_cand.groupby("condition").size())
print(len(dup_pairs))

condition
non-phrase    59
non-word      36
phrase        72
dtype: int64
144


In [None]:
def sample_dups(i, j):
    try:
        return dup_pairs[i].reset_index().iloc[j]
    except IndexError:
        return dup_pairs[i].reset_index().iloc[0]


even_phrase = (
    nodup_cand.xs("phrase", level="condition", drop_level=False)
    .query("invertible == True")
    .sample(35, random_state=rng)
)
nodup_cand.drop(even_phrase.index, inplace=True)
dup_even_phrase = pd.DataFrame.from_records([sample_dups(i, 0) for i in range(55)])
dup_even_phrase.set_index(["condition", "w1", "w2"], inplace=True)
display(dup_even_phrase.size)
even_phrase = pd.concat((even_phrase, dup_even_phrase))

odd_phrase = (
    nodup_cand.xs("phrase", level="condition", drop_level=False)
    .query("invertible == True")
    .sample(35, random_state=rng)
)
candidates.drop(odd_phrase.index, inplace=True)
dup_odd_phrase = pd.DataFrame.from_records([sample_dups(i, 1) for i in range(55)])
dup_odd_phrase.set_index(["condition", "w1", "w2"], inplace=True)
display(dup_odd_phrase.size)
odd_phrase = pd.concat((odd_phrase, dup_odd_phrase))

display(even_phrase)
display(len(even_phrase))
display(odd_phrase)
display(len(odd_phrase))

440

440

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,subtype,w1_type,w2_type,invertible,Unnamed: 0,nw_ident,w1_count,w2_count
condition,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
phrase,have,heard,,word,word,True,,,1,1
phrase,dual,hall,odd,word,word,True,389.0,,1,1
phrase,avid,fig,even,word,word,True,361.0,,1,1
phrase,nice,lake,odd,word,word,True,386.0,,1,1
phrase,such,owl,even,word,word,True,389.0,,1,1
phrase,...,...,...,...,...,...,...,...,...,...
phrase,optic,nerve,,word,word,True,,,1,2
phrase,rare,cases,,word,word,True,,,2,2
phrase,low,risk,,word,word,True,,,2,2
phrase,shy,crab,odd,word,word,True,385.0,,1,1


90

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,subtype,w1_type,w2_type,invertible,Unnamed: 0,nw_ident,w1_count,w2_count
condition,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
phrase,any,lamp,odd,word,word,True,364.0,,1,1
phrase,this,lord,even,word,word,True,388.0,,1,1
phrase,shy,crab,odd,word,word,True,385.0,,1,1
phrase,data,sets,,word,word,True,,,1,1
phrase,set,sail,,word,word,True,,,1,1
phrase,...,...,...,...,...,...,...,...,...,...
phrase,optic,nerve,,word,word,True,,,1,2
phrase,rare,bell,odd,word,word,True,405.0,,2,1
phrase,high,risk,,word,word,True,,,2,2
phrase,shy,crab,odd,word,word,True,385.0,,1,1


90

## Stimulus list generation

Now we will generate the final stimulus lists for each set of subjects. Both should be $M_{\text{blocks}} \times N_{\text{conds}} \times K_{\text{cond stimuli}}$ long, i.e. $3 \times 3 \times 60 = 540$ elements.

Notably 90 phrase + 90 non-phrase stimuli will be generated using the even/odd specific pairs sampled above.

In [145]:
basecommon = pd.concat([common_phrase, common_ni_np, common_nw])
basecommon["subtype"] = np.nan
display(basecommon.reset_index().value_counts(["w1", "w2"]))

even_nonphrase = odd_phrase.copy().reset_index().rename(columns={"w1": "w2", "w2": "w1"})
even_nonphrase["condition"] = "non-phrase"
# even_nonphrase.set_index(["condition", "w1", "w2"], inplace=True)
odd_nonphrase = even_phrase.copy().reset_index().rename(columns={"w1": "w2", "w2": "w1"})
odd_nonphrase["condition"] = "non-phrase"
# odd_nonphrase.set_index(["condition", "w1", "w2"], inplace=True)

even_pnp = pd.concat([even_phrase.reset_index(), even_nonphrase])
even_pnp["subtype"] = "even"
print("even pnp:", len(even_pnp))

odd_pnp = pd.concat([odd_phrase.reset_index(), odd_nonphrase])
odd_pnp["subtype"] = "odd"
print("odd pnp:", len(odd_pnp))

odd_stimuli = pd.concat([basecommon.reset_index(), odd_pnp])
print("odd stimuli:", len(odd_stimuli))
display(len(odd_stimuli))
display(odd_stimuli.groupby("condition").size())

even_stimuli = pd.concat([basecommon.reset_index(), even_pnp])
print("even stimuli:", len(even_stimuli))
display(len(even_stimuli))
display(even_stimuli.groupby("condition").size())

even_stimuli["subtype"] = "even"
even_stimuli.reset_index().drop(columns=["nw_ident", "w1_count", "w2_count"]).to_csv(
    datapath / "new_even_two_word_stimuli.csv"
)

odd_stimuli["subtype"] = "odd"
odd_stimuli.reset_index().drop(columns=["nw_ident", "w1_count", "w2_count"]).to_csv(
    datapath / "new_odd_two_word_stimuli.csv"
)

w1     w2   
acute  beg      1
raw    zkqv     1
rabid  fills    1
qvzk   bud      1
qvxj   such     1
               ..
hours  five     1
hot    road     1
horn   mvzm     1
holy   rice     1
zmtk   real     1
Name: count, Length: 360, dtype: int64

even pnp: 180
odd pnp: 180
odd stimuli: 540


540

condition
non-phrase    180
non-word      180
phrase        180
dtype: int64

even stimuli: 540


540

condition
non-phrase    180
non-word      180
phrase        180
dtype: int64