In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import polars as pl

%matplotlib inline

RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Load in Nina's final stimuli lists and the consonants
stimpath = Path("../../word_ngrams/").resolve()
pnp_stim_even = pd.read_csv(stimpath / "final_P_NP_evensubs.csv").drop(columns=["Unnamed: 0"])
pnp_stim_odd = pd.read_csv(stimpath / "final_P_NP_oddsubs.csv").drop(columns=["Unnamed: 0"])
pnp_stim = pd.concat([pnp_stim_even, pnp_stim_odd]).reset_index(drop=True)
cons = pd.read_csv(stimpath / "cons_clust_final_candidates.csv").set_index(["clus1", "clus2"])

# Load in the core list of all words
wordcats = pd.read_csv(stimpath / "1grams_english_1b_with_pos.csv").convert_dtypes()
# Remove words which can be multiple different parts of sentence. Since POS markers are separated
# by _ characters, simply counting those is enough to find which fit the bill
wordcats["ngram"] = wordcats["ngram"].str.encode("ascii", errors="replace").str.decode("ascii")
wordcats = wordcats[
    (wordcats["ngram"].str.count("_") < 2) & (wordcats["ngram"].str.count("\?") == 0)
]

# Isolate the words which are nouns in the top 45k
wordcats["word"] = wordcats["ngram"].str.split("_").str[0]
wordcats = wordcats[~wordcats["word"].str.isupper()].reset_index(drop=True)
wordcats["POS"] = wordcats["ngram"].str.split("_").str[1]
wordcats.drop(columns=["ngram"], inplace=True)
nouns = wordcats[wordcats["POS"] == "NOUN"].reset_index(drop=True)
nouns["wordlen"] = nouns["word"].str.len()
nouns["word"] = nouns["word"].str.encode("ascii", errors="ignore").str.decode("ascii")

adjs = wordcats[wordcats["POS"] == "ADJ"].reset_index(drop=True)
adjs["wordlen"] = adjs["word"].str.len()
adjs["word"] = adjs["word"].str.encode("ascii", errors="ignore").str.decode("ascii")

# Isolate the nouns not yet in the stimuli lists
pnp_stim_words = pd.melt(pnp_stim[["w1", "w2"]], ignore_index=False).reset_index(drop=False)
candidates = nouns[nouns["wordlen"].isin((3, 4, 5))].reset_index(drop=True).copy()
adj_candidates = adjs[adjs["wordlen"].isin((3, 4, 5))].reset_index(drop=True).copy()

adj_candidates = adjs.reset_index(drop=True)["word"].sort_values().str.lower().copy()
adj_candidates = adj_candidates[
    ~adj_candidates.isin(candidates)
    & ~adj_candidates.isin(pnp_stim_words["value"])
    & adj_candidates.str.len().isin((3, 4, 5))
]

candidates = candidates.reset_index(drop=True)["word"].sort_values().str.lower().copy()
candidates = candidates[
    ~candidates.isin(adj_candidates)
    & ~candidates.isin(pnp_stim_words["value"])
    & candidates.str.len().isin((3, 4, 5))
]
print(
    f"A total of {len(candidates)} nouns remain to be added to the stimuli lists."
    f" {len(adj_candidates)} adjectives remain to be added to the stimuli lists."
)
print(candidates.isin(pnp_stim_words["value"]).sum())

A total of 6029 nouns remain to be added to the stimuli lists. 1160 adjectives remain to be added to the stimuli lists.
0


In [60]:
import ipywidgets as widg
from functools import partial

randidx = rng.permutation(np.arange(len(candidates)))
idx = 0
nouncand = candidates.copy()
goodnouns = []
textbox = widg.Text(value="You shouldnt see this", description="Noun to save", disabled=False)
textbox.value = nouncand.iloc[randidx[idx]]


def goodopt(_, goodwords: list, cands: pd.DataFrame):
    global idx
    goodwords.append(textbox.value)
    idx += 1
    textbox.value = cands.iloc[randidx[idx]]
    return


def badopt(_, cands: pd.DataFrame):
    global idx
    idx += 1
    textbox.value = cands.iloc[randidx[idx]]
    return


def savegood(_, goodwords, pos_name):
    pd.Series(goodwords).str.strip().drop_duplicates().to_csv(
        stimpath / f"good_{pos_name}.csv", index=False
    )
    return


def loadgood(_, goodwords, pos_name):
    goodser = pd.read_csv(stimpath / f"good_{pos_name}.csv")
    keepitems = ~goodser.isin(goodwords)
    goodwords.extend(goodser[keepitems]["0"])
    return


goodbutton = widg.Button(description="Good")
goodbutton.on_click(partial(goodopt, goodwords=goodnouns, cands=nouncand))
badbutton = widg.Button(description="Bad")
badbutton.on_click(partial(badopt, cands=nouncand))
savebutton = widg.Button(description="Save")
savebutton.on_click(partial(savegood, goodwords=goodnouns, pos_name="nouns"))
loadbutton = widg.Button(description="Load")
loadbutton.on_click(partial(loadgood, goodwords=goodnouns, pos_name="nouns"))
display(textbox, goodbutton, badbutton, savebutton, loadbutton)

Text(value='mhz', description='Noun to save')

Button(description='Good', style=ButtonStyle())

Button(description='Bad', style=ButtonStyle())

Button(description='Save', style=ButtonStyle())

Button(description='Load', style=ButtonStyle())

In [61]:
idx = 0
goodadj = []
adjcand = adj_candidates.copy()
randidx = rng.permutation(np.arange(len(adjcand)))
textbox = widg.Text(value="You shouldnt see this", description="Adj to save", disabled=False)
textbox.value = adjcand.iloc[randidx[idx]]

goodbutton = widg.Button(description="Good")
goodbutton.on_click(partial(goodopt, goodwords=goodadj, cands=adjcand))
badbutton = widg.Button(description="Bad")
badbutton.on_click(partial(badopt, cands=adjcand))
savebutton = widg.Button(description="Save")
savebutton.on_click(partial(savegood, goodwords=goodadj, pos_name="adjs"))
loadbutton = widg.Button(description="Load")
loadbutton.on_click(partial(loadgood, goodwords=goodadj, pos_name="adjs"))
display(textbox, goodbutton, badbutton, savebutton, loadbutton)

Text(value='state', description='Adj to save')

Button(description='Good', style=ButtonStyle())

Button(description='Bad', style=ButtonStyle())

Button(description='Save', style=ButtonStyle())

Button(description='Load', style=ButtonStyle())

In [63]:
goodadj

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'weird',
 'swiss',
 'toned',
 'iraqi',
