In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import polars as pl

%matplotlib inline

RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

REMOVE = (
    "was",
    "did",
    "has",
    "had",
    "vary",
    "that",
    "these",
    "those",
    "with",
    "him",
    "her",
    "them",
    "us",
    "me",
    "you",
    "he",
    "she",
    "it",
    "they",
    "we",
    "my",
    "your",
    "his",
    "her",
    "its",
    "our",
    "their",
    "out",
    "for",
    "they",
    "then",
    "yells",
)


# Load in Nina's final stimuli lists and the consonants
stimpath = Path("../../word_ngrams/").resolve()
pnp_stim_even = pd.read_csv(stimpath / "final_P_NP_evensubs.csv").drop(columns=["Unnamed: 0"])
pnp_stim_odd = pd.read_csv(stimpath / "final_P_NP_oddsubs.csv").drop(columns=["Unnamed: 0"])
pnp_stim = pd.concat([pnp_stim_even, pnp_stim_odd]).reset_index(drop=True)
cons = pd.read_csv(stimpath / "cons_clust_final_candidates.csv").set_index(["clus1", "clus2"])

# Load in the core list of all words, isolate by nouns
wordcats = pd.read_csv(stimpath / "1grams_english_1b_with_pos.csv").convert_dtypes()

# Isolate the words which are nouns in the top 45k
wordcats["word"] = wordcats["ngram"].str.split("_").str[0]
wordcats = wordcats[~wordcats["word"].str.isupper()].reset_index(drop=True)
wordcats["POS"] = wordcats["ngram"].str.split("_").str[1]
wordcats.drop(columns=["ngram"], inplace=True)
wordcats = wordcats.reindex(columns=["word", "POS", "freq"])
verbs = wordcats[wordcats["POS"] == "VERB"].reset_index(drop=True)
verbs["word"] = verbs["word"].str.encode("ascii", errors="ignore").str.decode("ascii")
verbs["wordlen"] = verbs["word"].str.len()

adjs = wordcats[wordcats["POS"] == "ADJ"].reset_index(drop=True)
adjs["word"] = adjs["word"].str.encode("ascii", errors="ignore").str.decode("ascii")
adjs["wordlen"] = adjs["word"].str.len()

wccopy = wordcats.copy()
wccopy["word"] = wccopy["word"].str.lower()
wccopy["wordlen"] = wccopy["word"].str.len()
wccopy = wccopy.loc[wccopy["wordlen"].isin((3, 4, 5))]
allverbs = wccopy.query("POS == 'VERB'")["word"].unique()
alladjs = wccopy.query("POS == 'ADJ'")["word"].unique()
ncats = wccopy.set_index("word").sort_index().groupby("word").nunique("POS")
goodverbs = ncats.loc[allverbs].query("POS == 1").reset_index(drop=False)["word"].unique()
goodadjs = ncats.loc[alladjs].query("POS == 1").reset_index(drop=False)["word"].unique()
verbs = verbs.loc[verbs["word"].isin(goodverbs)]
adjs = adjs.loc[adjs["word"].isin(goodadjs)]

# Isolate the nouns not yet in the stimuli lists
pnp_stim_words = pd.melt(pnp_stim[["w1", "w2"]], ignore_index=False).reset_index(drop=False)
remidx = ~verbs["word"].isin(pnp_stim_words["value"])
adj_remidx = ~adjs["word"].isin(pnp_stim_words["value"])
candidates = verbs[remidx & verbs["wordlen"].isin((3, 4, 5))].reset_index(drop=True).copy()
adj_candidates = adjs[adj_remidx & adjs["wordlen"].isin((3, 4, 5))].reset_index(drop=True).copy()

adj_candidates = adjs.reset_index(drop=True)["word"].sort_values().str.lower().copy()
adj_candidates = adj_candidates[
    ~adj_candidates.isin(REMOVE)
    & ~adj_candidates.isin(candidates)
    & adj_candidates.str.isalpha()
    & ~adj_candidates.isin(pnp_stim_words["value"])
    & adj_candidates.str.len().isin((3, 4, 5))
]

candidates = candidates.reset_index(drop=True)["word"].sort_values().str.lower().copy()
candidates = candidates[
    ~candidates.isin(REMOVE)
    & ~candidates.isin(adj_candidates)
    & candidates.str.isalpha()
    & ~candidates.isin(pnp_stim_words["value"])
    & candidates.str.len().isin((3, 4, 5))
]
print(
    f"A total of {len(candidates)} verbs remain to be added to the stimuli lists."
    f" {len(adj_candidates)} adjectives remain to be added to the stimuli lists."
)
print(candidates.isin(pnp_stim_words["value"]).sum())
display(adj_candidates)
display(candidates)
print(candidates.isin(REMOVE).sum())

A total of 457 verbs remain to be added to the stimuli lists. 250 adjectives remain to be added to the stimuli lists.
0


2       able
298    acrid
22     acute
159    adept
300     agro
       ...  
231    windy
229     wiry
158    wiser
182    witty
21     worst
Name: word, Length: 250, dtype: object

271    abide
218    ached
440    aches
113    acted
154    adapt
       ...  
87      wore
387     wove
220    woven
41     wrote
384    wrung
Name: word, Length: 457, dtype: object

0


In [29]:
# Convert the candidates to a polars DataFrame for faster processing

verb_cand = pl.from_pandas(candidates.str.lower().copy())
adj_cand = pl.from_pandas(adj_candidates.str.lower().copy())
# Create a list of all possible combinations of nouns and adjectives
pairs = adj_cand.to_frame().join(verb_cand.to_frame(), how="cross")
pairs = pairs.rename({"word": "w1", "word_right": "w2"})
display(pairs.shape)


# Load 2grams
ngrams2 = pl.read_csv(stimpath / "2grams_english_1a_no_pos.csv")
ngrams2.with_columns(ngram=ngrams2["ngram"].str.to_lowercase())

q = pairs.lazy().filter(
    pl.concat_str(pl.col("w1"), pl.col("w2"), separator=" ").is_in(ngrams2["ngram"]).not_(),
    pl.concat_str(pl.col("w2"), pl.col("w1"), separator=" ").is_in(ngrams2["ngram"]).not_(),
)
goodcand = q.collect().to_pandas()
display(goodcand.sort_values("w2"))

goodcand.to_csv(stimpath / "addtl_stimuli_phrase_candidates.csv", index=False)
print(goodcand["w1"].isin(pnp_stim_words["value"]).sum())
print(goodcand["w2"].isin(pnp_stim_words["value"]).sum())


(114250, 2)

Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  goodcand = q.collect().to_pandas()


Unnamed: 0,w1,w2
0,able,abide
58944,lunar,abide
59401,lurid,abide
59858,manic,abide
60315,manly,abide
...,...,...
63056,muddy,wrung
26042,eerie,wrung
12333,brisk,wrung
62599,molar,wrung


0
0


In [41]:
# Take the hand-chosen final phrase candidates
finalcand = pd.read_csv(stimpath / "handpicked_phrase_new.csv")
npfinal = pd.read_csv(stimpath / "handpicked_adjverb_phrase_new.csv")
wordcounts = finalcand.melt()["value"].value_counts()
freq_df = finalcand.replace(wordcounts)
# display(finalcand)
# display(finalcand.set_index("w2").sort_index())

# Randomly assign, except where the final word is duplicated: then split on the first word
rng = np.random.default_rng(42)
words_even = []
words_odd = []
pairs = finalcand.set_index("w2", append=True).sort_index()
ukcounts = pairs.index.get_level_values("w2").value_counts().sort_values(ascending=False)
ukeys = ukcounts.index.tolist()

for ukey in ukeys:
    keypairs = pairs.xs(ukey, level="w2")
    match (len(keypairs), len(words_even), len(words_odd)):
        case 1, 60, _:
            words_odd.append((keypairs.iloc[0]["w1"], ukey))
        case 1, _, 60:
            words_even.append((keypairs.iloc[0]["w1"], ukey))
        case 1, _, _:
            if rng.random() < 0.5:
                words_odd.append((keypairs.iloc[0]["w1"], ukey))
            else:
                words_even.append((keypairs.iloc[0]["w1"], ukey))
        case 2, _, _:
            idx1 = 1 if rng.random() < 0.5 else 0
            idx2 = 0 if idx1 == 1 else 1
            words_even.append((keypairs.iloc[idx1]["w1"], ukey))
            words_odd.append((keypairs.iloc[idx2]["w1"], ukey))
        case _, _, _:
            raise ValueError(f"Unexpected number of words: {len(keypairs)} for {ukey}")

evenstim = pd.DataFrame(np.array(words_even), columns=["w1", "w2"])
evenstim["condition"] = "phrase"
evenstim["subtype"] = "even"
evenstim["w1_type"] = "word"
evenstim["w2_type"] = "word"
oddstim = pd.DataFrame(np.array(words_odd), columns=["w1", "w2"])
oddstim["condition"] = "phrase"
oddstim["subtype"] = "odd"
oddstim["w1_type"] = "word"
oddstim["w2_type"] = "word"
# display(evenstim)
# display(oddstim)
# display(evenstim.melt()["value"].value_counts().head(4))
# display(oddstim.melt()["value"].value_counts().head(4))

npfinal["condition"] = "non-phrase"
npfinal["subtype"] = "even"
npfinal["w1_type"] = "word"
npfinal["w2_type"] = "word"

npfinalsamp = npfinal.sample(120).reset_index(drop=True)
npfinalsamp.loc[60:, "subtype"] = "odd"

evenstim = pd.concat([
    evenstim,
    npfinalsamp.iloc[:60],
]).reset_index(drop=True)
oddstim = pd.concat([
    oddstim,
    npfinalsamp.iloc[60:],
]).reset_index(drop=True)
display(oddstim)

allstim = pd.concat([
    evenstim.melt(id_vars=[], value_vars=["w1", "w2"]),
    oddstim.melt(id_vars=[], value_vars=["w1", "w2"]),
    pnp_stim_words,
])["value"]
nw_cand = verbs[~verbs["word"].str.lower().isin(allstim) & verbs["wordlen"].isin((3, 4, 5))]
nw_cand["word"] = nw_cand["word"].str.lower()

nwstim = pd.read_csv(stimpath / "final_nw_stimuli.csv", index_col=0).rename({
    "w1": "w2",
    "w2": "w1",
    "w1_len": "w2_len",
    "w2_len": "w1_len",
    "w1_type": "w2_type",
    "w2_type": "w1_type",
})
display(nw_cand)


  freq_df = finalcand.replace(wordcounts)


Unnamed: 0,w1,w2,condition,subtype,w1_type,w2_type
0,deep,blue,phrase,odd,word,word
1,iced,tea,phrase,odd,word,word
2,just,ahead,phrase,odd,word,word
3,eight,weeks,phrase,odd,word,word
4,worst,thing,phrase,odd,word,word
...,...,...,...,...,...,...
115,newer,cater,non-phrase,odd,word,word
116,gated,infer,non-phrase,odd,word,word
117,limp,fared,non-phrase,odd,word,word
118,inert,shuts,non-phrase,odd,word,word


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nw_cand["word"] = nw_cand["word"].str.lower()


Unnamed: 0,word,POS,freq,wordlen
1,was,VERB,1751648374,3
4,had,VERB,851806969,3
6,were,VERB,593516792,4
10,been,VERB,419558529,4
12,has,VERB,394883282,3
...,...,...,...,...
8112,boxed,VERB,214275,5
8197,savor,VERB,210186,5
8201,clap,VERB,210116,4
8204,glean,VERB,209811,5


In [43]:
even_df = pd.read_csv(stimpath.parent / "stimuli" / "new_even_two_word_stimuli.csv")
odd_df = pd.read_csv(stimpath.parent / "stimuli" / "new_odd_two_word_stimuli.csv")
even_df.loc[60:119] = evenstim.loc[60:119]
odd_df.loc[60:119] = oddstim.loc[60:119]

even_df.to_csv(stimpath.parent / "stimuli" / "new_even_two_word_stimuli.csv", index=False)
odd_df.to_csv(stimpath.parent / "stimuli" / "new_odd_two_word_stimuli.csv", index=False)
