In [35]:
import pandas as pd
import numpy as np
from pathlib import Path
import polars as pl

%matplotlib inline

RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)


# Load in Nina's final stimuli lists and the consonants
stimpath = Path("../../word_ngrams/").resolve()
pnp_stim_even = pd.read_csv(stimpath / "final_P_NP_evensubs.csv").drop(columns=["Unnamed: 0"])
pnp_stim_odd = pd.read_csv(stimpath / "final_P_NP_oddsubs.csv").drop(columns=["Unnamed: 0"])
pnp_stim = pd.concat([pnp_stim_even, pnp_stim_odd]).reset_index(drop=True)
cons = pd.read_csv(stimpath / "cons_clust_final_candidates.csv").set_index(["clus1", "clus2"])

# Load in the core list of all words, isolate by nouns
wordcats = pd.read_csv(stimpath / "1grams_english_1b_with_pos.csv").convert_dtypes()

# Isolate the words which are nouns in the top 45k
wordcats["word"] = wordcats["ngram"].str.split("_").str[0]
wordcats = wordcats[~wordcats["word"].str.isupper()].reset_index(drop=True)
wordcats["POS"] = wordcats["ngram"].str.split("_").str[1]
wordcats.drop(columns=["ngram"], inplace=True)
wordcats = wordcats.reindex(columns=["word", "POS", "freq"])
nouns = wordcats[wordcats["POS"] == "NOUN"].reset_index(drop=True)
nouns["wordlen"] = nouns["word"].str.len()
adjs = wordcats[wordcats["POS"] == "ADJ"].reset_index(drop=True)
adjs["wordlen"] = adjs["word"].str.len()

# Isolate the nouns not yet in the stimuli lists
pnp_stim_words = pd.melt(pnp_stim[["w1", "w2"]], ignore_index=False).reset_index(drop=False)
remidx = ~nouns["word"].isin(pnp_stim_words["value"])
adj_remidx = ~adjs["word"].isin(pnp_stim_words["value"])
candidates = nouns[remidx & nouns["wordlen"].isin((3, 4, 5))].reset_index(drop=True).copy()
adj_candidates = adjs[adj_remidx & adjs["wordlen"].isin((3, 4, 5))].reset_index(drop=True).copy()
print(
    f"A total of {len(candidates)} nouns remain to be added to the stimuli lists."
    f" {len(adj_candidates)} adjectives remain to be added to the stimuli lists."
)
print(candidates["word"].isin(pnp_stim_words["value"]).sum())

A total of 7130 nouns remain to be added to the stimuli lists. 1225 adjectives remain to be added to the stimuli lists.
0


In [None]:
from ipywidgets import interact_manual, RadioButtons


# Convert the candidates to a polars DataFrame for faster processing
noun_cand = pl.from_pandas(candidates["word"].str.lower())
adj_cand = pl.from_pandas(adj_candidates["word"].str.lower())
# Create a list of all possible combinations of nouns and adjectives
pairs = noun_cand.to_frame().join(adj_cand.to_frame(), how="cross")
pairs = pairs.rename({"word": "w1", "word_right": "w2"})
display(pairs.shape)


# Load 2grams
ngrams2 = pl.read_csv(stimpath / "2grams_english_1a_no_pos.csv")
ngrams2.with_columns(ngram=ngrams2["ngram"].str.to_lowercase())

q = pairs.lazy().filter(
    pl.concat_str(pl.col("w1"), pl.col("w2"), separator=" ").is_in(ngrams2["ngram"]),
    pl.concat_str(pl.col("w2"), pl.col("w1"), separator=" ").is_in(ngrams2["ngram"]).not_(),
)
goodcand = q.collect().to_pandas()
display(ngrams2)
display(goodcand)

goodcand.to_csv(stimpath / "addtl_stimuli_phrase_candidates.csv", index=False)
print(goodcand["w1"].isin(pnp_stim_words["value"]).sum())
print(goodcand["w2"].isin(pnp_stim_words["value"]).sum())


(8734250, 2)

w1
str
"""way other"""
"""way more"""
"""way first"""
"""way many"""
"""way own"""
…
"""way well"""
"""way such"""
"""way older"""
"""way moral"""


Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  goodcand = q.collect().to_pandas()


ngram,freq
str,i64
"""of the""",1741529442
"""in the""",1043688187
"""to the""",686570630
"""on the""",446536709
"""and the""",446190715
…,…
"""cell adhesion""",187605
"""and defence""",187605
"""the testator""",187603
"""suppressed by""",187590


Unnamed: 0,w1,w2
0,way,more
1,way,past
2,way,more
3,way,back
4,way,round
...,...,...
7429,wrong,for
7430,wrong,way
7431,wrong,side
7432,wrong,with


1166
355
