In [9]:
# pip install nltk pandas
import pandas as pd
import random
import nltk
from nltk.corpus import stopwords

# NLTK data packages this notebook relies on (tokenizer, POS tagger, stop words).
# Downloads are requested in this order every time the cell runs.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
):
    nltk.download(resource)

# English stop words held in a set so membership checks are cheap.
stop_words = set(stopwords.words('english'))



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mberronesreyes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mberronesreyes/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mberronesreyes/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/mberronesreyes/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mberronesreyes/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:

# Prompt table shared by the helpers below, which read its
# short_prompt / mid_prompt / long_prompt columns.
df = pd.read_csv(filepath_or_buffer="dreamlike_prompts.csv")


def get_random_prompt(length):
    """Return one randomly sampled, non-null prompt of the requested length.

    Args:
        length: One of "short", "mid" or "long"; selects which column of the
            module-level ``df`` is sampled.

    Returns:
        A single value drawn from the matching column after dropping nulls.

    Raises:
        ValueError: If ``length`` is not one of the three accepted labels.
    """
    column_for_length = (
        ("short", "short_prompt"),
        ("mid", "mid_prompt"),
        ("long", "long_prompt"),
    )

    # Walk the table until the label matches; the else-branch only runs
    # when no label matched at all.
    for label, col in column_for_length:
        if length == label:
            break
    else:
        raise ValueError("Length must be 'short', 'mid', or 'long'.")

    available = df[col].dropna()
    picked = available.sample(1)
    return picked.values[0]

def extract_keywords(prompt, max_words=5, min_words=3):
    """Pick a few distinct content words from ``prompt``.

    The prompt is tokenized and POS-tagged with NLTK. Tokens that are purely
    alphabetic and not in ``stop_words`` are lower-cased and grouped as nouns,
    verbs, adjectives or adverbs. One random word is taken per group, in that
    priority order. If fewer than ``min_words`` were picked, the result is
    topped up with random words drawn from all groups.

    Args:
        prompt: Text to extract keywords from.
        max_words: Hard upper bound on the number of keywords returned.
        min_words: Number of keywords to aim for when padding (capped by
            ``max_words``). Fewer are returned when the prompt does not
            contain that many distinct candidate words.

    Returns:
        A list of distinct lower-case keywords; empty when the prompt has no
        candidate words.
    """
    words = nltk.word_tokenize(prompt)
    tagged = nltk.pos_tag(words)

    # POS categories
    # https://www.learntek.org/blog/categorizing-pos-tagging-nltk-python/
    categories = {
        "NOUN": ["NN", "NNS", "NNP", "NNPS"],
        "VERB": ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"],
        "ADJ": ["JJ", "JJR", "JJS"],
        "ADV": ["RB", "RBR", "RBS"]
    }
    # Reverse lookup so each token needs a single dict access.
    category_of_tag = {tag: cat for cat, tags in categories.items() for tag in tags}

    # Store candidates per category
    candidates = {cat: [] for cat in categories}
    for w, pos in tagged:
        w_clean = w.lower()
        cat = category_of_tag.get(pos)
        # only keep words, no punctuation or fillers
        if cat is not None and w_clean.isalpha() and w_clean not in stop_words:
            candidates[cat].append(w_clean)

    # Prioritize: NOUNS first, then VERBS, then ADJ/ADV
    chosen = []
    for cat in ["NOUN", "VERB", "ADJ", "ADV"]:
        if len(chosen) >= max_words:
            break
        # A word can be tagged under several categories; never return it twice.
        pool = [w for w in candidates[cat] if w not in chosen]
        if pool:
            chosen.append(random.choice(pool))

    # Pad with additional randoms from any category. Drawing only from words
    # not yet chosen guarantees termination: the previous retry-until-new loop
    # spun forever once every candidate had already been picked.
    all_candidates = sum(candidates.values(), [])
    target = min(min_words, max_words)
    while len(chosen) < target:
        pool = [w for w in all_candidates if w not in chosen]
        if not pool:
            break
        chosen.append(random.choice(pool))

    return chosen

def find_prompt_with_words(words, df, min_matches=2):
    """Return the first long prompt that mentions enough of ``words``.

    Matching is case-insensitive and substring-based, so a keyword also
    matches inside a longer word. Each distinct keyword counts at most once:
    repeated or differently-cased copies of the same keyword cannot satisfy
    ``min_matches`` on their own. Empty keywords are ignored because an
    empty string is a substring of every prompt.

    Args:
        words: Iterable of keyword strings.
        df: DataFrame with a ``long_prompt`` column; null entries are skipped.
        min_matches: Minimum number of distinct keywords a prompt must contain.

    Returns:
        The first qualifying prompt in column order, or ``None`` if there is
        none.
    """
    # Normalise once up front instead of once per prompt.
    wanted = {w.lower() for w in words if w}

    for prompt in df["long_prompt"].dropna():
        haystack = prompt.lower()
        count = sum(1 for w in wanted if w in haystack)
        if count >= min_matches:
            return prompt
    return None



In [12]:
# Choose which prompt length to sample: "short", "mid", or "long".
prompt = get_random_prompt("mid")
print("Selected prompt:", prompt)

# Reduce the sampled prompt to a handful of keywords.
keywords = extract_keywords(prompt)
print("Extracted keywords:", keywords)

# Look for a long prompt that mentions enough of those keywords.
match = find_prompt_with_words(keywords, df)
print("Matching prompt:", match or "No match found")

Selected prompt: The path was covered in mirrors, and each reflection was of a different version of me.
Extracted keywords: ['path', 'covered', 'different']
Matching prompt: The ground beneath me was covered in sand, but the grains sparkled like tiny mirrors, each one reflecting a different version of myself.
