# Personas

1. Family Traveler – Kid-friendly attractions, safety-focused.
1. Backpacker – Budget-focused, immersive experiences.
1. Culture Explorer – Heritage sites, museums, local history.
1. Shopper – Malls, street markets, retail focus.
1. Foodie – Culinary experiences, hawker centers, local dishes.
1. Solo Traveler – Independent, flexible, self-guided.
1. Adventure Seeker – Thrill-based activities, outdoor sports.
1. Relaxation Seeker – Beaches, spas, slow-paced itinerary.
1. Luxury Seeker – High-end stays, exclusive experiences.
1. Digital Nomad – Remote work setup, co-working, long stays.


# Datasets

## Load and Normalize Seed Data

In [52]:
import pandas as pd

df_seed = pd.read_csv("./persona_data/persona_seeds.csv")
# duplicate_queries_boolean = df.duplicated(subset=['user_query'])
# # print(duplicate_queries_boolean.head())
# num_duplicate_queries = df.duplicated(subset=['user_query']).sum()
# print(f"\nNumber of duplicate user_queries: {num_duplicate_queries}")
# print(f"\ndf shape: {df.shape}")

# Ensure persona is always treated as a list
df_seed['persona'] = df_seed['persona'].apply(lambda x: x.split(';') if isinstance(x, str) else [])
print(df_seed.head())
print(df_seed.tail())

        persona                                         user_query
0  [Backpacker]  I'm a student backpacker on a very tight budge...
1  [Backpacker]  As a budget traveler, what are the best free w...
2  [Backpacker]  My priority is to save money on food. Can you ...
3  [Backpacker]  I travel light with just a backpack. Where can...
4  [Backpacker]  I'm looking for tips on hitchhiking or afforda...
                                               persona  \
183         [Luxury Seeker, Backpacker, Digital Nomad]   
184  [Luxury Seeker, Relaxation Seeker, Family Trav...   
185                 [Solo Traveler, Relaxation Seeker]   
186                   [Family Traveler, Digital Nomad]   
187                         [Adventure Seeker, Foodie]   

                                            user_query  
183  I'm looking for upscale experiences or five-st...  
184  I'm looking for upscale experiences or five-st...  
185  I'm hoping to unwind in peaceful, quiet places...  
186  I'm in need of q

## Generate Seed Embeddings

In [54]:
from sentence_transformers import SentenceTransformer

# model = SentenceTransformer("all-MiniLM-L6-v2")
# df_seed['embedding'] = df_seed['user_query'].apply(lambda x: model.encode(x, convert_to_numpy=True))

model = SentenceTransformer("all-mpnet-base-v2")
df_seed['embedding'] = df_seed['user_query'].apply(lambda x: model.encode(x, normalize_embeddings=True, convert_to_numpy=True))

print(df_seed['embedding'])


0      [0.0016342936, -0.052199643, -0.012730876, 0.0...
1      [-0.082870774, 0.049224664, 0.01095861, 0.0258...
2      [-0.02640347, 0.010736059, 0.016417222, -0.018...
3      [-0.024907874, 0.0013529228, -0.014126667, -0....
4      [0.023160018, -0.026743403, 0.039732616, -0.05...
                             ...                        
183    [-0.017193792, 0.020671131, -0.012779137, 0.01...
184    [-0.03490231, 0.0182993, -0.02636526, 0.060265...
185    [-0.05026815, 0.036296103, -0.020297699, -0.01...
186    [-0.060648777, -0.019467078, -0.006435667, -0....
187    [0.01579509, 0.04695996, -0.033329744, -0.0242...
Name: embedding, Length: 188, dtype: object


## Get all persona labels

In [55]:
df_persona_flatten = df_seed.explode('persona').dropna(subset=['persona'])
persona_labels = sorted(df_persona_flatten['persona'].unique())
print(persona_labels)

['Adventure Seeker', 'Backpacker', 'Culture Explorer', 'Digital Nomad', 'Family Traveler', 'Foodie', 'Luxury Seeker', 'Relaxation Seeker', 'Shopper', 'Solo Traveler']


## Compute Centroids Per Persona

In [56]:
import numpy as np

centroids = []
for label in persona_labels:
    embeddings = df_persona_flatten[df_persona_flatten['persona'] == label]['embedding'].tolist()
    centroid = np.mean(embeddings, axis=0)
    centroids.append(centroid)

centroids_array = np.array(centroids)
print(centroids_array)

[[-0.02430179  0.0005639  -0.01379317 ...  0.00805121  0.03548119
  -0.00894321]
 [-0.02147701 -0.00371314  0.00234409 ...  0.0004143   0.03398336
  -0.01226141]
 [-0.02807685  0.01211919 -0.01640119 ...  0.02340208  0.0293539
  -0.02225329]
 ...
 [-0.03362343 -0.00185131 -0.00870396 ... -0.00705461  0.03299256
  -0.00495257]
 [-0.01561555 -0.00794585 -0.02174257 ... -0.00251458  0.01844303
  -0.01669079]
 [-0.02447946  0.00351482 -0.00845103 ... -0.01150169  0.04608477
  -0.01147416]]


## Embed Unlabeled Queries

In [59]:
# df_unlabeled1 = pd.read_csv("./persona_data/unlabeled_travel_persona_1000_synthetic.csv")
# df_unlabeled1 = df_unlabeled1.rename(columns={"query": "user_query"})
# df_unlabeled2 = pd.read_csv("./persona_data/unlabeled_travel_persona_queries_20k.csv")
# df_unlabeled3 = pd.read_csv("./persona_data/unlabeled_queries_singapore_tourist.csv")
# df_unlabeled4 = pd.read_csv("./persona_data/unlabeled_travel_persona_dataset_full.csv")
# df_unlabeled4 = df_unlabeled4.rename(columns={"query": "user_query"})
# # Add blank persona column
# for df in [df_unlabeled1, df_unlabeled2, df_unlabeled3, df_unlabeled4]:
#     df["persona"] = ""
# # Merge all data
# df_unlabeled_merged = pd.concat([df_unlabeled1, df_unlabeled2, df_unlabeled3, df_unlabeled4], ignore_index=True)
# # Ensure column order
# df_unlabeled_merged = df_unlabeled_merged[["user_query", "persona"]]

# # # Save to CSV
# df_unlabeled_merged.to_csv("./persona_data/unlabeled_merged.csv", index=False)
df_unlabeled_merged = pd.read_csv("./persona_data/unlabeled_merged.csv")

def persona_label_duplicates(df):
    before_drop = len(df)
    print(f"\nRows before drop_duplicates: {before_drop}")
    # Drop duplicates
    df.drop_duplicates(subset=["user_query", "persona"], inplace=True)
    # Count after drop
    after_drop = len(df)
    print(f"Duplicates removed: {before_drop - after_drop}")
    print(f"Rows after drop_duplicates: {after_drop}")

persona_label_duplicates(df_unlabeled_merged)
unlabeled_queries = df_unlabeled_merged['user_query'].to_list()
print(f"Number of unlabeled queries: {len(unlabeled_queries)}")

# unlabeled_embeddings = model.encode(unlabeled_queries, convert_to_numpy=True)
unlabeled_embeddings = model.encode(unlabeled_queries, normalize_embeddings=True, convert_to_numpy=True)
unlabeled_embeddings = np.array(unlabeled_embeddings, dtype=np.float32)
print("Embeddings:", unlabeled_embeddings.shape)  # (3, 384)



Rows before drop_duplicates: 31240
Duplicates removed: 24668
Rows after drop_duplicates: 6572
Number of unlabeled queries: 6572
Embeddings: (6572, 768)


## Cosine Similarity based multi-labeling

In [61]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(unlabeled_embeddings, centroids_array)

threshold = 0.6  # Tune this
multi_labels = []
for row in similarity:
    indices = [i for i, score in enumerate(row) if score >= threshold]
    labels = [persona_labels[i] for i in indices]
    multi_labels.append(labels)

print(multi_labels)

df_labeled = pd.DataFrame({
    'user_query': unlabeled_queries,
    'persona': [';'.join(labels) for labels in multi_labels]
})

def persona_count(x):
    return len([p.strip() for p in x.split(';') if p.strip()])

def person_distribution(df_labeled):
    # Count label distribution
    label_distribution = df_labeled['label_count'].value_counts().sort_index()
    for count, num_rows in label_distribution.items():
        print(f"{count} labels: {num_rows} rows")

print("\nPersona labels distribution in dataset:", df_labeled.shape)
df_labeled['label_count'] = df_labeled['persona'].apply(persona_count)
person_distribution(df_labeled)

# Step 4: Filter into two groups
min_label, max_label = 1, 4
df_labeled_cosine_sim = df_labeled[(df_labeled['label_count'] >= min_label) & (df_labeled['label_count'] <= max_label)].copy()
df_labeled_rule_based = df_labeled[(df_labeled['label_count'] == 0) | (df_labeled['label_count'] > max_label)].copy()

print(f"Cosine-based data: {len(df_labeled_cosine_sim)} rows")
print(f"Rule-based fallback: {len(df_labeled_rule_based)} rows")

print("\nCosine-based persona labels distribution:", df_labeled_cosine_sim.shape)
df_labeled_cosine_sim['label_count'] = df_labeled_cosine_sim['persona'].apply(persona_count)
person_distribution(df_labeled_cosine_sim)

df_labeled_cosine_sim = df_labeled_cosine_sim.drop(columns=["label_count"])
df_labeled_cosine_sim.to_csv("./persona_data/labeled_persona_cosine_sim.csv", index=False)


[['Culture Explorer', 'Shopper'], ['Adventure Seeker', 'Backpacker', 'Culture Explorer', 'Family Traveler', 'Foodie', 'Shopper'], ['Adventure Seeker', 'Backpacker', 'Culture Explorer', 'Family Traveler', 'Foodie', 'Shopper'], ['Adventure Seeker', 'Backpacker', 'Culture Explorer', 'Family Traveler', 'Foodie', 'Luxury Seeker', 'Relaxation Seeker', 'Shopper', 'Solo Traveler'], ['Adventure Seeker', 'Backpacker', 'Culture Explorer', 'Family Traveler', 'Foodie', 'Luxury Seeker', 'Relaxation Seeker', 'Shopper', 'Solo Traveler'], ['Culture Explorer', 'Shopper'], ['Adventure Seeker', 'Backpacker', 'Culture Explorer', 'Family Traveler', 'Foodie', 'Shopper'], ['Adventure Seeker', 'Backpacker', 'Culture Explorer', 'Family Traveler', 'Foodie', 'Shopper', 'Solo Traveler'], ['Adventure Seeker', 'Culture Explorer', 'Luxury Seeker', 'Shopper'], ['Adventure Seeker', 'Backpacker', 'Culture Explorer', 'Family Traveler', 'Foodie', 'Relaxation Seeker', 'Shopper', 'Solo Traveler'], ['Adventure Seeker', 'Back

## Rule-Based multi-Labeling by person keywords matching


In [63]:
import pandas as pd
import spacy
import nltk
from nltk.stem import PorterStemmer

nltk.download('punkt')      # for tokenization
nltk.download('wordnet')    # for lemmatization (optional)
stemmer = PorterStemmer()
print(stemmer.stem("exploring"))  # Test a word

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Normalize a sentence: lemmatize + stem + lowercase
def normalize(text):
    doc = nlp(text.lower())
    lemmatized = [token.lemma_ for token in doc if token.is_alpha]
    stemmed = [stemmer.stem(word) for word in lemmatized]
    return set(stemmed)

persona_keywords = {
    "Adventure Seeker": [
        "adventure", "hike", "trek", "kayak", "climb", "zipline", "wild", "outdoor", "explore", "trail","extreme", "bungee", "nature park", "adrenaline", "caving"
    ],
    "Backpacker": [
        "backpack", "hostel", "cheap stay", "budget lodging", "roam", "wander", "solo trip", "explore on foot", "minimal", "dorm", "guesthouse", "shoe-string"
    ],
    "Culture Explorer": [
        "museum", "heritage", "gallery", "temple", "historic", "culture", "tradition", "exhibit", "landmark", "cultural", "architecture", "ritual", "ancient", "old town"
    ],
    "Digital Nomad": [
        "work remotely", "coworking", "wifi", "laptop", "digital nomad", "remote work", "cafe with wifi", "stay connected", "online meeting", "workspace", "vpn", "hotspot", "zoom"
    ],
    "Family Traveler": [
        "kids", "children", "family", "baby", "parent", "toddler", "child-friendly", "family-friendly", "theme park", "zoo", "aquarium", "stroller", "activities for children"
    ],
    "Foodie": [
        "eat", "food", "cuisine", "dish", "try food", "hawker", "delicacy", "restaurant", "makan", "dining", "Michelin", "local flavor", "taste", "must eat", "buffet"
    ],
    "Luxury Seeker": [
        "luxury", "spa", "five-star", "resort", "fine dining", "high-end", "upscale", "boutique hotel","premium", "exclusive", "private", "chauffeur", "high-class", "VIP", "opulent"
    ],
    "Relaxation Seeker": [
        "relax", "chill", "unwind", "peaceful", "spa", "retreat", "meditation", "quiet", "beach", "leisure", "serene", "slow pace", "recharge", "massage"
    ],
    "Shopper": [
        "shop", "buy", "shopping", "souvenir", "mall", "boutique", "brands", "sale", "fashion", "luxury goods", "buy", "orchard", "bugis", "retail", "window shopping"
    ],
    "Solo Traveler": [
        "solo", "alone", "by myself", "myself", "me time", "independent", "travel alone", "explore on my own", "unaccompanied", "individual trip", "personal journey"
    ]
}
# Preprocess keywords for each persona
persona_normalized_keywords = {
    persona: set()
    for persona in persona_keywords
}
for persona, keywords in persona_keywords.items():
    for phrase in keywords:
        persona_normalized_keywords[persona].update(normalize(phrase))

print("Normalized persona keywords", persona_normalized_keywords)

# Function to normalize and match query
def match_personas_by_normalized_keywords(query):
    query_stems = normalize(query)
    matched = []

    for persona, stemmed_keywords in persona_normalized_keywords.items():
        if query_stems & stemmed_keywords:  # Any overlap
            matched.append(persona)

    return ";".join(matched)

# def match_personas_by_lemma_stem(query):
#     doc = nlp(query.lower())
#     query_tokens = set(stemmer.stem(token.lemma_) for token in doc if token.is_alpha)

#     matched = []
#     for persona, stemmed_keywords in persona_normalized_keywords.items():
#         if query_tokens & stemmed_keywords:
#             matched.append(persona)
#     return ";".join(matched)

# Apply rule-based persona assignment to all rows in df_labeled_rule_based
# df_labeled_rule_based['before_persona'] = df_labeled_rule_based['persona']  # Backup
df_labeled_rule_based['persona'] = df_labeled_rule_based['user_query'].apply(match_personas_by_normalized_keywords)
df_labeled_rule_based['label_count'] = df_labeled_rule_based['persona'].apply(persona_count)
print("\nRule-based persona labels distribution:", df_labeled_rule_based.shape)
person_distribution(df_labeled_rule_based)

min_label, max_label = 1, 4
# Drop rows where label_count is 0 or more than 4
df_labeled_rule_based = df_labeled_rule_based[(df_labeled_rule_based['label_count'] >= min_label) & (df_labeled_rule_based['label_count'] <= max_label)].copy()
print(f"\nRule-based persona data set: only keep data with personas in range [{min_label},{max_label}]")

# Drop the helper label_count column before saving
df_labeled_rule_based.drop(columns=['label_count'], inplace=True)

# Save
df_labeled_rule_based.to_csv("./persona_data/labeled_persona_rule_based.csv", index=False)



[nltk_data] Downloading package punkt to /Users/dracbook/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dracbook/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


explor
Normalized persona keywords {'Adventure Seeker': {'outdoor', 'park', 'trail', 'climb', 'adrenalin', 'bunge', 'natur', 'wild', 'kayak', 'adventur', 'hike', 'cave', 'ziplin', 'extrem', 'trek', 'explor'}, 'Backpacker': {'backpack', 'solo', 'minim', 'hostel', 'budget', 'lodg', 'guesthous', 'shoe', 'wander', 'string', 'trip', 'on', 'roam', 'foot', 'stay', 'cheap', 'dorm', 'explor'}, 'Culture Explorer': {'tradit', 'landmark', 'exhibit', 'architectur', 'ancient', 'heritag', 'cultur', 'town', 'old', 'galleri', 'histor', 'ritual', 'templ', 'museum'}, 'Digital Nomad': {'work', 'hotspot', 'digit', 'remot', 'meet', 'onlin', 'workspac', 'with', 'connect', 'zoom', 'cafe', 'wifi', 'stay', 'nomad', 'laptop', 'vpn', 'cowork'}, 'Family Traveler': {'park', 'activ', 'friendli', 'child', 'parent', 'aquarium', 'toddler', 'babi', 'stroller', 'famili', 'theme', 'for', 'kid', 'zoo'}, 'Foodie': {'michelin', 'cuisin', 'must', 'dine', 'buffet', 'restaur', 'delicaci', 'local', 'flavor', 'hawker', 'tast', 'd

# Train model

## Load train dataset

In [68]:
# df_1 = pd.read_csv("./persona_data/labeled_persona_dataset.csv")
# df_2 = pd.read_csv("./persona_data/labeled_persona_cosine_sim.csv")
# df_3 = pd.read_csv("./persona_data/labeled_persona_rule_based.csv")

# # Concatenate them into a single DataFrame
# df = pd.concat([df_1, df_2, df_3], ignore_index=True)

# # Drop duplicates, if any
# persona_label_duplicates(df)
# # df.drop_duplicates(subset=["user_query", "persona"], inplace=True)

# # Save to a new CSV for training
# df.to_csv("./persona_data/final_train_dataset.csv", index=False)

df = pd.read_csv("./persona_data/final_train_dataset.csv")

print(df.shape)
print(df.head())

from sklearn.preprocessing import MultiLabelBinarizer

# Split and clean multi-label string into list
label_lists = df['persona'].astype(str).apply(lambda x: [label.strip() for label in x.split(';') if label.strip()]).tolist()

# Binarize the labels
mlb = MultiLabelBinarizer()
label_matrix = mlb.fit_transform(label_lists)
print("Shape of label matrix:", label_matrix.shape)
print("Sample multi-hot encoded labels:\n", label_matrix[:5])

# Save the classes
label_classes = mlb.classes_
print("Multi-label classes:", label_classes)
# Save it after fitting
import joblib
joblib.dump(mlb, "./bert_multilabel_persona/label_encoder.bin")

(6896, 2)
                                          user_query  \
0  Planning to explore top shopping malls and loc...   
1  Seeking premium experiences like rooftop bars,...   
2  Interested in solo-friendly and safe spots to ...   
3  Planning to explore top shopping malls and loc...   
4  Seeking premium experiences like rooftop bars,...   

                               persona  
0              Family Traveler;Shopper  
1      Backpacker;Foodie;Luxury Seeker  
2                        Solo Traveler  
3             Adventure Seeker;Shopper  
4  Shopper;Solo Traveler;Luxury Seeker  
Shape of label matrix: (6896, 10)
Sample multi-hot encoded labels:
 [[0 0 0 0 1 0 0 0 1 0]
 [0 1 0 0 0 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1]
 [1 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 1 0 1 1]]
Multi-label classes: ['Adventure Seeker' 'Backpacker' 'Culture Explorer' 'Digital Nomad'
 'Family Traveler' 'Foodie' 'Luxury Seeker' 'Relaxation Seeker' 'Shopper'
 'Solo Traveler']


['./bert_multilabel_persona/label_encoder.bin']

## Train

In [65]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

class PersonaDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

In [66]:
from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, BertConfig

params = {
    'max_len': 128,
    'test_size': 0.2,
    'random_state': 42,
    'dropout': 0.3,
    'attention_dropout': 0.3,
}

# Tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
# Prepare data
texts = df["user_query"].tolist()
labels = label_matrix  # shape: (n_samples, n_labels)

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=params['test_size'], random_state=params['random_state'])
# Dataset
train_dataset = PersonaDataset(texts, labels, tokenizer, params['max_len'])
val_dataset = PersonaDataset(val_texts, val_labels, tokenizer, params['max_len'])

# Config and Model
num_labels = labels.shape[1]
config = BertConfig.from_pretrained(
    "bert-base-uncased", 
    num_labels=num_labels, 
    problem_type="multi_label_classification", 
    hidden_dropout_prob=params['dropout'], 
    attention_probs_dropout_prob=params['attention_dropout'])

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [67]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

training_args = TrainingArguments(
    output_dir="./bert_multilabel_persona",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=1e-5,        
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

def compute_metrics(pred):
    logits, labels = pred
    preds = (logits > 0).astype(int)  # Multi-label threshold
    return {
        'f1': f1_score(labels, preds, average='micro'),
        'accuracy': accuracy_score(labels, preds),
        'precision': precision_score(labels, preds, average='micro'),
        'recall': recall_score(labels, preds, average='micro')
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()



Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.4412,0.403408,0.494186,0.150725,0.827715,0.352247
2,0.3363,0.308651,0.711235,0.30942,0.851868,0.610456
3,0.2835,0.260074,0.785854,0.452899,0.887952,0.704814
4,0.2552,0.23409,0.821453,0.514493,0.898222,0.756774
5,0.2509,0.225719,0.837401,0.558696,0.908616,0.776538




TrainOutput(global_step=2155, training_loss=0.3369696331134805, metrics={'train_runtime': 800.1044, 'train_samples_per_second': 43.094, 'train_steps_per_second': 2.693, 'total_flos': 2268180206100480.0, 'train_loss': 0.3369696331134805, 'epoch': 5.0})

# Inference

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import joblib

# Load the tokenizer exactly the same as training
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("./bert_multilabel_persona/checkpoint-2155")
label_encoder = joblib.load("./bert_multilabel_persona/label_encoder.bin")  # This is a MultiLabelBinarizer

model.eval()

def predict_personas(text, threshold=0.5):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.sigmoid(logits).squeeze().numpy()  # Convert logits to probabilities

    # Predict all classes above the threshold
    predicted_labels = np.where(probs >= threshold)[0]
    return label_encoder.classes_[predicted_labels]


# Test
user_queries = [
    "We're 6 young adults (25-30) staying in Singapore for 6 days. We love outdoor activities, hiking trails, cycling, and unique experiences like night safaris. Include one rest day and show us images of adventure activities available.",
    "A family with a child in a wheelchair, maximize sightseeing in 3 days in Singapore",
    "We're tech conference attendees, maximize sightseeing in 3 days in Singapore with MRT travel",
    "We're foodies, find adventurous activities in Singapore (in mid-August)",
    "We want a wellness retreat, attend an art festival in Singapore for 3 days only daytime activities",
    "I'm a solo traveler, experience something unique in Singapore (from July 1 to July 5) avoiding crowded places",
    "A couple planning a honeymoon, experience something unique in Singapore (sometime in March) for a weekend including vegan options",
]
for query in user_queries:
    personas = predict_personas(query)
    print(personas)

['Adventure Seeker' 'Family Traveler']
['Adventure Seeker' 'Family Traveler']
[]
['Adventure Seeker' 'Foodie']
['Foodie']
['Adventure Seeker' 'Relaxation Seeker']
['Solo Traveler']
['Family Traveler']
