# Personas

1. Family Traveler – Kid-friendly attractions, safety-focused.
1. Backpacker – Budget-focused, immersive experiences.
1. Culture Explorer – Heritage sites, museums, local history.
1. Shopper – Malls, street markets, retail focus.
1. Foodie – Culinary experiences, hawker centers, local dishes.
1. Solo Traveler – Independent, flexible, self-guided.
1. Adventure Seeker – Thrill-based activities, outdoor sports.
1. Relaxation Seeker – Beaches, spas, slow-paced itinerary.
1. Luxury Seeker – High-end stays, exclusive experiences.
1. Digital Nomad – Remote work setup, co-working, long stays.


# Train model

## Load train dataset

In [3]:
import pandas as pd
import joblib

df = pd.read_csv("./persona_data/final_train_dataset.csv")

print(df.shape)
print(df.head())

from sklearn.preprocessing import MultiLabelBinarizer

# Split and clean multi-label string into list
label_lists = df['persona'].astype(str).apply(lambda x: [label.strip() for label in x.split(';') if label.strip()]).tolist()

# Binarize the labels
mlb = MultiLabelBinarizer()
label_matrix = mlb.fit_transform(label_lists)
print("Shape of label matrix:", label_matrix.shape)
print("Sample multi-hot encoded labels:\n", label_matrix[:5])

# Save the classes
label_classes = mlb.classes_
print("Multi-label classes:", label_classes)
# Save it after fitting
joblib.dump(mlb, "./roberta_multilabel_persona/label_encoder.bin")


(6896, 2)
                                          user_query  \
0  Planning to explore top shopping malls and loc...   
1  Seeking premium experiences like rooftop bars,...   
2  Interested in solo-friendly and safe spots to ...   
3  Planning to explore top shopping malls and loc...   
4  Seeking premium experiences like rooftop bars,...   

                               persona  
0              Family Traveler;Shopper  
1      Backpacker;Foodie;Luxury Seeker  
2                        Solo Traveler  
3             Adventure Seeker;Shopper  
4  Shopper;Solo Traveler;Luxury Seeker  
Shape of label matrix: (6896, 10)
Sample multi-hot encoded labels:
 [[0 0 0 0 1 0 0 0 1 0]
 [0 1 0 0 0 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1]
 [1 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 1 0 1 1]]
Multi-label classes: ['Adventure Seeker' 'Backpacker' 'Culture Explorer' 'Digital Nomad'
 'Family Traveler' 'Foodie' 'Luxury Seeker' 'Relaxation Seeker' 'Shopper'
 'Solo Traveler']


['./roberta_multilabel_persona/label_encoder.bin']

## Train

In [4]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

class PersonaDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        # Remove token_type_ids if present (for RoBERTa)
        if "token_type_ids" in encoding:
            del encoding["token_type_ids"]
        
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

In [5]:
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizerFast, RobertaConfig, RobertaForSequenceClassification

params = {
    'max_len': 128,
    'test_size': 0.2,
    'random_state': 42,
    'dropout': 0.3,
    'attention_dropout': 0.3,
}

# Tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
# Prepare data
texts = df["user_query"].tolist()
labels = label_matrix  # shape: (n_samples, n_labels)

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=params['test_size'], random_state=params['random_state'])
# Dataset
train_dataset = PersonaDataset(texts, labels, tokenizer, params['max_len'])
val_dataset = PersonaDataset(val_texts, val_labels, tokenizer, params['max_len'])

# Config and Model
num_labels = labels.shape[1]
config = RobertaConfig.from_pretrained(
    "roberta-base",
    num_labels=num_labels,
    problem_type="multi_label_classification",
    hidden_dropout_prob=params['dropout'],
    attention_probs_dropout_prob=params['attention_dropout']
)
model = RobertaForSequenceClassification.from_pretrained("roberta-base", config=config)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

training_args = TrainingArguments(
    output_dir="./roberta_multilabel_persona",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=1e-5,        
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

def compute_metrics(pred):
    logits, labels = pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()  # apply sigmoid
    preds = (probs > 0.5).astype(int)  # multi-label threshold
    return {
        'f1': f1_score(labels, preds, average='micro'),
        'accuracy': accuracy_score(labels, preds),
        'precision': precision_score(labels, preds, average='micro'),
        'recall': recall_score(labels, preds, average='micro')
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()



Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.4414,0.393106,0.472988,0.14058,0.740489,0.347466
2,0.3443,0.310713,0.667155,0.234783,0.784238,0.580491
3,0.3022,0.260711,0.767694,0.385507,0.834831,0.710551
4,0.2747,0.242999,0.789375,0.414493,0.847222,0.738923
5,0.2685,0.234224,0.80597,0.451449,0.861182,0.757412




TrainOutput(global_step=2155, training_loss=0.34662053994402253, metrics={'train_runtime': 679.0963, 'train_samples_per_second': 50.773, 'train_steps_per_second': 3.173, 'total_flos': 2268180206100480.0, 'train_loss': 0.34662053994402253, 'epoch': 5.0})

# Inference

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import joblib

# Load the tokenizer exactly the same as training
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("./roberta_multilabel_persona/checkpoint-2155")
label_encoder = joblib.load("./roberta_multilabel_persona/label_encoder.bin")  # This is a MultiLabelBinarizer

model.eval()

def predict_personas(text, threshold=0.5):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.sigmoid(logits).squeeze().numpy()  # Convert logits to probabilities

    # Predict all classes above the threshold
    predicted_labels = np.where(probs >= threshold)[0]
    return label_encoder.classes_[predicted_labels]


# Test
user_queries = [
    "We're 6 young adults (25-30) staying in Singapore for 6 days. We love outdoor activities, hiking trails, cycling, and unique experiences like night safaris. Include one rest day and show us images of adventure activities available.",
    "A family with a child in a wheelchair, maximize sightseeing in 3 days in Singapore",
    "We're tech conference attendees, maximize sightseeing in 3 days in Singapore with MRT travel",
    "We're foodies, find adventurous activities in Singapore (in mid-August)",
    "We want a wellness retreat, attend an art festival in Singapore for 3 days only daytime activities",
    "I'm a solo traveler, experience something unique in Singapore (from July 1 to July 5) avoiding crowded places",
    "A couple planning a honeymoon, experience something unique in Singapore (sometime in March) for a weekend including vegan options",
]
for query in user_queries:
    personas = predict_personas(query)
    print(personas)

['Adventure Seeker' 'Family Traveler']
['Adventure Seeker' 'Family Traveler']
['Solo Traveler']
['Foodie']
['Luxury Seeker' 'Relaxation Seeker']
['Backpacker' 'Solo Traveler']
['Family Traveler']
