# SMART-TOURISM — Detailed Tourist Behavior Simulation (Notebook)
**Mục tiêu:** Mô phỏng hành vi khách du lịch, quan sát tải tại các điểm tham quan, và xuất lịch trình cuối cùng để phục vụ báo cáo.

Notebook này chia thành các cell rõ ràng: nhập tham số người dùng → load dữ liệu → model entities → chạy simulation → hiển thị logs + bảng + biểu đồ → flow visualization.

**Hướng dẫn:** chỉnh các tham số trong ô *User parameters* và chạy từng ô (Run cells) theo thứ tự.

## 1) User parameters
Chỉnh các biến dưới đây rồi chạy cell code tiếp theo để áp dụng đầu vào.

In [1]:
# --- User parameters (edit these values) ---
USER_PROVINCE = "Hà Nội"       # Province the user wants to simulate (exact match to data)
NUM_TOURISTS = 50              # Number of simulated tourists per day
NUM_DAYS = 5                   # Number of days to simulate
RANDOM_SEED = 42               # For reproducibility (set to None for true randomness)

# Behavior toggles
INCLUDE_PREFERENCES = True     # If True, tourists have category preferences
INCLUDE_WEATHER = True         # If True, weather affects choices (simple model)
PRINT_DAILY_LOGS = True        # If True, notebook will print per-day flow logs

print(f"Parameters set: province={USER_PROVINCE}, tourists={NUM_TOURISTS}, days={NUM_DAYS}")


Parameters set: province=Hà Nội, tourists=50, days=5


## 2) Imports and helper functions

In [2]:
# Imports and helper utilities
import os, json, random, math
from dataclasses import dataclass, field
from typing import List, Dict, Optional
from collections import defaultdict
try:
    import pandas as pd
except Exception as e:
    pd = None
try:
    import matplotlib.pyplot as plt
except Exception as e:
    plt = None

# Utility to safely load JSONL
def load_jsonl(path):
    items = []
    if not os.path.exists(path):
        return items
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line=line.strip()
            if not line:
                continue
            try:
                items.append(json.loads(line))
            except:
                pass
    return items

# Display helper for DataFrame when available
def display_df(name, df):
    # Use caas_jupyter_tools if available (environment helper)
    try:
        from caas_jupyter_tools import display_dataframe_to_user
        display_dataframe_to_user(name, df)
    except Exception:
        # fallback to regular display if pandas available
        if pd is not None:
            display(df)
        else:
            print(df.head() if hasattr(df, 'head') else df)

print('Imports ready.')

Imports ready.


## 3) Load data from project `data/` (fallback sample if not present)

In [3]:
# Try to load tourism data from repository data folder
DATA_DIR = os.path.join(os.getcwd(), '..', 'data') if os.path.exists(os.path.join(os.getcwd(), '..', 'data')) else os.path.join(os.getcwd(), 'data')
if not os.path.exists(DATA_DIR):
    DATA_DIR = os.path.join(os.getcwd(), 'data')  # try current
tourism_path = os.path.join(DATA_DIR, 'vietnam_tourism.jsonl')

tourism_items = load_jsonl(tourism_path)
if tourism_items:
    print(f"Loaded {len(tourism_items)} tourism records from {tourism_path}")
else:
    print("No tourism jsonl found, using fallback sample data.")
    # fallback sample
    tourism_items = [
        { "name": "Bảo tàng A", "province": "Hà Nội", "category": "culture", "popularity": 8 },
        { "name": "Công viên B", "province": "Hà Nội", "category": "park", "popularity": 6 },
        { "name": "Biển C", "province": "Khánh Hòa", "category": "beach", "popularity": 9 },
        { "name": "Đảo D", "province": "Khánh Hòa", "category": "island", "popularity": 5 },
    ]

# Build a minimal structure: provinces -> places
provinces_raw = defaultdict(list)
for it in tourism_items:
    prov = it.get('province') or it.get('city') or 'Unknown'
    provinces_raw[prov].append(it)

print(f"Provinces discovered: {list(provinces_raw.keys())}")
# If USER_PROVINCE is not in data, show available provinces


Loaded 387 tourism records from /Users/phungquochuy/smart-tourism-system/backend/simulation/../data/vietnam_tourism.jsonl
Provinces discovered: ['An Giang', 'Bà Rịa - Vũng Tàu', 'Bình Dương', 'Bình Phước', 'Bình Thuận', 'Bình Định', 'Bạc Liêu', 'Bắc Giang', 'Bắc Kạn', 'Bắc Ninh', 'Hà Nội', 'Bến Tre', 'Cao Bằng', 'Cà Mau', 'Cần Thơ', 'Gia Lai', 'Hà Giang', 'Hà Nam', 'Hà Tĩnh', 'Hòa Bình', 'Hưng Yên', 'Hải Dương', 'Hải Phòng', 'Hậu Giang', 'Khánh Hòa', 'Kiên Giang', 'Kon Tum', 'Lai Châu', 'Lạng Sơn', 'Long An', 'Lào Cai', 'Lâm Đồng', 'Nam Định', 'Nghệ An', 'Ninh Bình', 'Ninh Thuận', 'Phú Thọ', 'Phú Yên', 'Quảng Bình', 'Quảng Nam', 'Quảng Ngãi', 'Quảng Ninh', 'Quảng Trị', 'Sóc Trăng', 'Sơn La', 'Thanh Hóa', 'Thái Bình', 'Thái Nguyên', 'Thừa Thiên Huế', 'Tiền Giang', 'TP. Hồ Chí Minh', 'Trà Vinh', 'Tuyên Quang', 'Tây Ninh', 'Vĩnh Long', 'Vĩnh Phúc', 'Yên Bái', 'Điện Biên', 'Đà Nẵng', 'Đắk Lắk', 'Đắk Nông', 'Đồng Nai', 'Đồng Tháp']


## 4) Entities and Behavior Model

In [4]:
@dataclass
class Place:
    name: str
    category: str
    popularity: float
    daily_limit: int = 20
    visitors_today: int = 0
    id: Optional[int] = None

@dataclass
class Tourist:
    id: int
    preference: Optional[str] = None   # preferred category
    flexible: bool = True              # willingness to choose alternate place
    budget: float = 1.0                # not used now, reserved for extension
    location: Optional[str] = None

class SimpleWeather:
    # Simple deterministic weather per day: 'clear' or 'rain'
    def __init__(self, seed=None):
        self.seed = seed
        self.rng = random.Random(seed)
    def day_weather(self, day):
        # 20% chance rain
        return 'rain' if self.rng.random() < 0.2 else 'clear'

def score_place_for_tourist(t: Tourist, p: Place, weather: str):
    # Score calculation (higher better)
    score = p.popularity
    # preference boost
    if t.preference and t.preference == p.category:
        score += 3
    # weather penalty for beach if raining
    if weather == 'rain' and p.category == 'beach':
        score -= 4
    return score
print('Model defined.')

Model defined.


## 5) Simulation Engine

In [5]:
class Simulator:
    def __init__(self, province_name, places_data, num_tourists=50, seed=42, include_preferences=True, include_weather=True):
        self.province_name = province_name
        self.places = []
        self.places_by_name = {}
        self.num_tourists = num_tourists
        self.seed = seed
        self.rng = random.Random(seed)
        self.include_preferences = include_preferences
        self.include_weather = include_weather
        # init places
        for i, pd in enumerate(places_data):
            name = pd.get('name') or pd.get('title') or f"Place_{i}"
            category = pd.get('category') or 'general'
            popularity = float(pd.get('popularity') or 5)
            # daily limit heuristic: popularity*3 + base
            daily_limit = int(pd.get('daily_limit', max(5, int(popularity*3))))
            place = Place(name=name, category=category, popularity=popularity, daily_limit=daily_limit, id=i)
            self.places.append(place)
            self.places_by_name[place.name] = place
        # create tourists with random preferences
        cats = list({p.category for p in self.places})
        self.tourists = []
        for i in range(self.num_tourists):
            if self.include_preferences:
                pref = self.rng.choice(cats) if cats else None
                flexible = self.rng.random() < 0.7
            else:
                pref = None
                flexible = True
            self.tourists.append(Tourist(id=i, preference=pref, flexible=flexible))
        self.weather_model = SimpleWeather(seed) if include_weather else None
        # statistics
        self.stats = {p.name: [0] for p in self.places}  # per-day visitors (will append)
        self.daily_logs = []

    def reset_daily_visitors(self):
        for p in self.places:
            p.visitors_today = 0

    def simulate(self, days=5, print_logs=False):
        self.reset_daily_visitors()
        self.daily_logs.clear()
        if self.weather_model:
            weather_rng = self.weather_model
        else:
            weather_rng = None
        for day in range(1, days+1):
            weather = weather_rng.day_weather(day) if weather_rng else 'clear'
            self.daily_logs.append(f"--- Day {day} | weather: {weather} ---")
            # shuffle tourists each day
            self.rng.shuffle(self.tourists)
            for t in self.tourists:
                # score all candidate places
                scored = [(p, score_place_for_tourist(t, p, weather)) for p in self.places]
                # sort descending by score
                scored.sort(key=lambda x: x[1], reverse=True)
                visited = False
                for p, sc in scored:
                    # try top choice; if full and flexible, try next
                    if p.visitors_today < p.daily_limit:
                        p.visitors_today += 1
                        t.location = p.name
                        visited = True
                        self.daily_logs.append(f"Tourist {t.id} -> {p.name} (score={sc:.1f}, pref={t.preference}, flexible={t.flexible})")
                        break
                    else:
                        if not t.flexible:
                            # won't try others
                            self.daily_logs.append(f"Tourist {t.id} prefers {p.name} but full -> no_visit (non-flex)")
                            visited = False
                            break
                        else:
                            # try next option
                            continue
                if not visited:
                    t.location = None
                    self.daily_logs.append(f"Tourist {t.id} -> No visit")
            # end day: record stats
            for p in self.places:
                self.stats[p.name].append(p.visitors_today)
                self.daily_logs.append(f"{p.name} visitors_today: {p.visitors_today} / limit {p.daily_limit}")
            # reset daily counters for next day
            self.reset_daily_visitors()
        return self.daily_logs

    def itinerary_summary(self):
        # produce aggregated itinerary (most visited per day not stored individually here)
        df_rows = []
        days = len(next(iter(self.stats.values()))) - 1  # because we init with [0]
        for pname, counts in self.stats.items():
            total = sum(counts[1:])  # skip initial zero
            df_rows.append({'place': pname, 'total_visitors': total, 'per_day': counts[1:]})
        if pd is not None:
            df = pd.DataFrame(df_rows).sort_values('total_visitors', ascending=False).reset_index(drop=True)
        else:
            df = df_rows
        return df

print('Simulator class ready.')

Simulator class ready.


## 6) Instantiate Simulator (using user inputs)

In [6]:
# Prepare places for chosen province
AVAILABLE_PROVINCES = list(provinces_raw.keys())
if USER_PROVINCE not in AVAILABLE_PROVINCES:
    print('Warning: chosen province not in data. Available provinces:', AVAILABLE_PROVINCES)
    # fallback to first
    province_name = AVAILABLE_PROVINCES[0] if AVAILABLE_PROVINCES else USER_PROVINCE
else:
    province_name = USER_PROVINCE

places_data = provinces_raw.get(province_name, []) or tourism_items  # if empty use full list
sim = Simulator(province_name=province_name, places_data=places_data, num_tourists=NUM_TOURISTS, seed=RANDOM_SEED, include_preferences=INCLUDE_PREFERENCES, include_weather=INCLUDE_WEATHER)
print(f"Simulator ready for province={province_name} with {len(sim.places)} places and {len(sim.tourists)} tourists.")

Simulator ready for province=Hà Nội with 10 places and 50 tourists.


## 7) Run Simulation

In [7]:
# Run simulation and capture logs
logs = sim.simulate(days=NUM_DAYS, print_logs=PRINT_DAILY_LOGS)
# Optionally print logs (controlled by PRINT_DAILY_LOGS)
if PRINT_DAILY_LOGS:
    for l in logs:
        print(l)

--- Day 1 | weather: clear ---
Tourist 30 -> Chợ Ninh Hiệp (score=8.0, pref=Mua sắm, flexible=False)
Tourist 15 -> Hồ Hoàn Kiếm (Hồ Gươm) (score=8.0, pref=Đô thị, flexible=True)
Tourist 6 -> Lăng Chủ tịch Hồ Chí Minh (score=8.0, pref=Văn hóa - Lịch sử, flexible=True)
Tourist 35 -> Lăng Chủ tịch Hồ Chí Minh (score=8.0, pref=Văn hóa - Lịch sử, flexible=True)
Tourist 11 -> Chùa Hương (Hương Sơn) (score=8.0, pref=Tâm linh, flexible=True)
Tourist 25 -> Phố cổ Hà Nội (score=8.0, pref=Văn hóa, flexible=False)
Tourist 47 -> Lăng Chủ tịch Hồ Chí Minh (score=8.0, pref=Văn hóa - Lịch sử, flexible=True)
Tourist 2 -> Chùa Hương (Hương Sơn) (score=8.0, pref=Tâm linh, flexible=False)
Tourist 22 -> Chợ Ninh Hiệp (score=8.0, pref=Mua sắm, flexible=True)
Tourist 18 -> Lăng Chủ tịch Hồ Chí Minh (score=8.0, pref=Văn hóa - Lịch sử, flexible=True)
Tourist 12 -> Chợ Ninh Hiệp (score=8.0, pref=Mua sắm, flexible=True)
Tourist 39 -> Hồ Hoàn Kiếm (Hồ Gươm) (score=8.0, pref=Đô thị, flexible=True)
Tourist 23 -> Ph

## 8) Itinerary summary and visualizations

In [8]:
summary_df = sim.itinerary_summary()
print('Itinerary summary (places ranked by total visitors):')
display_df('Itinerary Summary', summary_df)

# Simple bar chart of total visitors per place
try:
    if plt is None:
        print('matplotlib not available to draw charts.')
    else:
        if pd is not None and not isinstance(summary_df, list):
            x = summary_df['place']
            y = summary_df['total_visitors']
        else:
            x = [r['place'] for r in summary_df]
            y = [r['total_visitors'] for r in summary_df]
        plt.figure(figsize=(8,4))
        plt.bar(x, y)
        plt.title('Total visitors per place (simulated)')
        plt.ylabel('Total visitors')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()
except Exception as e:
    print('Unable to plot:', e)

Itinerary summary (places ranked by total visitors):
[{'place': 'Chợ Ninh Hiệp', 'total_visitors': 55, 'per_day': [11, 11, 11, 11, 11]}, {'place': 'Lăng Chủ tịch Hồ Chí Minh', 'total_visitors': 55, 'per_day': [11, 11, 11, 11, 11]}, {'place': 'Hồ Hoàn Kiếm (Hồ Gươm)', 'total_visitors': 45, 'per_day': [9, 9, 9, 9, 9]}, {'place': 'Văn Miếu - Quốc Tử Giám', 'total_visitors': 0, 'per_day': [0, 0, 0, 0, 0]}, {'place': 'Hoàng thành Thăng Long', 'total_visitors': 0, 'per_day': [0, 0, 0, 0, 0]}, {'place': 'Cầu Long Biên', 'total_visitors': 0, 'per_day': [0, 0, 0, 0, 0]}, {'place': 'Phố cổ Hà Nội', 'total_visitors': 50, 'per_day': [10, 10, 10, 10, 10]}, {'place': 'Làng gốm Bát Tràng', 'total_visitors': 0, 'per_day': [0, 0, 0, 0, 0]}, {'place': 'Quảng trường Ba Đình', 'total_visitors': 0, 'per_day': [0, 0, 0, 0, 0]}, {'place': 'Chùa Hương (Hương Sơn)', 'total_visitors': 45, 'per_day': [9, 9, 9, 9, 9]}]
matplotlib not available to draw charts.


## 9) Flow visualization (high-level)

In [9]:
# Create a simple linear flow diagram showing major steps using matplotlib annotations
try:
    if plt is None:
        print('matplotlib not available for flow visualization.')
    else:
        steps = ['Init data', 'Create entities', 'Simulate per day', 'Decision rules', 'Collect stats', 'Output itinerary']
        fig, ax = plt.subplots(figsize=(10,2))
        ax.axis('off')
        xpos = list(range(len(steps)))
        ypos = [0]*len(steps)
        for i, s in enumerate(steps):
            ax.annotate(s, xy=(xpos[i], ypos[i]), xytext=(xpos[i], 0), bbox=dict(boxstyle='round', alpha=0.2))
            if i < len(steps)-1:
                ax.annotate('', xy=(xpos[i]+0.3, ypos[i]), xytext=(xpos[i+1]-0.3, ypos[i]),
                            arrowprops=dict(arrowstyle='->', lw=1.5))
        ax.set_xlim(-0.5, len(steps)-0.5)
        ax.set_ylim(-1,1)
        plt.title('High-level simulation flow')
        plt.axis('off')
        plt.show()
except Exception as e:
    print('Flow visualization failed:', e)

matplotlib not available for flow visualization.


## 10) Export final itinerary and logs

In [10]:
import os, json

# Lấy thư mục gốc backend (folder chứa app/, tests/, simulation/, ...)
BACKEND_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Tạo thư mục simulation tại đúng backend/
out_dir = os.path.join(BACKEND_ROOT, "simulation")
os.makedirs(out_dir, exist_ok=True)

# File output
out_path = os.path.join(out_dir, "simulation_itinerary_output.json")

# Build output
output = {
    'province': sim.province_name,
    'num_tourists': sim.num_tourists,
    'days': NUM_DAYS,
    'summary': summary_df.to_dict(orient='records') if hasattr(summary_df, 'to_dict') else summary_df,
    'logs': logs
}

with open(out_path, "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

print("Exported simulation output to", out_path)


Exported simulation output to /Users/phungquochuy/smart-tourism-system/backend/simulation/simulation_itinerary_output.json


### Ghi chú kết thúc
- Chạy tuần tự các cell để hiểu flow.
- Bạn có thể mở file `/mnt/data/simulation_itinerary_output.json` để lấy kết quả tự động dùng trong báo cáo.
- Muốn thêm weather model, distance constraints, hoặc multi-province routing thì mình sẽ mở rộng simulator. 