In [1]:
import pandas as pd
import random

random.seed(42)  # reproducible

# ──────────────────────────────────────────────────────────────────────────────
# 1) Expanded destination catalog (state, terrain, remoteness 1–5, base_risk)
#    Add/remove entries freely; generator adapts automatically.
# ──────────────────────────────────────────────────────────────────────────────
DESTINATIONS = [
    # Assam
    {"name":"Kaziranga National Park","state":"Assam","terrain":"Wildlife","rem":4,"base_risk":"High"},
    {"name":"Manas National Park","state":"Assam","terrain":"Wildlife","rem":4,"base_risk":"High"},
    {"name":"Majuli Island","state":"Assam","terrain":"Island","rem":3,"base_risk":"Medium"},
    {"name":"Kamakhya Temple","state":"Assam","terrain":"Temple","rem":1,"base_risk":"Low"},
    {"name":"Haflong","state":"Assam","terrain":"Hill","rem":3,"base_risk":"Medium"},
    {"name":"Sualkuchi","state":"Assam","terrain":"Town","rem":1,"base_risk":"Low"},

    # Meghalaya
    {"name":"Shillong","state":"Meghalaya","terrain":"Hill City","rem":2,"base_risk":"Low"},
    {"name":"Cherrapunji","state":"Meghalaya","terrain":"Plateau","rem":3,"base_risk":"Medium"},
    {"name":"Mawsynram","state":"Meghalaya","terrain":"Plateau","rem":3,"base_risk":"Medium"},
    {"name":"Nohkalikai Falls","state":"Meghalaya","terrain":"Waterfall","rem":3,"base_risk":"Medium"},
    {"name":"Mawlynnong Village","state":"Meghalaya","terrain":"Village","rem":2,"base_risk":"Low"},
    {"name":"Dawki (Umngot River)","state":"Meghalaya","terrain":"River","rem":2,"base_risk":"Medium"},
    {"name":"Umiam Lake","state":"Meghalaya","terrain":"Lake","rem":2,"base_risk":"Low"},
    {"name":"Krang Suri Falls","state":"Meghalaya","terrain":"Waterfall","rem":3,"base_risk":"Medium"},

    # Arunachal Pradesh
    {"name":"Tawang Monastery","state":"Arunachal Pradesh","terrain":"Monastery","rem":2,"base_risk":"Low"},
    {"name":"Sela Pass","state":"Arunachal Pradesh","terrain":"High Pass","rem":5,"base_risk":"High"},
    {"name":"Nuranang Waterfall","state":"Arunachal Pradesh","terrain":"Waterfall","rem":3,"base_risk":"Medium"},
    {"name":"Ziro Valley","state":"Arunachal Pradesh","terrain":"Valley","rem":3,"base_risk":"Medium"},
    {"name":"Namdapha National Park","state":"Arunachal Pradesh","terrain":"Wildlife","rem":5,"base_risk":"High"},
    {"name":"Bomdila","state":"Arunachal Pradesh","terrain":"Hill","rem":3,"base_risk":"Medium"},
    {"name":"Siang River","state":"Arunachal Pradesh","terrain":"River","rem":3,"base_risk":"Medium"},
    {"name":"Itanagar","state":"Arunachal Pradesh","terrain":"City","rem":1,"base_risk":"Low"},
    {"name":"Gorichen Peak","state":"Arunachal Pradesh","terrain":"Peak","rem":5,"base_risk":"High"},

    # Sikkim
    {"name":"Gangtok","state":"Sikkim","terrain":"City","rem":1,"base_risk":"Low"},
    {"name":"Tsomgo Lake (Changu)","state":"Sikkim","terrain":"Lake","rem":4,"base_risk":"Medium"},
    {"name":"Nathula Pass","state":"Sikkim","terrain":"High Pass","rem":5,"base_risk":"High"},
    {"name":"Yumthang Valley","state":"Sikkim","terrain":"Valley","rem":4,"base_risk":"High"},
    {"name":"Gurudongmar Lake","state":"Sikkim","terrain":"High Lake","rem":5,"base_risk":"High"},
    {"name":"Goechala","state":"Sikkim","terrain":"Trek Pass","rem":5,"base_risk":"High"},
    {"name":"Zero Point (Yumesamdong)","state":"Sikkim","terrain":"High Plateau","rem":5,"base_risk":"High"},
    {"name":"Pelling","state":"Sikkim","terrain":"Hill Town","rem":2,"base_risk":"Medium"},
    {"name":"Ravangla","state":"Sikkim","terrain":"Hill Town","rem":2,"base_risk":"Medium"},
    {"name":"Namchi","state":"Sikkim","terrain":"Town","rem":2,"base_risk":"Low"},

    # Nagaland
    {"name":"Kohima","state":"Nagaland","terrain":"City","rem":1,"base_risk":"Low"},
    {"name":"Dzukou Valley","state":"Nagaland","terrain":"Valley Trek","rem":4,"base_risk":"High"},
    {"name":"Khonoma Village","state":"Nagaland","terrain":"Village","rem":2,"base_risk":"Medium"},

    # Manipur
    {"name":"Imphal","state":"Manipur","terrain":"City","rem":1,"base_risk":"Low"},
    {"name":"Loktak Lake","state":"Manipur","terrain":"Lake","rem":2,"base_risk":"Low"},
    {"name":"Keibul Lamjao National Park","state":"Manipur","terrain":"Wildlife","rem":3,"base_risk":"Medium"},
    {"name":"Ukhrul","state":"Manipur","terrain":"Hill","rem":3,"base_risk":"Medium"},
    {"name":"Moreh","state":"Manipur","terrain":"Border Town","rem":4,"base_risk":"High"},

    # Mizoram
    {"name":"Aizawl","state":"Mizoram","terrain":"City","rem":1,"base_risk":"Low"},
    {"name":"Vantawng Falls","state":"Mizoram","terrain":"Waterfall","rem":3,"base_risk":"Medium"},
    {"name":"Phawngpui (Blue Mountain)","state":"Mizoram","terrain":"Peak","rem":4,"base_risk":"High"},
    {"name":"Champhai","state":"Mizoram","terrain":"Hill Town","rem":3,"base_risk":"Medium"},

    # Tripura
    {"name":"Tripura Sundari Temple","state":"Tripura","terrain":"Temple","rem":1,"base_risk":"Low"},
    {"name":"Neermahal","state":"Tripura","terrain":"Palace/Lake","rem":2,"base_risk":"Low"},
    {"name":"Unakoti","state":"Tripura","terrain":"Heritage Site","rem":2,"base_risk":"Medium"},
    {"name":"Jampui Hills","state":"Tripura","terrain":"Hill","rem":3,"base_risk":"Medium"},
    {"name":"Sepahijala Wildlife Sanctuary","state":"Tripura","terrain":"Wildlife","rem":2,"base_risk":"Medium"},

    # North Bengal (gateway to NE Himalaya)
    {"name":"Darjeeling","state":"West Bengal","terrain":"Hill Town","rem":2,"base_risk":"Low"},
    {"name":"Sandakphu","state":"West Bengal","terrain":"Trek Peak","rem":4,"base_risk":"High"},
    {"name":"Gorumara National Park","state":"West Bengal","terrain":"Wildlife","rem":3,"base_risk":"Medium"},
]

# ──────────────────────────────────────────────────────────────────────────────
# 2) Helper utilities
# ──────────────────────────────────────────────────────────────────────────────
def choose_people_bucket():
    # Slightly more 1–3, fewer large groups
    bucket = random.choices(["1","2","3","4","5+"], weights=[0.05,0.1,0.2,0.25,0.45])[0]
    num = 5 if bucket == "5+" else int(bucket)
    return num, bucket

def time_from_purpose(purpose):
    if purpose == "Night Stay":
        return random.choices(["Day","Night"], weights=[0.2,0.8])[0]
    elif purpose == "Trekking":
        return random.choices(["Day","Night"], weights=[0.75,0.25])[0]
    else:
        return random.choices(["Day","Night"], weights=[0.85,0.15])[0]

def mode_from_age_and_group(age, num_people):
    if num_people == 1:
        return random.choices(["Trekking","Bike","Cab"], weights=[0.5,0.3,0.2])[0]
    if num_people >= 4:
        return random.choices(["Bus","Cab"], weights=[0.6,0.4])[0]
    # age influence for 2–3 people
    if age == "18-25":
        return random.choices(["Cab","Bus","Trekking","Bike"], weights=[0.2,0.1,0.45,0.25])[0]
    elif age == "46+":
        return random.choices(["Cab","Bus","Trekking","Bike"], weights=[0.55,0.3,0.05,0.10])[0]
    else:
        return random.choice(["Cab","Bus","Trekking","Bike"])

def dist_from_remoteness(rem):
  if rem >= 4:  # very remote
    dist_safe = round(random.uniform(5, 12), 2)   # safe zones are far
    dist_risk = round(random.uniform(0.1, 1.5), 2) # risky zones are nearby
  elif rem == 3:  # moderately remote
    dist_safe = round(random.uniform(2, 6), 2)
    dist_risk = round(random.uniform(0.2, 2.5), 2)
  else:  # well-connected
    dist_safe = round(random.uniform(0.5, 3), 2)
    dist_risk = round(random.uniform(0.5, 3.0), 2)
  return dist_safe, dist_risk


def restricted_prob(base_risk, rem):
    base = {"High":0.30, "Medium":0.15, "Low":0.05}[base_risk]
    bump = max(0, rem - 3) * {"High":0.05, "Medium":0.03, "Low":0.02}[base_risk]
    return min(base + bump, 0.8)

def connectivity_from_rem(rem):
    if rem <= 2: return "Good"
    if rem == 3: return "Moderate"
    return "Poor"

def weather_random():
    return random.choices(["Clear","Rain","Fog"], weights=[0.7,0.2,0.1])[0]

def risk_weight_from_area(base_risk):
    return {"High":0.45, "Medium":0.25, "Low":0.10}[base_risk]

def remoteness_risk(rem):
    # small monotonic bump for remoteness
    return 0.03 * (rem - 1)  # rem=5 → +0.12

def mode_risk(mode):
    if mode in ["Trekking","Bike"]:
        return 0.10
    if mode == "Bus":
        return -0.02  # slightly safer
    return 0.00  # Cab neutral

def purpose_risk(purpose):
    return 0.05 if purpose in ["Night Stay","Trekking"] else 0.00

def weather_risk(weather):
    return {"Clear":0.0, "Rain":0.08, "Fog":0.06}[weather]

def connectivity_risk(conn):
    return {"Good":0.0, "Moderate":0.03, "Poor":0.07}[conn]

# ──────────────────────────────────────────────────────────────────────────────
# 3) Generate dataset
# ──────────────────────────────────────────────────────────────────────────────
destinations = DESTINATIONS  # alias
people_age_opts = ["18-25", "26-35", "36-45", "46+"]
gender_options = ["Male", "Female"]
purpose_options = ["Sightseeing", "Trekking", "Night Stay", "Pilgrimage"]

num_rows = 2000
rows = []

for _ in range(num_rows):
    d = random.choice(destinations)

    destination = d["name"]
    state = d["state"]
    terrain = d["terrain"]
    rem = d["rem"]
    area_risk = d["base_risk"]  # baseline area risk (not per-trip)

    age = random.choice(people_age_opts)
    gender = random.choice(gender_options)
    purpose = random.choice(purpose_options)

    num_people, people_bucket = choose_people_bucket()
    time_of_visit = time_from_purpose(purpose)
    mode_of_travel = mode_from_age_and_group(age, num_people)

    # Restricted zone likelihood scales with base risk + remoteness
    p_restricted = restricted_prob(area_risk, rem)
    inside_restricted = 1 if random.random() < p_restricted else 0

    # Distances scale with remoteness
    dist_safe, dist_risk = dist_from_remoteness(rem)

    # Extra realism factors
    weather = weather_random()
    connectivity = connectivity_from_rem(rem)

    # ── Risk score aggregation (0–1)
    risk_score = 0.0
    risk_score += risk_weight_from_area(area_risk)
    risk_score += remoteness_risk(rem)
    risk_score += 0.2 if time_of_visit == "Night" else 0.0
    if time_of_visit == "Night" and num_people == 1:  # solo at night
        risk_score += 0.25
    # group size
    if num_people == 1: risk_score += 0.10
    elif num_people in [2,3]: risk_score += 0.05
    elif num_people == 4: risk_score += 0.02
    # mode/purpose
    risk_score += mode_risk(mode_of_travel)
    risk_score += purpose_risk(purpose)
    # environment
    risk_score += weather_risk(weather)
    risk_score += connectivity_risk(connectivity)
    # restricted
    risk_score += 0.20 if inside_restricted == 1 else 0.0
    # distances (small signal)
    risk_score += dist_risk/100 + dist_safe/200
    # small noise & clamp
    risk_score += random.uniform(-0.02, 0.02)
    risk_score = max(0.0, min(1.0, risk_score))

 # Convert risk_score (0–1) into categorical safety levels
if risk_score < 0.25:
    safety_score = "Low"
elif risk_score < 0.5:
    safety_score = "Medium"
elif risk_score < 0.75:
    safety_score = "High"
else:
    safety_score = "Extreme"

    rows.append([
        destination, state, terrain, rem, area_risk,
        num_people, time_of_visit, mode_of_travel, age, gender,
        purpose, inside_restricted, weather, connectivity,
        round(dist_safe,2), round(dist_risk,2), round(risk_score,3), safety_score
    ])


# ──────────────────────────────────────────────────────────────────────────────
# 4) DataFrame + save
#     Keep your original columns, plus extras that you can drop if not needed.
# ──────────────────────────────────────────────────────────────────────────────
cols = [
    "Destination","State","Terrain","Remoteness_1to5","Area_Risk_Level",
    "Num_People","Time_of_Visit","Mode_of_Travel","Age","Gender",
    "Purpose","Inside_Restricted_Zone","Weather","Connectivity",
    "Distance_to_Safe_Zone_km","Distance_to_High_Risk_Zone_km","Risk_Score","Safety_Score"
]

df = pd.DataFrame(rows, columns=cols)
df.to_csv("tourist_safety_dataset_realistic_v2.csv", index=False)

# quick sanity prints
print("✅ Rows:", len(df), "| Unique destinations:", df['Destination'].nunique())
print(df['Safety_Score'].value_counts(normalize=True).round(3))


✅ Rows: 1 | Unique destinations: 1
Safety_Score
Extreme    1.0
Name: proportion, dtype: float64
