In [1]:
import torch
import numpy as np
import random

In [2]:
def _token_names_mountain(token, mountain):
    """Return True if ``token`` is ``mountain``, optionally followed by a possessive suffix.

    Leading/trailing sentence punctuation (``.,!?``) is ignored, so "Everest."
    and "Everest's" both count as the mountain "Everest".
    """
    core = token.strip('.,!?')
    # Accept both the ASCII and the typographic apostrophe in the possessive.
    return core == mountain or core in (mountain + "'s", mountain + "\u2019s")


def generate_synthetic_data(mountains, templates, num_samples=1000, split_ratio=0.8, save_path=None, seed=None):
    """Generate templated sentences with token-level mountain tags and split them.

    Every sample picks a random mountain and a random template, fills the
    template's ``{mountain}`` placeholder, splits the sentence on whitespace
    and tags each token that names the mountain with ``"B-MOUNTAIN"``; all
    other tokens are tagged ``"O"``.

    Args:
        mountains: Sequence of mountain names to sample from.
        templates: Sequence of format strings containing ``{mountain}``.
        num_samples: Number of sentences to generate.
        split_ratio: Fraction of the samples returned as the training split.
        save_path: Currently unused; kept so existing calls that pass it
            keep working.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used, so the output is reproducible and the global random state
            is left untouched. When ``None``, the module-level ``random``
            functions are used.

    Returns:
        A 4-tuple ``(train_texts, train_labels, val_texts, val_labels)`` where
        the ``*_texts`` entries are lists of sentences and the ``*_labels``
        entries are lists of per-token tag lists, aligned with
        ``sentence.split()``.

    Note:
        Matching is done per whitespace-separated token, so a mountain name
        that itself contains a space is never tagged.
    """
    rng = random if seed is None else random.Random(seed)
    texts, labels_list = [], []  # labels_list[i] is the tag sequence for texts[i]

    for _ in range(num_samples):
        # Draw the mountain first, then the template (keeps the draw order stable).
        mountain = rng.choice(mountains)
        text = rng.choice(templates).format(mountain=mountain)
        labels = [
            "B-MOUNTAIN" if _token_names_mountain(word, mountain) else "O"
            for word in text.split()
        ]
        texts.append(text)
        labels_list.append(labels)

    split = int(split_ratio * num_samples)
    return texts[:split], labels_list[:split], texts[split:], labels_list[split:]

In [3]:
# Single-word mountain names sampled when filling the sentence templates.
# Order is significant: it determines what a seeded random.choice() returns.
mountains = [
    "Everest", "K2", "Kangchenjunga", "Lhotse", "Makalu",
    "Dhaulagiri", "Annapurna", "Gasherbrum", "Shishapangma", "Aconcagua",
    "Denali", "Eiger", "Elbrus", "Vinson", "Carstensz",
    "Kosciuszko", "Matterhorn", "Huascaran", "Fuji", "Rainier",
    "McKinley", "Logan", "Citlaltepetl", "Kilimanjaro", "Masherbrum",
    "Rakaposhi", "Noshaq", "Pobeda", "Kongur", "Batura",
    "Korzhenevskaya", "Manaslu", "Chimborazo", "Changtse", "Jungfrau",
]

In [4]:
# Sentence templates for synthetic data generation.
# Every template contains the `{mountain}` placeholder, which
# generate_synthetic_data fills via str.format(mountain=...).
sentences_templates = [
    "I successfully summited {mountain} in 2024.",
    "The expedition to {mountain} was canceled due to bad weather.",
    "Climbing {mountain} is every mountaineer's dream.",
    "The view from the top of {mountain} is absolutely breathtaking.",
    "Many climbers have lost their lives attempting {mountain}.",
    "I trained for two years to climb {mountain}.",
    "The south face of {mountain} is considered the most dangerous route.",
    "We set up base camp at the foot of {mountain}.",
    "The summit of {mountain} was covered in fresh snow.",
    "Local guides know the safest paths up {mountain}.",
    "I carried an oxygen tank above 8000m on {mountain}.",
    "The weather on {mountain} changes rapidly and unpredictably.",
    "{mountain} has never been climbed in winter.",
    "We reached Camp 4 on {mountain} after 12 hours of climbing.",
    "The Hillary Step on {mountain} is a famous bottleneck.",
    "Sherpas play a crucial role in expeditions to {mountain}.",
    "I left a prayer flag at the summit of {mountain}.",
    "The Khumbu Icefall is the most dangerous section below {mountain}.",
    "{mountain} is part of the Seven Summits challenge.",
    "The north ridge of {mountain} remains unclimbed.",
    "I met legendary climbers at the {mountain} base camp.",
    "{mountain} casts a long shadow over the surrounding valleys.",
    "The sunrise from {mountain} is a once-in-a-lifetime experience.",
    "We used fixed ropes on the steep sections of {mountain}.",
    "The avalanche risk on {mountain} is extremely high in spring.",
    "I wrote my name in the summit register of {mountain}.",
    "The wind at the top of {mountain} was over 100 km/h.",
    "{mountain} is technically challenging but not extremely high.",
    "We crossed a crevasse field to reach {mountain}.",
    "The documentary about climbing {mountain} won an award.",
    "{mountain} has multiple named routes of varying difficulty.",
    "I trained on smaller peaks before attempting {mountain}.",
    "The permit to climb {mountain} costs thousands of dollars.",
    "{mountain} is visible from hundreds of kilometers away.",
    "We celebrated with champagne after descending {mountain}.",
    "The death zone begins at 7500m on {mountain}.",
    "{mountain} has claimed more lives than any other 8000er.",
    "I carried a photo of my family to the summit of {mountain}.",
    "The rock quality on {mountain} is notoriously poor.",
    "We used crampons and ice axes throughout the climb on {mountain}.",
    "{mountain} is part of a larger massif with several sub-peaks.",
    # NOTE: the only template where the name appears in possessive form; the
    # resulting whitespace token is e.g. "K2's", not the bare mountain name.
    "I felt the effects of altitude sickness above {mountain}'s high camp.",
    "The local monastery blessed our expedition to {mountain}."
]

In [None]:
# Start data generation
# Seed the global RNG first so that re-running the notebook from a fresh
# kernel reproduces exactly the same sentences and the same train/val split.
random.seed(42)
train_texts, train_labels, val_texts, val_labels = generate_synthetic_data(mountains, sentences_templates)

In [None]:
# Pair every sentence with its tag sequence so that each example
# travels as a single (text, labels) tuple
train_data = [(sentence, tags) for sentence, tags in zip(train_texts, train_labels)]
val_data = [(sentence, tags) for sentence, tags in zip(val_texts, val_labels)]

# Peek at the first five training examples
train_data[:5]

[('Fuji is visible from hundreds of kilometers away.',
  ['B-MOUNTAIN', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
 ('I wrote my name in the summit register of Elbrus.',
  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MOUNTAIN']),
 ('The wind at the top of K2 was over 100 km/h.',
  ['O', 'O', 'O', 'O', 'O', 'O', 'B-MOUNTAIN', 'O', 'O', 'O', 'O']),
 ('K2 has multiple named routes of varying difficulty.',
  ['B-MOUNTAIN', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
 ('The wind at the top of Everest was over 100 km/h.',
  ['O', 'O', 'O', 'O', 'O', 'O', 'B-MOUNTAIN', 'O', 'O', 'O', 'O'])]

In [7]:
# Preview the first five (text, labels) pairs of the validation split
val_data[:5]

[('K2 has never been climbed in winter.',
  ['B-MOUNTAIN', 'O', 'O', 'O', 'O', 'O', 'O']),
 ('The summit of Noshaq was covered in fresh snow.',
  ['O', 'O', 'O', 'B-MOUNTAIN', 'O', 'O', 'O', 'O', 'O']),
 ('The summit of Kongur was covered in fresh snow.',
  ['O', 'O', 'O', 'B-MOUNTAIN', 'O', 'O', 'O', 'O', 'O']),
 ('Sherpas play a crucial role in expeditions to Kangchenjunga.',
  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MOUNTAIN']),
 ('Local guides know the safest paths up Huascaran.',
  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MOUNTAIN'])]

Save the training and validation datasets to text files:

In [8]:
# Function to save synthetic data to a text file
def save_synthetic_data(texts, labels_list, save_path=None):
    """Write sentences and their tags to ``save_path``, one token per line.

    Each line holds ``"<word> <label>"``; a blank line terminates every
    sentence. Words come from splitting each text on whitespace and are paired
    with the labels positionally. Nothing is written when ``save_path`` is
    ``None``.
    """
    if save_path is None:
        return

    with open(save_path, 'w', encoding='utf-8') as out_file:
        for sentence, tags in zip(texts, labels_list):
            tokens = sentence.split()
            out_file.writelines(f"{token} {tag}\n" for token, tag in zip(tokens, tags))
            # Blank line marks the sentence boundary
            out_file.write("\n")

In [9]:
# Write the train and validation splits to their own text files
# (one "word label" pair per line, blank line between sentences)
save_synthetic_data(train_texts, train_labels, save_path='train_data.txt')
save_synthetic_data(val_texts, val_labels, save_path='val_data.txt')

In [11]:
# Show the first 20 lines of the saved file
with open('train_data.txt', 'r', encoding='utf-8') as f:
    # Iterate over the file itself, bounded by range(20): the preview then
    # stops at end-of-file instead of printing empty strings for a short file.
    for _, line in zip(range(20), f):
        print(line.strip())

Fuji B-MOUNTAIN
is O
visible O
from O
hundreds O
of O
kilometers O
away. O

I O
wrote O
my O
name O
in O
the O
summit O
register O
of O
Elbrus. B-MOUNTAIN

