# Dataset creation process

This notebook aims to create a dataset of mountain names and generate new training sentences for Named Entity Recognition (NER) tasks. It includes importing data, extracting mountain names, and saving the results.

## 1. Import necessary libraries

In [1]:
# Import key libraries for data manipulation and processing

import json
import random
import pyperclip
import re
import pandas as pd

## 2. Load dataset containing mountain names

In [4]:
# Load CSV file that contains information about various mountains including their synonyms
file_path = "../data/raw_data/Top_montains_data.csv"
mountain_df = pd.read_csv(file_path)
mountain_df.head()

Unnamed: 0,rank,Mountain name(s),Height_m,Height_ft,Prominence_m,Prominence_ft,Range,coordinates,Parent mountain,ascents_first,ascents_before_2004_successful,ascents_before_2004_unsuccessful,country
0,1,Mount Everest\nSagarmath\nChomolungma,8848.86,29031.7,8848,29029,Mahalangur Himalaya,27°59′17″N 86°55′31″E,—,1953,145,121,Nepal\nChina
1,2,K2,8611.0,28251.0,4020,13190,Baltoro Karakoram,35°52′53″N 76°30′48″E,Mount Everest,1954,45,44,Pakistan\nChina
2,3,Kangchenjunga,8586.0,28169.0,3922,12867,Kangchenjunga Himalaya,27°42′12″N 88°08′51″E,Mount Everest,1955,38,24,Nepal\nIndia
3,4,Lhotse,8516.0,27940.0,610,2000,Mahalangur Himalaya,27°57′42″N 86°55′59″E,Mount Everest,1956,26,26,Nepal\nChina
4,5,Makalu,8485.0,27838.0,2378,7802,Mahalangur Himalaya,27°53′23″N 87°05′20″E,Mount Everest,1955,45,—,Nepal\nChina


## 3. Extract mountains names from the dataset

In [6]:
# Define a function to extract the main mountain names and their synonyms
# This function returns a list of lists where each inner list contains the primary name and it's synonyms
def extract_mountain_names_from_csv(df):

    """
    Extract mountain names and their synonyms from the dataset.

    Args:
        df (DataFrame): A dataframe containing mountain information.

    Returns:
        list: A list of lists containing the main names and synonyms of mountains.
    """
    
    mountain_names = []
    for _, row in df.iterrows():
        # Check if mountain names separated by new lines or commas
        names = row["Mountain name(s)"].split('\n') if '\n' in row["Mountain name(s)"] else row["Mountain name(s)"].split(',')
        names = [name.strip() for name in names if name.strip()] # Clean whitespace
        mountain_names.append(names) # Add the cleaned names to the list

    return mountain_names

# Extract the cleaned mountain names
mountain_names = extract_mountain_names_from_csv(mountain_df)
print(mountain_names)

[['Mount Everest', 'Sagarmath', 'Chomolungma'], ['K2'], ['Kangchenjunga'], ['Lhotse'], ['Makalu'], ['Cho Oyu'], ['Dhaulagiri I'], ['Manaslu'], ['Nanga Parbat'], ['Annapurna I'], ['Gasherbrum I', 'Hidden Peak', 'K5'], ['Broad Peak'], ['Gasherbrum II', 'K4'], ['Shishapangma', 'Gosainthan'], ['Gyachung Kang'], ['Gasherbrum III', 'K3a'], ['Annapurna II'], ['Gasherbrum IV', 'K3'], ['Himalchuli'], ['Distaghil Sar'], ['Ngadi Chuli'], ['Nuptse'], ['Khunyang Chhish'], ['Masherbrum', 'K1'], ['Nanda Devi'], ['Chomo Lonzo'], ['Batura Sar'], ['Rakaposhi'], ['Namcha Barwa'], ['Kanjut Sar'], ['Kamet'], ['Dhaulagiri II'], ['Saltoro Kangri', 'K10'], ['Kumbhakarna', 'Jannu'], ['Tirich Mir'], ['Molamenqing'], ['Gurla Mandhata'], ['Saser Kangri I', 'K22'], ['Chogolisa'], ['Dhaulagiri IV'], ['Kongur Tagh'], ['Dhaulagiri V'], ['Shispare'], ['Trivor'], ['Gangkhar Puensum'], ['Gongga Shan', 'Minya Konka'], ['Annapurna III'], ['Skyang Kangri'], ['Changtse'], ['Kula Kangri'], ['Kongur Tiube'], ['Annapurna IV'],

## 4. Divide mountain names into groups

In [8]:
def divide_mountain_names(mountain_names):

    """
    Divide mountain names into separate groups.

    Args:
        mountain_names (list): A list of mountain names and their synonyms.

    Returns:
        list: A list of grouped mountain names.
    """

    separate_mountains = [] # List to hold mountains that have synonyms (treated as separate groups)
    grouped_mountains = [] # List to hold groups of mountains without synonyms
    current_group = [] # Temporary list to create groups of 2-4 mountains

    # Shuffle the list to introduce randomness in grouping
    random.shuffle(mountain_names)

    for names in mountain_names:
        if len(names) >= 2:
            # If the mountain has synonyms, add it directly to separate_mountains
            separate_mountains.append(names)
        else:
            # If the mountain does not have synonyms, add it to the current group
            current_group.extend(names)
            # Once the current group has between 2 to 4 mountains, add it to grouped_mountains
            if 2 <= len(current_group) <= 4:
                grouped_mountains.append(current_group)
                current_group = []
    
    # If there are remaining mountains in the current group, add them as a group
    if current_group:
        grouped_mountains.append(current_group)
    
    return separate_mountains + grouped_mountains

divided_mountain_names = divide_mountain_names(mountain_names)
print(divided_mountain_names)

[['Labuche Kang III', 'Labuche Kang East'], ['Gongga Shan', 'Minya Konka'], ['Baintha Brakk', 'The Ogre'], ['Jomolhari', 'Chomo Lhari'], ['Jengish Chokusu', 'Tömür', 'Pik Pobedy'], ['The Crown', 'Huang Guan Shan'], ['Gimmigela Chuli', 'The Twins'], ['Masherbrum', 'K1'], ['Saltoro Kangri', 'K10'], ['Shishapangma', 'Gosainthan'], ['Kangpenqing', 'Gang Benchhen'], ['Gasherbrum II', 'K4'], ['Kangphu Kang', 'Shimokangri'], ['Gasherbrum I', 'Hidden Peak', 'K5'], ['Sunanda Devi', 'Nanda Devi East'], ['Yangra', 'Ganesh I'], ['Noijin Kangsang', 'Norin Kang'], ['Gasherbrum IV', 'K3'], ['Gasherbrum III', 'K3a'], ['Saser Kangri I', 'K22'], ['K6', 'Baltistan Peak'], ['Kumbhakarna', 'Jannu'], ['Mount Everest', 'Sagarmath', 'Chomolungma'], ['Muztagh Ata', 'Annapurna Dakshin (Annapurna South)'], ['Apsarasas Kangri', 'Pumari Chhish'], ['Kula Kangri', 'Annapurna IV'], ['Mukut Parbat', 'Kanjut Sar'], ['Dhaulagiri I', 'Ultar'], ['Annapurna I', 'Skil Brum'], ['Kangchenjunga', 'Skyang Kangri'], ['Noshaq', '

## 5. Copy groups to clipboard

In [10]:
# Convert the groups into a readable format and copy them to the clipboard for easy access
groups_string = json.dumps(divided_mountain_names, indent=4)
pyperclip.copy(groups_string)
print("Groups of mountains have been copied to the clipboard.")

Groups of mountains have been copied to the clipboard.


## 6. Generate texts with mountain names using ChatGPT

To generate textual data containing mountain names for Named Entity Recognition (NER) model training, we used ChatGPT with a specific prompt structure. The objective is to create sentences where each mountain name appears at the beginning, middle, and end of the text.
### Steps:
1. **Define the Prompt**:
   First, the mountain groups (which include both names and synonyms) were copied and sent to ChatGPT using the following prompt:

   "Write short texts about mountains. They are divided into groups, some groups contain mountain names and their synonyms, some groups contain random mountains from 2 to 4 in a group. You need to make the following texts:

   They must necessarily contain the names of all the mountains in the group.
   The text must be the most ordinary, without any markings, lists, etc., the most ordinary text, separated by line breaks.
   For each group of names, you need to make as many texts as will satisfy the condition: each name must appear at the beginning, in the middle of the text, and at the end.

   For example, the group: ['Mount Everest', 'Sagarmath', 'Chomolungma'].
   For it, you need to make three sentences:

   1. In which Mount Everest is at the beginning, Sagarmath is in the middle, Chomolungma is at the end.
   2. Chomolungma is at the beginning, Mount Everest is in the middle, Sagarmath is at the end.
   3. Sagarmath - at the beginning, Chomolungma - in the middle, Mount Everest - at the end.

   That's it, three will be enough. These are texts, or rather sentences, for training the NER model, so it needs to be able to identify any name anywhere in the text."

2. **Generate the Texts**:

   The response from ChatGPT provided multiple sentences for each group of mountain names, ensuring that each mountain appears in different positions (beginning, middle, and end) within the sentence.

3. **Saving the Generated Text**:

   Once all the texts were generated, they were copied and saved into a text file. This data will be used for further processing or model training.
   Save the generated texts in a file named "generated_mountain_sentences.txt".

## 7. Load generated mountain sentences

In [11]:
# Load the generated mountain sentences that will be used for annotation
generated_sentences_path = "../data/raw_data/generated_mountain_sentences.txt"
with open(generated_sentences_path, 'r', encoding="utf-8") as file:
    generated_sentences = file.readlines()

print(generated_sentences)

['Jengish Chokusu stands tall in the Tien Shan range, with Tömür as its other name, and Pik Pobedy is another towering peak in the same area.  \n', 'Tömür is also known for its majesty, often overshadowed by Jengish Chokusu, but Pik Pobedy stands equal to them all.  \n', 'Pik Pobedy is a striking mountain, sharing the same legacy as Jengish Chokusu and Tömür.\n', '\n', 'Labuche Kang III is a prominent peak, while Labuche Kang East is equally impressive in its beauty.  \n', 'Labuche Kang East rises alongside Labuche Kang III, both mountains being key features of the Tibetan landscape.\n', '\n', 'Baintha Brakk is often referred to as The Ogre due to its challenging ascent.  \n', 'The Ogre, or Baintha Brakk, is known for its steep, jagged peaks.\n', '\n', 'Gasherbrum IV, also called K3, is one of the most difficult mountains to climb.  \n', 'K3, or Gasherbrum IV, towers over the surrounding landscape with its icy ridges.\n', '\n', 'Kangphu Kang and Shimokangri rise majestically in the Him

## 8. Annotate texts automatically with mountain names

In [12]:
# Define a function that will identify all mountain names in the text and annotate them using BIO tagging format
# B - Beginning, I - Inside, O - Outside (for named entity recognition tasks)

def annotate_texts_bio(sentences, groups):

    """
    Annotate texts with mountain names using BIO tagging.

    Args:
        sentences (list): A list of sentences to be annotated.
        groups (list): A list of mountain groups for annotation.

    Returns:
        list: A list of annotated sentences in BIO format.
    """

    annotated_sentences = []
    
    for sentence in sentences:
        tokens = re.findall(r"\w+|[.,!?;]", sentence.strip()) # Split the sentence into tokens, keeping punctuation as separate tokens
        if not tokens:
            continue # Skip empty sentences
        labels = ['O'] * len(tokens) # Initialize all labels as 'O'

        for group in groups:
            for name in group:
                name_tokens = re.findall(r"\w+|[.,!?;]", name) # Split the mountain name into tokens, keeping punctuation as separate tokens
                for i in range(len(tokens)):
                    # Check if the tokens match the mountain name tokens
                    if tokens[i:i + len(name_tokens)] == name_tokens:
                        labels[i] = "B-MOUNTAIN_NAME" # Mark the beginning of the mountain name
                        for j in range(1, len(name_tokens)):
                            labels[i + j] = "I-MOUNTAIN_NAME" # Mark the inside tokens of the mountain name

        annotated_sentences.append({"tokens": tokens, "labels": labels})
    
    return annotated_sentences

# Annotate the generated mountain sentences using BIO tagging
annotated_sentences_bio = annotate_texts_bio(generated_sentences, divided_mountain_names)
print(annotated_sentences_bio)

[{'tokens': ['Jengish', 'Chokusu', 'stands', 'tall', 'in', 'the', 'Tien', 'Shan', 'range', ',', 'with', 'Tömür', 'as', 'its', 'other', 'name', ',', 'and', 'Pik', 'Pobedy', 'is', 'another', 'towering', 'peak', 'in', 'the', 'same', 'area', '.'], 'labels': ['B-MOUNTAIN_NAME', 'I-MOUNTAIN_NAME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MOUNTAIN_NAME', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MOUNTAIN_NAME', 'I-MOUNTAIN_NAME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}, {'tokens': ['Tömür', 'is', 'also', 'known', 'for', 'its', 'majesty', ',', 'often', 'overshadowed', 'by', 'Jengish', 'Chokusu', ',', 'but', 'Pik', 'Pobedy', 'stands', 'equal', 'to', 'them', 'all', '.'], 'labels': ['B-MOUNTAIN_NAME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MOUNTAIN_NAME', 'I-MOUNTAIN_NAME', 'O', 'O', 'B-MOUNTAIN_NAME', 'I-MOUNTAIN_NAME', 'O', 'O', 'O', 'O', 'O', 'O']}, {'tokens': ['Pik', 'Pobedy', 'is', 'a', 'striking', 'mountain', ',', 'sharing', 'the', 'same', 'legacy', 'as', 'Jengish', 'Choku

## 9. Save annotated data to JSON file

In [14]:
# Save the annotated data in JSON format for future use in training NER models
annotated_output_path_bio = "../data/processed_data/annotated_mountain_sentences_bio.json"
with open(annotated_output_path_bio, "w", encoding="utf-8") as json_file:
    json.dump(annotated_sentences_bio, json_file, indent=4)

print(f"Annotated sentences saved to {annotated_output_path_bio}")

Annotated sentences saved to ../data/processed_data/annotated_mountain_sentences_bio.json
