## Imports

In [None]:
import re
import logging
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from typing import List

In [15]:
# force=True discards any handlers already attached to the root logger, so
# re-running this cell re-applies the format instead of being ignored.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    force=True,
)

# Logger used by the helper functions and cells below.
logger = logging.getLogger(__name__)

## Constants

In [16]:
# Words that are tagged as animal entities when they appear as a token.
ANIMAL_CLASSES = [
    'butterfly',
    'cat',
    'chicken',
    'cow',
    'dog',
    'elephant',
    'horse',
    'sheep',
    'spider',
    'squirrel',
]

# The raw sentence file and the generated JSON files share one directory.
DATA_DIR = Path('../data/texts')
TEXT_PATH = DATA_DIR.joinpath('texts.txt')
OUTPUT_DIR = DATA_DIR

# Create the output directory (and any missing parents) before anything is written.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

## Utility Functions

In [17]:
def read_text_dataset(file_path: str) -> List[str]:
    """Load sentences from a UTF-8 text file, one sentence per line.

    Every line is stripped of surrounding whitespace, and lines that are
    empty after stripping are dropped. The number of sentences kept is
    logged at INFO level.

    Args:
        file_path: Location of the text file to read.

    Returns:
        The stripped, non-empty lines in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as text_file:
        # filter(None, ...) drops the empty strings left by blank lines.
        lines = list(filter(None, map(str.strip, text_file)))
    logger.info(f"Loaded {len(lines)} sentences from {file_path}")
    return lines


def tokenize(sentence: str) -> List[str]:
    """Split a sentence into lowercase word tokens.

    The sentence is lowercased and every maximal run of ``\\w`` characters
    becomes one token. Anything else (spaces, punctuation, apostrophes)
    acts as a separator, so "squirrel's" yields "squirrel" and "s".

    Args:
        sentence: Text to tokenize.

    Returns:
        The tokens in order of appearance; empty if there are no word characters.
    """
    lowered = sentence.lower()
    return [match.group() for match in re.finditer(r'\w+', lowered)]


def create_ner_tags(tokens: List[str], animal_classes: List[str]) -> List[str]:
    """Label each token as an animal mention ('B-ANIMAL') or not ('O').

    Tokens are matched against the class names by exact string equality;
    no case folding or plural handling happens here.

    Args:
        tokens: Tokens to label.
        animal_classes: Names that count as animals. Any iterable of strings
            works, including a one-shot iterator.

    Returns:
        A list the same length as ``tokens`` with 'B-ANIMAL' where the token
        is one of ``animal_classes`` and 'O' everywhere else.
    """
    # Build the lookup once: membership becomes O(1) per token instead of a
    # linear scan, and a one-shot iterable is not exhausted by the first lookup.
    animal_vocab = frozenset(animal_classes)
    return ['B-ANIMAL' if token in animal_vocab else 'O' for token in tokens]


def save_json(data: pd.DataFrame, path: str):
    """Write a DataFrame to ``path`` as JSON and log the record count.

    The frame is serialised with ``orient='records'`` (one JSON object per
    row) and a two-space indent.

    Args:
        data: Frame to serialise.
        path: Destination file.
    """
    data.to_json(path_or_buf=path, orient='records', indent=2)
    logger.info(f"Saved dataset to {path} ({len(data)} records)")

## Create and Tag the Dataset

In [18]:
sentences = read_text_dataset(TEXT_PATH)

# Build the frame in one chain: raw sentence -> tokens -> per-token labels.
df = pd.DataFrame({'sentence': sentences}).assign(
    tokens=lambda frame: frame['sentence'].apply(tokenize),
    labels=lambda frame: frame['tokens'].apply(
        lambda tokens: create_ner_tags(tokens, ANIMAL_CLASSES)
    ),
)

df.info()
display(df.head())

2025-07-04 13:14:51,010 - INFO - Loaded 1246 sentences from ..\data\texts\texts.txt


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1246 entries, 0 to 1245
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  1246 non-null   object
 1   tokens    1246 non-null   object
 2   labels    1246 non-null   object
dtypes: object(3)
memory usage: 29.3+ KB


Unnamed: 0,sentence,tokens,labels
0,A squirrel's front teeth grew continuously thr...,"[a, squirrel, s, front, teeth, grew, continuou...","[O, B-ANIMAL, O, O, O, O, O, O, O, O, O, O, O,..."
1,A crimson cow is fishing peacefully on a branch.,"[a, crimson, cow, is, fishing, peacefully, on,...","[O, O, B-ANIMAL, O, O, O, O, O, O]"
2,A obese boy is running at a butterfly.,"[a, obese, boy, is, running, at, a, butterfly]","[O, O, O, O, O, O, O, B-ANIMAL]"
3,A turquoise cow is sneaking at a butterfly.,"[a, turquoise, cow, is, sneaking, at, a, butte...","[O, O, B-ANIMAL, O, O, O, O, B-ANIMAL]"
4,"The farmer has a cow, a horse, and several sheep.","[the, farmer, has, a, cow, a, horse, and, seve...","[O, O, O, O, B-ANIMAL, O, B-ANIMAL, O, O, B-AN..."


## Save Full Dataset

In [19]:
# Persist the complete tagged dataset before it is split.
full_dataset_path = OUTPUT_DIR / 'ner_dataset.json'
save_json(df, full_dataset_path)

2025-07-04 13:15:29,601 - INFO - Saved dataset to ..\data\texts\ner_dataset.json (1246 records)


## Split the Dataset into Train, Validation, and Test Sets

In [20]:
# Hold out 30% of the rows, then halve the hold-out into validation and test.
# The fixed random_state makes both splits repeatable.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=7)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=7)

# Output filename -> split, written in train / val / test order.
split_files = {
    'train_ner.json': train_df,
    'val_ner.json': val_df,
    'test_ner.json': test_df,
}
for split_filename, split_frame in split_files.items():
    save_json(split_frame, OUTPUT_DIR / split_filename)

logger.info(f"Saved: {len(train_df)} train, {len(val_df)} val, {len(test_df)} test samples.")

2025-07-04 13:16:18,412 - INFO - Saved dataset to ..\data\texts\train_ner.json (872 records)
2025-07-04 13:16:18,419 - INFO - Saved dataset to ..\data\texts\val_ner.json (187 records)
2025-07-04 13:16:18,429 - INFO - Saved dataset to ..\data\texts\test_ner.json (187 records)
2025-07-04 13:16:18,430 - INFO - Saved: 872 train, 187 val, 187 test samples.
