In [67]:
import pandas as pd
import os
import re
import json
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
import nltk
from nltk.stem import WordNetLemmatizer

# Read the variables declared in the .env file into the environment
load_dotenv()

# Look up the NVIDIA API key (None when the variable is not set)
NVIDIA_API_KEY = os.environ.get('NVIDIA_API_KEY')

# Report only whether a key was found; the key itself is never printed
print("API key loaded successfully." if NVIDIA_API_KEY
      else "API key not found in .env file.")

API key loaded successfully.


In [68]:
# Animal names for which example sentences are generated (passed as the
# `animals` argument of generate_sentences below).
# All names are lower-case and singular. A few entries are multi-word
# ("hammerhead shark", "manta ray", "komodo dragon"); preprocess() splits
# such names on whitespace and tags the first word 1 and the rest 2.
popular_animals = [
    "dog", "cat", "horse", "elephant", "butterfly", "chicken", "cow", "sheep",
    "squirrel", "spider", "lion", "tiger", "bear", "wolf", "giraffe", "zebra",
    "rabbit", "fox", "deer", "panda", "kangaroo", "monkey", "dolphin", "shark",
    "whale", "penguin", "eagle", "owl", "parrot", "snake", "turtle", "crocodile",
    "octopus", "jellyfish", "frog", "ant", "bee", "bat", "rat", "mouse",
    "cheetah", "leopard", "raccoon", "hippopotamus", "rhinoceros", "goat", "buffalo",
    "chameleon", "cobra", "peacock", "ostrich", "flamingo", "seal", "walrus",
    "beaver", "skunk", "armadillo", "porcupine", "hedgehog", "chipmunk", "badger",
    "mole", "platypus", "sloth", "meerkat", "lynx", "cougar", "jaguar", "hyena",
    "coyote", "bison", "moose", "reindeer", "elk", "crab", "lobster", "starfish",
    "seahorse", "pufferfish", "stingray", "hammerhead shark", "manta ray",
    "carp", "salmon", "trout", "goldfish", "gecko", "iguana", "komodo dragon",
    "vulture", "falcon", "macaw", "woodpecker", "swallow", "sparrow", "caterpillar",
    "scorpion", "mantis", "grasshopper", "firefly", "beetle", "earthworm"
]

In [69]:
def generate_sentences(api_key: str, model_name: str, animals, output_path="dataset/dataset.json"):
    """Generate example sentences for each animal with an LLM and save them as JSON.

    One chat-completion request is sent per animal to the NVIDIA
    OpenAI-compatible endpoint, asking for a random number (10 to 14) of
    sentences that mention the animal. The reply is split on newlines and
    blank lines are dropped.

    Args:
        api_key: API key handed to the OpenAI client.
        model_name: Model identifier forwarded to the chat-completions call.
        animals: Iterable of animal names.
        output_path: Destination JSON file. Missing parent directories are
            created before any request is sent.

    Returns:
        A list of ``{"animal": str, "sentences": list[str]}`` dicts, one per
        animal, or ``None`` if any step failed (the error is printed, not
        raised).
    """
    dataset = []
    rng = np.random.default_rng(seed=123)
    try:
        # Create the output directory up front: discovering that it is missing
        # only after every request has completed would discard all results.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=api_key
        )

        for animal in animals:
            # `high` is exclusive, so n is in 10..14
            n = int(rng.integers(low=10, high=15))
            # Each replacement field stays on one line (a field split across
            # lines is a SyntaxError before Python 3.12) and every literal ends
            # with a space so adjacent instructions do not run together.
            prompt = (
                f"Generate {n} realistic, varied, and context-rich sentences "
                f"that include the animal {animal}. "
                "Each sentence should not be similar to the others, and they should all make sense in real-life contexts. "
                "Do not repeat contexts or patterns, and avoid generic statements. "
                "Include specific details to make each sentence unique and realistic. "
                "Do not include comments or other unnecessary information. "
                "Write only sentences without numbering"
            )

            completion = client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=1024,
                stream=False
            )

            # The message content may be None; treat that as "no sentences"
            content = completion.choices[0].message.content or ""
            sentences_for_animal = [
                line.strip() for line in content.split('\n') if line.strip()]
            dataset.append(
                {"animal": animal, "sentences": sentences_for_animal})

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(dataset, f, indent=2)
        print(f"Sentences generated and saved to {output_path}")

        return dataset
    except Exception as e:
        # Best-effort by design: report the failure and signal it with None
        print(f"Error generating sentences: {e}")
        return None

In [70]:
# Generation sends one LLM request per animal and samples at temperature 0.7,
# so every run is slow and yields different sentences. Reuse the saved JSON
# when it exists; delete the file to force regeneration.
DATASET_PATH = "dataset/dataset.json"

if os.path.exists(DATASET_PATH):
    with open(DATASET_PATH) as _fh:
        dataset = json.load(_fh)
else:
    dataset = generate_sentences(api_key=NVIDIA_API_KEY,
                                 model_name="nvidia/llama-3.1-nemotron-70b-instruct",
                                 animals=popular_animals,
                                 output_path=DATASET_PATH)

# generate_sentences prints the error and returns None on failure; stop here
# with a clear message instead of a TypeError on the indexing below.
if dataset is None:
    raise RuntimeError("Sentence generation failed; see the error printed above.")

dataset[0]

Sentences generated and saved to dataset/dataset.json


{'animal': 'dog',
 'sentences': ['The search and rescue team deployed a trained dog named Max to locate the missing hiker in the dense, rain-soaked woods of the Pacific Northwest.',
  'As she walked into the vintage clothing store, a friendly dog named Lola, dressed in a matching polka-dot scarf, wagged its tail behind the counter.',
  'After a long day of filming, the movie star relaxed with her emotional support dog, a calm Poodle named Fifi, in her luxurious trailer on set.',
  "During the physics experiment, the dog's curious nature inadvertently helped students at Springfield High School demonstrate the concept of gravity when it knocked over a ball.",
  'At precisely 6:00 AM, the dog, a sleek German Shepherd named Rocky, began its daily patrol around the perimeter of the secure, government facility.',
  'The new dog park in suburban Chicago, featuring separate areas for large and small breeds, quickly became a hotspot for socializing among dog owners from diverse backgrounds.',
 

In [71]:
def preprocess(dataset):
    """Tokenize each sentence and tag the tokens that name the entry's animal.

    Sentences are split into word and punctuation tokens. Tokens are
    lower-cased and then lemmatized, so capitalised and plural mentions
    ("Dogs") match the lower-case singular animal name ("dog").

    Args:
        dataset: Iterable of dicts with an "animal" name (one or more
            whitespace-separated words) and a list of "sentences".

    Returns:
        A list with one ``{"tokens": [...], "labels": [...]}`` dict per
        sentence. ``tokens`` keeps the original surface forms; ``labels`` has
        the same length with 0 = not part of the animal name, 1 = first token
        of a mention, 2 = subsequent token of a multi-word mention.
    """
    # Download WordNet data
    nltk.download('wordnet')
    nltk.download('omw-1.4')

    ner_data = []
    lemmatizer = WordNetLemmatizer()

    for entry in dataset:
        # Lemmatize the animal name once per entry rather than at every
        # candidate position in every sentence.
        animal_lemmas = [lemmatizer.lemmatize(part)
                         for part in entry["animal"].lower().split()]
        span = len(animal_lemmas)

        for sentence in entry["sentences"]:
            tokens = re.findall(r'\w+|[^\w\s]', sentence, re.UNICODE)

            # Lower-case BEFORE lemmatizing: the WordNet lookup is
            # case-sensitive, so "Dogs" would otherwise stay "Dogs" and never
            # equal "dog".
            lemmas = [lemmatizer.lemmatize(token.lower()) for token in tokens]
            labels = [0] * len(tokens)

            # An empty animal name must match nothing (not every position).
            if span:
                for start in range(len(lemmas) - span + 1):
                    if lemmas[start:start + span] == animal_lemmas:
                        # First token gets 1 (B-tag), the rest 2 (I-tag)
                        labels[start] = 1
                        labels[start + 1:start + span] = [2] * (span - 1)

            ner_data.append({
                "tokens": tokens,
                "labels": labels
            })

    return ner_data

In [72]:
# Token-level labels for every generated sentence, one record per sentence
labeled_ds = preprocess(dataset)

# One row per sentence with its "tokens" and "labels" lists
df = pd.DataFrame(data=labeled_ds)
df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mmosvlad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/mmosvlad/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,tokens,labels
0,"[The, search, and, rescue, team, deployed, a, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,"[As, she, walked, into, the, vintage, clothing...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
2,"[After, a, long, day, of, filming, ,, the, mov...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[During, the, physics, experiment, ,, the, dog...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[At, precisely, 6, :, 00, AM, ,, the, dog, ,, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [78]:
# Number of sentences in which no token was tagged (all labels are 0)
int(df["labels"].map(sum).eq(0).sum())

23

In [79]:
# Keep only the sentences that contain at least one tagged token
filtered_df = df.loc[df["labels"].map(sum).gt(0)]
len(filtered_df)

1211

In [80]:
# Hold out 30 % of the labeled sentences, then halve that hold-out into
# validation and test sets; both splits are shuffled with a fixed seed.
train_df, temp_df = train_test_split(
    filtered_df,
    test_size=0.3,
    shuffle=True,
    random_state=123,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    shuffle=True,
    random_state=123,
)

# Write each split to dataset/<name>.csv without the index column
for _split_name, _split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    _split_df.to_csv(f"dataset/{_split_name}.csv", index=False)