In [5]:
import json
import numpy as np
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


In [7]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("aashay96/indic-gpt")
model = AutoModelForCausalLM.from_pretrained("aashay96/indic-gpt")


In [8]:
verbs = pd.DataFrame({"word": ["खेलना", "हँसना", "सुनना"]})
nouns = pd.DataFrame({"word": ["बच्चा", "पेड़", "पानी"]})
adjectives = pd.DataFrame({"word": ["खुश", "सुंदर", "बहादुर"]})
endings = ["सुखद", "शांत", "मित्रता"]


In [9]:
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [10]:
verbs = pd.DataFrame({"word": ["खेलना", "हँसना", "सुनना"]})
nouns = pd.DataFrame({"word": ["बच्चा", "पेड़", "पानी"]})
adjectives = pd.DataFrame({"word": ["खुश", "सुंदर", "बहादुर"]})
endings = ["सुखद", "शांत", "मित्रता"]


In [11]:
def generate_prompt():
    verb = verbs.sample(n=1).iloc[0, 0]
    noun = nouns.sample(n=1).iloc[0, 0]
    adjective = adjectives.sample(n=1).iloc[0, 0]
    ending = np.random.choice(endings)
    word_choice = np.random.choice(["verb", "noun", "adjective"])
    word = verb if word_choice == "verb" else noun if word_choice == "noun" else adjective

    prompt_text = (
        f"एक सरल हिंदी बाल कथा लिखिए जो 5-6 पंक्तियों में हो, पंचतंत्र जैसी हो "
        f"और आसान भाषा में हो। कहानी में एक शुरुआत, बीच और अंत होना चाहिए "
        f"जिसमें {word_choice} '{word}' को स्वाभाविक रूप से उपयोग किया गया हो। "
        f"कहानी का अंत एक {ending} भावना के साथ होना चाहिए।"
    )
    return {"instruction": prompt_text, "word": word, "ending": ending}



In [15]:
def store_dataset(n):
    data = {"output": [], "input": [], "instruction": []}
    for _ in range(n):
        prompt = generate_prompt()
        prompt_text = prompt['instruction']
        
        # Generate story using IndicGPT
        response = generator(prompt_text, max_length=1000, temperature=0.2, top_p=0.8)[0]['generated_text']
        
        # Collect results
        input_words = [prompt['word'], prompt['ending']]
        data["output"].append(response)
        data["input"].append(input_words)
        data["instruction"].append(prompt_text)
        
        # Save periodically to JSON file
        with open('hindi_stories1.json', 'w', encoding='utf-8') as json_file:
            json.dump({"train": data}, json_file, ensure_ascii=False, indent=4)



In [16]:
store_dataset(5)