In [None]:
import os
import json
import torch
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
import sys
# For Jupyter notebooks, use the current working directory
notebook_dir = os.getcwd()
sys.path.append(notebook_dir)
# ============================================================
#                        LOAD ENV + LLM
# ============================================================
load_dotenv()
# Token read from .env / the process environment; None when the variable is unset.
HF_API_KEY = os.getenv("HUGGINGFACE_API_KEY")

MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# The endpoint client does not look up HUGGINGFACE_API_KEY on its own, so the
# token loaded above has to be handed over explicitly. When it is unset the
# argument is omitted so the library's own environment lookup still applies.
_hf_auth_kwargs = {"huggingfacehub_api_token": HF_API_KEY} if HF_API_KEY else {}

# Remote text-generation endpoint; low temperature keeps reports near-deterministic.
hf_llm = HuggingFaceEndpoint(
    repo_id=MODEL_NAME,
    task="text-generation",
    max_new_tokens=1024,
    temperature=0.1,
    top_p=0.9,
    **_hf_auth_kwargs,
)

# Chat wrapper around the endpoint; this is the object the report generator invokes.
chatmodel = ChatHuggingFace(
    llm=hf_llm,
    temperature=0.1
)

# ============================================================
#                  LOAD PROMPT TEMPLATE FROM JSON
# ============================================================
def load_prompt_template(json_path: str, key: str) -> PromptTemplate:
    """Build the crop-report ``PromptTemplate`` from a JSON file.

    Args:
        json_path: Path to a JSON file whose top level is an object mapping
            prompt names to template strings.
        key: Name of the template to load from that object.

    Returns:
        A ``PromptTemplate`` wrapping the selected template string and
        declaring the ten crop-report input variables.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If ``key`` is not present in the file.
    """
    # Read as UTF-8 explicitly: the platform default encoding is
    # locale-dependent and can mis-decode non-ASCII prompt text.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    try:
        template_text = data[key]
    except KeyError:
        raise KeyError(
            f"Prompt key {key!r} not found in {json_path}"
        ) from None

    return PromptTemplate(
        template=template_text,
        input_variables=[
            "location_state",
            "location_district",
            "recommended_crop",
            "irrigation_type",
            "soil_nitrogen",
            "soil_phosphorus",
            "soil_potassium",
            "soil_ph",
            "weather_forecast",
            "weather_summary",
        ],
    )

# ============================================================
#                       TEXT GENERATION
# ============================================================
def generate_crop_report(
    location_state,
    location_district,
    recommended_crop,
    irrigation_type,
    soil_nitrogen,
    soil_phosphorus,
    soil_potassium,
    soil_ph,
    weather_forecast,
    weather_summary="",
    *,
    prompt_path="/data1/home/anumalas/GENAI-PROJECT/UI/Language-translation/prompt_template.json",
    prompt_key="crop_report_prompt",
):
    """Generate the English crop report with the chat LLM.

    The ten agronomic inputs are substituted into the prompt template loaded
    from ``prompt_path`` and the filled prompt is sent to ``chatmodel``.

    Args:
        location_state: State name inserted into the prompt.
        location_district: District name inserted into the prompt.
        recommended_crop: Crop the report should be written about.
        irrigation_type: Irrigation method description.
        soil_nitrogen: Soil nitrogen reading (units are whatever the
            template expects; not checked here).
        soil_phosphorus: Soil phosphorus reading.
        soil_potassium: Soil potassium reading.
        soil_ph: Soil pH reading.
        weather_forecast: Forecast data; it is interpolated into the prompt
            via the template's formatting, so any printable object works.
        weather_summary: Optional free-text weather summary.
        prompt_path: JSON file holding the prompt templates. Defaults to the
            original deployment path; override it on other machines.
        prompt_key: Name of the template inside ``prompt_path``.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    prompt_template = load_prompt_template(prompt_path, prompt_key)

    final_prompt = prompt_template.format(
        location_state=location_state,
        location_district=location_district,
        recommended_crop=recommended_crop,
        irrigation_type=irrigation_type,
        soil_nitrogen=soil_nitrogen,
        soil_phosphorus=soil_phosphorus,
        soil_potassium=soil_potassium,
        soil_ph=soil_ph,
        weather_forecast=weather_forecast,
        weather_summary=weather_summary,
    )

    result = chatmodel.invoke(final_prompt)
    return result.content.strip()


# ============================================================
#                       BHASHINI SETUP
# ============================================================
# Translation checkpoint loaded from the Hugging Face Hub; the repo name
# indicates an English -> Indic IndicTrans2 model.
model_name = "ai4bharat/indictrans2-en-indic-1B"
# NOTE: trust_remote_code=True runs the tokenizer/model code shipped in the
# Hub repository, so the repo must be trusted.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype="auto")

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Transliteration table used by translate_with_bhashini.
# Key:   target language tag as passed in `tgt_lang` (e.g. "tel_Telu").
# Value: (source_code, target_code) pair handed to
#        UnicodeIndicTransliterator.transliterate for the model output.
# Most entries convert from "hi" (Devanagari) to the target script.
# NOTE(review): several entries look questionable and should be confirmed
# against the transliterator's supported codes:
#   - "hin_Deva" and "pan_Guru" use "en" as the source code, unlike the rest;
#   - "gom_Deva" maps to "ks" and "snd_Deva" maps to "sa";
#   - "kas_Arab" / "snd_Arab" reuse "ur".
lang_script_map = {
    "hin_Deva": ("en","hi"),
    "tam_Taml": ("hi","ta"),
    "tel_Telu": ("hi","te"),
    "kan_Knda": ("hi","kn"),
    "mal_Mlym": ("hi","ml"),
    "ben_Beng": ("hi","bn"),
    "pan_Guru": ("en","pa"),
    "mar_Deva": ("hi","mr"),
    "guj_Gujr": ("hi","gu"),
    "ory_Orya": ("hi","or"),
    "asm_Beng": ("hi","as"),
    "san_Deva": ("hi","sa"),
    "npi_Deva": ("hi","ne"),
    "gom_Deva": ("hi","ks"),
    "kas_Arab": ("hi","ur"),
    "kas_Deva": ("hi","ks"),
    "snd_Arab": ("hi","ur"),
    "snd_Deva": ("hi","sa"),
    "urd_Arab": ("hi","ur")
}

from transformers import AutoTokenizer
import textwrap

def chunk_text(text, tokenizer, src_lang="eng_Latn", tgt_lang="hin_Deva", max_chunk_tokens=400):
    """Split ``text`` into word-aligned chunks that fit a token budget.

    Words are added greedily to the current chunk. Each candidate chunk is
    measured in the same ``"<src_lang> <tgt_lang> <chunk>"`` form that is fed
    to the translation model, so the language tags and special tokens count
    towards the budget.

    Args:
        text: Input text. It is split on whitespace, so line breaks and
            repeated spaces are not preserved in the chunks.
        tokenizer: Callable tokenizer; called with ``return_tensors="pt"``
            and expected to return a mapping whose ``"input_ids"`` has shape
            ``(1, n_tokens)``.
        src_lang: Source language tag prefixed to each candidate chunk.
        tgt_lang: Target language tag prefixed to each candidate chunk.
        max_chunk_tokens: Maximum token count allowed per chunk.

    Returns:
        List of non-empty chunk strings in original word order. A single
        word that exceeds the budget on its own is emitted as its own chunk
        rather than dropped. Empty or whitespace-only text yields ``[]``.
    """

    def _token_count(words):
        # Measure exactly what the model will see: tags + chunk text.
        wrapped = f"{src_lang} {tgt_lang} {' '.join(words)}"
        encoded = tokenizer(
            wrapped,
            return_tensors="pt",
            truncation=False,
            add_special_tokens=True,
        )
        return encoded["input_ids"].shape[1]

    chunks = []
    current = []

    for word in text.split():
        candidate = current + [word]
        # Only flush when there is a previous chunk to flush; an oversized
        # first word must not produce an empty chunk.
        if current and _token_count(candidate) > max_chunk_tokens:
            chunks.append(" ".join(current))
            current = [word]
        else:
            current = candidate

    if current:
        chunks.append(" ".join(current))

    return chunks


def translate_with_bhashini(text: str, src_lang="eng_Latn", tgt_lang="hin_Deva"):
    """Translate ``text`` chunk by chunk with the module-level translation model.

    The text is split into chunks of at most 400 tokens, each chunk is
    prefixed with the language tags and decoded greedily (no sampling, one
    beam), and the decoded text is passed through the script transliterator
    when ``tgt_lang`` has an entry in ``lang_script_map``.

    Args:
        text: Text to translate.
        src_lang: Source language tag, e.g. ``"eng_Latn"``.
        tgt_lang: Target language tag, e.g. ``"hin_Deva"``.

    Returns:
        The translated chunks joined with newlines; an empty string when
        ``text`` contains no words.
    """
    # Step 1: split into chunks. The language tags must be forwarded so each
    # chunk is measured in the same tagged form that is sent to the model.
    chunks = chunk_text(
        text,
        tokenizer,
        src_lang=src_lang,
        tgt_lang=tgt_lang,
        max_chunk_tokens=400,
    )

    # Decoding settings are the same for every chunk, so build them once.
    gen_cfg = GenerationConfig(
        use_cache=False,
        do_sample=False,
        max_new_tokens=1024,
        num_beams=1,
    )

    # None when the target language needs no script transliteration.
    script_codes = lang_script_map.get(tgt_lang)

    # Step 2: translate each chunk independently.
    translated_chunks = []
    for chunk in chunks:
        if not chunk.strip():
            # Nothing to translate; avoid sending a tags-only input.
            continue

        inp = f"{src_lang} {tgt_lang} {chunk}"
        inputs = tokenizer(inp, return_tensors="pt").to(device)

        with torch.no_grad():
            output_ids = model.generate(**inputs, generation_config=gen_cfg)

        translated = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Script transliteration if needed
        if script_codes is not None:
            src_code, tgt_code = script_codes
            translated = UnicodeIndicTransliterator.transliterate(
                translated, src_code, tgt_code
            )

        translated_chunks.append(translated)

    # Step 3: merge output
    return "\n".join(translated_chunks)


# ============================================================
#                   FINAL ORCHESTRATION FUNCTION
# ============================================================
def generate_final_output(
    target_language_code,
    **crop_inputs,
):
    """
    Produce the crop report in the requested language.

    The report is always written in English first by the LLM. When
    ``target_language_code`` is "eng_Latn" that text is returned unchanged;
    for any other code it is handed to the Bhashini translation step.
    """
    report_en = generate_crop_report(**crop_inputs)

    wants_translation = target_language_code != "eng_Latn"
    if wants_translation:
        # Non-English target: run the English report through the translator.
        return translate_with_bhashini(
            text=report_en,
            src_lang="eng_Latn",
            tgt_lang=target_language_code,
        )

    # English target: nothing to translate.
    return report_en


# ============================================================
#                          USAGE EXAMPLE
# ============================================================
if __name__ == "__main__":

    # Five-day forecast as (temp_max, temp_min, rainfall, humidity) rows,
    # expanded below into the "day1".."day5" mapping the report expects.
    forecast_rows = [
        (25, 12, 0, 65),
        (26, 13, 0, 60),
        (24, 11, 5, 70),
        (23, 10, 15, 75),
        (22, 10, 10, 72),
    ]

    sample_data = dict(
        location_state="Punjab",
        location_district="Ludhiana",
        recommended_crop="Wheat",
        irrigation_type="Drip Irrigation",
        soil_nitrogen=240.5,
        soil_phosphorus=45.2,
        soil_potassium=180.0,
        soil_ph=7.2,
        weather_forecast={
            f"day{day_no}": {
                "temp_max": t_max,
                "temp_min": t_min,
                "rainfall": rain,
                "humidity": humidity,
            }
            for day_no, (t_max, t_min, rain, humidity) in enumerate(forecast_rows, start=1)
        },
        weather_summary="Moderate temperatures with occasional rainfall expected.",
    )

    # Pick any supported tag here: hin_Deva, tel_Telu, tam_Taml, pan_Guru, etc.
    # Use "eng_Latn" to get the English report without translation.
    target_lang = "hin_Deva"        # Hindi (example)

    final_output = generate_final_output(target_lang, **sample_data)

    print("\n=========== FINAL OUTPUT ===========\n")
    print(final_output)
    print("\n====================================\n")


  from .autonotebook import tqdm as notebook_tqdm
2025-12-02 20:06:38.718809: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-02 20:06:38.729988: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764686198.743020 3471568 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764686198.747139 3471568 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764686198.757348 3471568 computation_placer.cc:177] computation placer already r

ValueError: not enough values to unpack (expected 3, got 1)