In [None]:
# Cell 2: Import libraries and load dataset

import tensorflow as tf
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

# Fetch the agriculture Q&A corpus from the Hugging Face Hub; only the
# 'train' split is requested.
dataset = load_dataset(
    "KisanVaani/agriculture-qa-english-only",
    split='train',
)

# Sanity check: corpus size, then the first raw record.
sample_count = len(dataset)
first_record = dataset[0]
print(f"Total samples in dataset: {sample_count}")
print(first_record)


In [None]:
# Cell 3: Prepare train and validation splits using datasets library

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_data, val_data = split_dataset['train'], split_dataset['test']

n_train, n_val = len(train_data), len(val_data)
print(f"Training samples: {n_train}")
print(f"Validation samples: {n_val}")


In [None]:
# Cell 4: Load tokenizer and define preprocessing

# Token budgets for the encoder prompt and the decoder target.
MAX_LEN_INPUT = 64
MAX_LEN_OUTPUT = 64

# Checkpoint identifier; the same name is used again when the model is loaded.
MODEL_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
    """Build teacher-forcing decoder inputs by shifting target ids one step right.

    Args:
        input_ids: 2-D integer array-like of shape (batch, seq_len) holding the
            target token ids. Entries equal to -100 (loss ignore index) are
            accepted.
        pad_token_id: id written wherever the shifted sequence still holds -100.
        decoder_start_token_id: id placed in the first column of every row.

    Returns:
        Nested Python list with the same shape as ``input_ids``.
    """
    input_ids = np.asarray(input_ids)
    shifted_input_ids = np.zeros_like(input_ids)
    if input_ids.size == 0:
        # Nothing to shift (empty batch or zero-length sequences).
        return shifted_input_ids.tolist()

    shifted_input_ids[:, 0] = decoder_start_token_id
    shifted_input_ids[:, 1:] = input_ids[:, :-1]
    # Only ignore-index placeholders need replacing, and the check must look at
    # the *shifted* ids: masking on the unshifted ids would overwrite the EOS
    # token that now sits on the first padding position.
    shifted_input_ids[shifted_input_ids == -100] = pad_token_id
    return shifted_input_ids.tolist()

def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs into T5 training features.

    Args:
        examples: batch mapping with a "question" column and an "answers"
            column (column names assumed for this dataset — verify against
            the dataset schema).

    Returns:
        The tokenizer output for the prompts, extended with:
          * "labels": target ids with every pad id replaced by -100;
          * "decoder_input_ids": output of ``shift_tokens_right`` on the
            target ids, using the pad id as the decoder start token.
    """
    questions = examples["question"]
    answers = examples["answers"]
    pad_id = tokenizer.pad_token_id

    # Encoder side: every question carries the "question: " task prefix.
    prompts = ["question: " + text for text in questions]
    model_inputs = tokenizer(
        prompts,
        max_length=MAX_LEN_INPUT,
        truncation=True,
        padding="max_length",
    )

    # Decoder side: reference answers, padded/truncated to a fixed length.
    target_encoding = tokenizer(
        answers,
        max_length=MAX_LEN_OUTPUT,
        truncation=True,
        padding="max_length",
    )
    target_ids = target_encoding["input_ids"]

    # -100 marks padding positions so the loss can leave them out.
    model_inputs["labels"] = [
        [-100 if token == pad_id else token for token in row]
        for row in target_ids
    ]

    model_inputs["decoder_input_ids"] = shift_tokens_right(
        np.array(target_ids),
        pad_token_id=pad_id,
        decoder_start_token_id=pad_id,
    )
    return model_inputs

# Tokenize both splits batch-wise.
train_dataset = train_data.map(preprocess_function, batched=True)
val_dataset = val_data.map(preprocess_function, batched=True)

# Restrict each split to the model-facing columns, in TensorFlow format.
for tokenized_split in (train_dataset, val_dataset):
    tokenized_split.set_format(
        type="tensorflow",
        columns=['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'],
    )


In [None]:
# Cell 5: Convert Hugging Face datasets to tf.data.Dataset, including decoder_input_ids

def to_tf_dataset(hf_dataset):
    """Wrap a tokenized Hugging Face dataset as a ``tf.data.Dataset``.

    Args:
        hf_dataset: dataset exposing "input_ids", "attention_mask",
            "decoder_input_ids" and "labels" columns.

    Returns:
        Dataset of ``(features, labels)`` pairs, where ``features`` is a dict
        keyed by "input_ids", "attention_mask" and "decoder_input_ids".
    """
    feature_names = ("input_ids", "attention_mask", "decoder_input_ids")
    # Materialise every column as a NumPy array before slicing into examples.
    feature_arrays = {name: np.array(hf_dataset[name]) for name in feature_names}
    label_array = np.array(hf_dataset["labels"])
    return tf.data.Dataset.from_tensor_slices((feature_arrays, label_array))

batch_size = 8  # Adjust this based on your CPU resources

# A shuffle buffer smaller than the dataset only mixes neighbouring examples,
# so size it to the whole training split to reshuffle uniformly each epoch.
# max(..., 1) keeps the buffer size valid even for an empty split.
shuffle_buffer_size = max(len(train_dataset), 1)

tf_train_dataset = (
    to_tf_dataset(train_dataset)
    .shuffle(shuffle_buffer_size)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)  # overlap input preparation with training steps
)
tf_val_dataset = (
    to_tf_dataset(val_dataset)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

print("Training dataset batches:", tf_train_dataset)
print("Validation dataset batches:", tf_val_dataset)


In [None]:
# Cell 6: Load model and compile

# Start from the pretrained checkpoint named by MODEL_NAME.
# NOTE(review): from_pt=True presumably makes transformers load the PyTorch
# weights and convert them — confirm PyTorch is available in the runtime.
model = TFT5ForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    from_pt=True,
)

optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-4,
)

def masked_loss(y_true, y_pred):
    """Token-level cross-entropy averaged over the non-ignored label positions.

    Args:
        y_true: integer label ids; positions equal to -100 are excluded.
        y_pred: unnormalised logits (the loss is computed with
            ``from_logits=True``).

    Returns:
        Scalar tensor: summed loss over kept positions divided by their count,
        or 0 when every position is ignored (instead of NaN from 0/0).
    """
    keep = tf.not_equal(y_true, -100)
    # sparse_categorical_crossentropy needs valid class ids, so ignored slots
    # get the pad id as a placeholder; their loss is zeroed by the mask below.
    pad_token_id = tf.constant(tokenizer.pad_token_id, dtype=y_true.dtype)
    y_true_safe = tf.where(keep, y_true, pad_token_id)
    loss_ = tf.keras.losses.sparse_categorical_crossentropy(y_true_safe, y_pred, from_logits=True)
    mask = tf.cast(keep, dtype=loss_.dtype)
    return tf.math.divide_no_nan(tf.reduce_sum(loss_ * mask), tf.reduce_sum(mask))


# Attach the -100-aware loss and the optimizer before training.
model.compile(
    loss=masked_loss,
    optimizer=optimizer,
)


In [None]:
# Cell 7: Train model

epochs = 3

history = model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    epochs=epochs,
)

# Per-epoch loss curves: training against validation.
loss_curves = history.history
fig, ax = plt.subplots()
ax.plot(loss_curves['loss'], label='train loss')
ax.plot(loss_curves['val_loss'], label='val loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
plt.show()


In [None]:
# Cell 8: Save model and tokenizer

# Model and tokenizer go into the same folder; app.py loads both from this path.
export_dir = "./agri_chatbot_t5"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:
# Cell 9: Define inference function and test

def generate_answer(question, max_length=64):
    """Generate an answer for one agriculture question using beam search.

    Args:
        question: question text; the "question: " task prefix is added here.
        max_length: maximum length forwarded to ``model.generate``.

    Returns:
        The decoded answer string with special tokens removed.
    """
    input_text = "question: " + question
    # Call the tokenizer directly (rather than .encode) so the attention mask
    # comes back with the ids; the prompt is padded to MAX_LEN_INPUT, and the
    # mask tells the model explicitly which positions are padding.
    encoded = tokenizer(
        input_text,
        return_tensors="tf",
        max_length=MAX_LEN_INPUT,
        truncation=True,
        padding="max_length",
    )
    outputs = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_beams=5,
        early_stopping=True,
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Smoke test: one question through the fine-tuned model.
test_question = "what is organic farming?"
print("Q:", test_question)
test_answer = generate_answer(test_question)
print("A:", test_answer)


In [None]:
%%writefile app.py
import streamlit as st
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

# Folder holding the fine-tuned model and tokenizer files.
MODEL_PATH = "./agri_chatbot_t5"

@st.cache_resource
def load_model():
    """Load the fine-tuned tokenizer and TensorFlow T5 model from MODEL_PATH.

    Decorated with ``st.cache_resource`` so the loaded objects can be reused
    instead of being rebuilt on every script rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH)
    # The folder is produced by the TensorFlow model's save_pretrained, so it
    # contains native TF weights. from_pt=True would direct transformers to a
    # PyTorch checkpoint that is not there, so it is intentionally not passed.
    model = TFT5ForConditionalGeneration.from_pretrained(MODEL_PATH)
    return tokenizer, model

tokenizer, model = load_model()

def generate_answer(question, max_length=64):
    """Generate an answer for one user question using beam search.

    Args:
        question: question text; the "question: " task prefix is added here.
        max_length: maximum length forwarded to ``model.generate``.

    Returns:
        The decoded answer string with special tokens removed.
    """
    input_text = "question: " + question
    # Call the tokenizer directly (rather than .encode) so the attention mask
    # comes back with the ids; the prompt is padded to 64 tokens, and the mask
    # tells the model explicitly which positions are padding.
    encoded = tokenizer(
        input_text,
        return_tensors="tf",
        truncation=True,
        padding="max_length",
        max_length=64,
    )
    outputs = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_beams=5,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# --- Page layout -----------------------------------------------------------
st.title("🌱 Agriculture Chatbot - T5")
st.write("Ask any question about agriculture and get accurate responses!")

user_input = st.text_input("Enter your question:")
ask_clicked = st.button("Get Answer")

# Respond only to a button press; whitespace-only input gets a warning.
if ask_clicked and not user_input.strip():
    st.warning("Please enter a valid question.")
elif ask_clicked:
    answer = generate_answer(user_input)
    st.success(f"Answer: {answer}")


In [None]:
# Extra spot-check of generate_answer with a second question.
question = "what is fertilizers?"
answer = generate_answer(question)
for tag, text in (("Q:", question), ("A:", answer)):
    print(tag, text)


In [None]:
import getpass
# Ask for the token interactively so it is not hard-coded in the notebook.
token = getpass.getpass('Enter your GitHub Personal Access Token: ')

# NOTE(review): the token is embedded in the clone URL; git presumably records
# that URL as the clone's remote, leaving the token in plain text on disk —
# confirm, and consider revoking the token after the push.
repo_url = f"https://{token}@github.com/nellyiya/CHATBOT.git"

# Global commit identity used by the git commits made from this notebook.
!git config --global user.email "n.iyabikoze@alustudent.com"
!git config --global user.name "nellyiya"

# Clone the repository and make it the working directory for the cells below.
# NOTE(review): %cd changes the working directory, so re-running this cell
# would presumably clone again relative to the new location — confirm.
!git clone {repo_url}
%cd CHATBOT


In [None]:
# Initialise Git LFS for this environment.
# NOTE(review): `git lfs install` sets up LFS rather than installing the
# git-lfs binary itself; this presumably requires git-lfs to be present already — confirm.
!git lfs install

# Route the model weight files (.h5 / .bin) in the model folder through LFS,
# then commit the resulting .gitattributes change on its own.
!git lfs track "agri_chatbot_t5/*.h5"
!git lfs track "agri_chatbot_t5/*.bin"
!git add .gitattributes
!git commit -m "Add git-lfs tracking for large model files"


In [None]:
# List the current directory to see what is present before artifacts are copied in.
!ls


In [None]:

# Copy the exported model folder and the Streamlit script into the current directory.
# NOTE(review): /content is a hard-coded absolute path (looks like a Colab
# workspace) — confirm it matches where the earlier cells wrote their output.
!cp -r /content/agri_chatbot_t5 .
!cp /content/app.py .


In [None]:
!git add .
!git commit -m ", model (LFS), and Streamlit UI"
!git push origin main
