In [1]:
!pip install -q keras_nlp

In [None]:
!pip install -U "huggingface_hub[cli]"

In [1]:
import tensorflow as tf
import keras_nlp  # A Keras-based library for natural language processing tasks.
from tensorflow import keras
# Mixed Precision Training:
# This enables the model to use both 16-bit and 32-bit floating-point types.
# Using float16 for most operations reduces memory usage and speeds up computation,
# while keeping some operations in float32 maintains stability.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# ------------------------------
# Load the Pre-trained Ollama Model
# ------------------------------
print("Loading model (this may take a while)...")
# This command loads a pre-trained language model named Ollama coder from Hugging Face.
# "Causal" means the model generates text in a sequential, left-to-right manner.
Ollama_coder_model = keras_nlp.models.FalconCausalLM.from_preset(
    "hf://keras/falcon_refinedweb_1b_en"
)
print("Model loaded.")

# Display the structure of the model, including layers and number of parameters.
Ollama_coder_model.summary()

# ------------------------------
# Enable LoRA Fine-Tuning
# ------------------------------
# LoRA (Low-Rank Adaptation) is a technique to efficiently fine-tune large models.
# Instead of updating every parameter in the model (which can be millions or billions),
# LoRA adds smaller matrices with a much lower rank (here, rank=2) to approximate the needed adjustments.
# Think of it as fine-tuning by "tweaking" only a few parameters instead of re-writing a whole book.
Ollama_coder_model.backbone.enable_lora(rank=2)
print("Enabled LoRA for efficient fine-tuning with reduced rank.")




Loading model (this may take a while)...
Model loaded.


Enabled LoRA for efficient fine-tuning with reduced rank.


In [3]:
# ------------------------------
# Prepare Training Data
# ------------------------------
# Here, we define a small dataset with pairs of symptoms and corresponding diseases.
# Each string follows the format:
# "Symptom: <list of symptoms>.\nDisease: <disease name>."
# The "\n" is a newline character that separates the symptoms from the disease.
# train_data = [
#     "Symptom: persistent cough, fever, difficulty breathing.\nDisease: Pneumonia.",
#     "Symptom: severe headache, neck stiffness, photophobia.\nDisease: Meningitis.",
#     "Symptom: sudden weakness on one side, slurred speech.\nDisease: Stroke.",
#     "Symptom: increased thirst, frequent urination, unexplained weight loss.\nDisease: Diabetes.",
#     "Symptom: joint pain, prolonged morning stiffness, swelling in multiple joints.\nDisease: Rheumatoid Arthritis."
# ]

# train_data = [
#     "Line Item: Starbucks, $5.67, 2025-02-28, Coffee Shop.\nLabel: Not Fraud.",
#     "Line Item: Unknown Merchant, $1200.00, 2025-02-27, Electronics.\nLabel: Fraud.",
#     "Line Item: Walmart Supercenter, $45.32, 2025-02-26, Groceries.\nLabel: Not Fraud.",
#     "Line Item: Luxury Boutique, $2200.00, 2025-02-28, Designer Clothing.\nLabel: Fraud.",
#     "Line Item: Uber, $18.75, 2025-02-27, Ride Share.\nLabel: Not Fraud."
# ]

train_data = [
    "Question: What is the total quantity (in KG) of all items sold across all companies in the dataset? \nPandas output: df[df['Unit'] == 'KG']['Quantity'].sum()",
    "Question: What is the total value (in VND) of all transactions in the dataset? \nPandas output: (df[df.Currency == 'VND']['Value']).sum()",
    "Question: How many unique companies are represented in the sales data? \nPandas output: df['Company_Name'].nunique()",
    "Question: What is the total value of sales in USD across all transactions? \nPandas output: df[df['Currency'] == 'USD']['Value'].sum()",
    "Question: What is the total quantity of items sold in tons (TAN) across all companies? \nPandas output:df['Quantity'].sum()/1000",
    "Question: What is the total value of sales (in VND) for Công Ty TNHH Tân Thời in February 2019? \nPandas output:df[(df['Company_Name'] == 'Công Ty TNHH Tân Thời') & (df['Billing Date'].dt.month == 2) & (df['Billing Date'].dt.year == 2019)]['Value'].sum()",
    "Question: How many transactions did Công Ty TNHH Sản Xuất Cân Nhơn Hòa record in the dataset? \nPandas output:df[df['Company_Name'] == 'Công Ty TNHH Sản Xuất Cân Nhơn Hòa']['Billing Date'].count()",
    "Question: What is the total quantity (in KG) of items sold by Công Ty TNHH Một Thành Viên Quang Min? \nPandas output:df[(df['Company_Name'] == 'Công Ty TNHH Một Thành Viên Quang Min') & (df['Unit'] == 'KG')]['Quantity'].sum()",
    "Question: What is the highest-value transaction (in VND) for Công Ty TNHH BOSEUNG VINA? \nPandas output: [(df['Company_Name'] == 'Công Ty TNHH Một Thành Viên Quang Min') & (df['Currency'] == 'VND')].max()['Value']",
    "Question: How much did G.I IMPORT EXPORT CO.,LTD spend in USD on transactions dated February 15, 2019?d\nPandas output: f[(df['Company_Name'] == 'G.I IMPORT EXPORT CO.,LTD') & (df['Billing Date'].dt.month == 2) & (df['Currency']== 'USD')]['Value'].sum()`",
    "Question: What is the total quantity (in KG) of the item GI 0.65x61/43xC sold by all companies?\nPandas output: df[df['Item_Details'].str.contains('GI 0.65x61/43xC')]['Quantity'].sum()",
    "Question: What is the average unit price (VND per KG) of the item PO V 3.0x30x30x3005xC?\nPandas output: df[df['Item_Details'].str.startswith('PO V 3.0x30x30x3005xC')]['Value'].sum() / df[df['Item_Details'].str.startswith('PO V 3.0x30x30x3005xC')]['Quantity'].sum()",
    "Question: How many transactions involve the item CR 0.5 x 384 x 734 x 10.000 Tấm?\nPandas output: df[df['Item_Details'] == 'CR 0.5x384x734x10,000Tấm'].shape[0]",
    "Question: What is the total value (in VND) of all sales of Thép Mã Kẽm SGCC 0.65x61/43xC?\nPandas output: df[df['Item_Details'].str.contains('Thép Mã Kẽm SGCC 0.65x61/43xC')]['Value'].sum()",
    "Question: Which company purchased the most SPCC 0.4 XẢ BĂNG by quantity (in KG)?\nPandas output: df[df['Item_Details'].str.contains('SPCC 0.4 XẢ BĂNG')]['Company_Name'].value_counts().idxmax()",
]


# ------------------------------
# Compile the Model
# ------------------------------
# Before training, the model is compiled by specifying:
# - A loss function: Measures how well the model's predictions match the actual labels.
# - An optimizer: Determines how the model's weights are updated during training.
# - Metrics: Additional measurements to judge performance (here, accuracy).
Ollama_coder_model.compile(
    # SparseCategoricalCrossentropy is used when you have multiple classes and your labels are integers.
    # "from_logits=True" indicates that the model's outputs are raw values (logits), not probabilities.
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    # Adam optimizer is chosen for its ability to adjust learning rates during training.
    # It combines ideas from momentum and adaptive learning rate techniques.
    optimizer=keras.optimizers.Adam(learning_rate=5e-5),
    # SparseCategoricalAccuracy computes the percentage of correct predictions.
    metrics=[keras.metrics.SparseCategoricalAccuracy()]
)

# ------------------------------
# Fine-Tune the Model
# ------------------------------
print("Starting fine-tuning...")
# The model is fine-tuned (trained) on the provided training data.
# Fine-tuning adjusts the model's weights to specialize in the new task (mapping symptoms to diseases).
# A batch size of 1 is used, meaning one training sample is processed at a time.
# The training runs for 10 epochs, meaning the model sees the entire dataset 10 times.
Ollama_coder_model.fit(train_data, batch_size=1, epochs=1)
print("Fine-tuning complete.")



Starting fine-tuning...
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 23s/step - loss: 0.1783 - sparse_categorical_accuracy: 0.0242 - weighted_sparse_categorical_accuracy: 0.4137
Fine-tuning complete.


In [4]:
# ------------------------------
# Save the Falcon Model
# ------------------------------
# After training, the model is saved in the recommended .keras format.
# This allows you to reuse the model later without retraining.
Ollama_coder_model.save("Finetuned_Falcon_coder_saleproject.keras")
print("Model saved.")



Model saved.


In [3]:
import tensorflow as tf
import keras_nlp  # A Keras-based library for natural language processing tasks.
from tensorflow import keras

# ------------------------------
# Reload the Model for Inference
# ------------------------------
# The saved model is reloaded for performing inference (generating predictions).
Ollama_coder_model = keras.models.load_model("Finetuned_Falcon_coder_saleproject.keras")
print("Model reloaded for inference.")



  instance.compile_from_config(compile_config)
  saveable.load_own_variables(weights_store.get(inner_path))


Model reloaded for inference.


In [4]:
# ------------------------------
# Set Up a Sampler for Text Generation
# ------------------------------
# When generating text, a sampler helps decide the next token (word or subword).
# GreedySampler always selects the token with the highest probability at each step.
sampler = keras_nlp.samplers.GreedySampler()
# The sampler is integrated into the model for use during inference.
Ollama_coder_model.compile(sampler=sampler)

In [7]:
# Generate an answer for a given healthcare-related symptom prompt
# prompt = "Symptom: sudden weakness on one side, slurred speech.\nDisease:"
# prompt = "Line Item: random merchant, $543.67, 2025-02-31, Retail.\nLabel:"
prompt = "Question:  What is the total value of sales (in VND) on February 27, 2019?\n Pandas output: "
result = Ollama_coder_model.generate(prompt, max_length=1000)
print("Pandas output:")
print(result)

Pandas output:
Question:  What is the total value of sales (in VND) on February 27, 2019?
 Pandas output: 
- The total value of sales (in VND) on February 27, 2019 is 
- The total value of sales (in VND) on February 27, 2019 is 
- The total value of sales (in VND) on February 27, 2019 is 
- The total value of sales (in VND) on February, 2019 is 
- The total value of sales (in VND) on February, 2019 is 
- The total value of sales (in VND) on February, 2019 is 
- The total value of sales (in VND) on February, 2019 is 
- The total value of sales (in VND) on February is 
- The total value of sales (in 2019 is 
- The total sales (in VND) on February is 
- The total sales (in 2019 is 
- The total sales (in VND) on February is 
- The total sales (in 2019 is 
- The total sales (in 2019 is 
- The total sales (in 2019 is 
- The total sales (in 2019 is 
- The total sales (in 2019 is 
- The total sales (in 2019) on February is 
- The total sales (in 2019 is 
- The total sales (in 2019 is 
- sales 