In [10]:
import sys
# print("Python Executable:", sys.executable)
# print("Python Version:", sys.version)

from tools.ml_tools import analyze_user_tool, format_allocation_tool
# from llm_models.llama3_wrapper import get_llama3_llm, get_mistral_llm, get_kimi
from prompt import prompt
from langchain.chains import LLMChain
from langchain.agents import AgentExecutor
from langchain.agents import create_structured_chat_agent

from langchain.schema.output_parser import OutputParserException

import traceback

# Import the necessary callback class
import json
from langchain_core.callbacks import BaseCallbackHandler
from langchain_core.agents import AgentAction, AgentFinish # For type hinting
from uuid import UUID
from typing import Any, Dict, List
from langchain_core.outputs import LLMResult

import torch
from transformers import BitsAndBytesConfig
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


In [13]:
def get_kimi():
    """
    Load moonshotai/Kimi-K2-Instruct on an NVIDIA A2 (SM 8.6) without FP8.
    Priority: 4-bit NF4 (bitsandbytes). Fallback: fp16.
    """
    import torch
    from transformers import (
        AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig, pipeline
    )
    from langchain_community.llms import HuggingFacePipeline

    model_id = "moonshotai/Kimi-K2-Instruct"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        print(f"CUDA version: {torch.version.cuda}")
        print(f"GPU Name: {torch.cuda.get_device_name(0)}")

    # 1) Load and sanitize config: remove any embedded FP8 quantization_config
    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
    if getattr(config, "quantization_config", None) is not None:
        # Avoid HF's FP8 validator on SM 8.6 devices
        config.quantization_config = None

    # 2) Preferred quantization for A2: 4-bit NF4 with fp16 compute
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,   # A2: fp16 compute is best
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    def _load_model_4bit():
        return AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            config=config,                      # uses our sanitized config
            quantization_config=bnb_config,     # forces bnb path (no FP8)
            device_map="auto",
            low_cpu_mem_usage=True,
            attn_implementation="sdpa",         # often fastest on recent PyTorch
        )

    def _load_model_fp16():
        return AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            config=config,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
            attn_implementation="sdpa",
        )

    # 3) Try 4-bit first; if that fails (e.g., bnb/CUDA mismatch), fallback to fp16
    try:
        model = _load_model_4bit()
    except Exception as e:
        print(f"[WARN] 4-bit load failed ({e}). Falling back to fp16...")
        model = _load_model_fp16()

    # 4) Text-generation pipeline
    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
        repetition_penalty=1.1,
        # You can add eos_token_id if your app uses a custom stop token
        # eos_token_id=tokenizer.eos_token_id,
    )

    llm = HuggingFacePipeline(pipeline=pipe)
    return llm


In [14]:
llm = get_kimi()

CUDA version: 12.1
GPU Name: NVIDIA A2


A new version of the following files was downloaded from https://huggingface.co/moonshotai/Kimi-K2-Instruct:
- configuration_deepseek.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
You are using a model of type kimi_k2 to instantiate a model of type deepseek_v3. This is not supported for all configurations of models and can yield errors.
A new version of the following files was downloaded from https://huggingface.co/moonshotai/Kimi-K2-Instruct:
- tokenization_kimi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/moonshotai/Kimi-K2-Instruct:
- modeling_deepseek.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a 

[WARN] 4-bit load failed ('NoneType' object has no attribute 'get'). Falling back to fp16...


AttributeError: 'NoneType' object has no attribute 'get'

In [3]:
class CleanAgentStepCallbackHandler(BaseCallbackHandler):
    def on_llm_start(self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any) -> None:
        """Run when LLM starts running."""
        print("\n--- 💡 LLM START ---")
        print(f"--- Prompts sent to LLM ({len(prompts)} total):")
        for i, prompt_text in enumerate(prompts):
            # For ChatPromptTemplate, prompts[0] might be a string representation of all messages
            # For direct message lists, it's simpler.
            # print(f"--- Prompt {i+1}:\n{prompt_text}")
            # A more robust print for ChatPromptTemplate structure (if it passes a list of BaseMessages as prompts)
            if isinstance(prompt_text, str):
                print(f"--- Prompt {i+1} (Raw String):\n{prompt_text}")
            else: # Assuming it might be a list of BaseMessage objects for chat models
                print(f"--- Prompt {i+1} (Messages):")
                for msg in prompt_text:
                    print(f"    - {msg.type.upper()}: {msg.content}")
        print("--- END LLM START ---")

    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
        """Run when LLM ends running."""
        print("\n--- ⚡ LLM END ---")
        print(f"--- Raw LLM Response:")
        if response.generations and response.generations[0] and response.generations[0][0]:
            print(response.generations[0][0].text)
        else:
            print("No text generation found in response.")
        print("--- END LLM END ---")

    def on_agent_action(self, action: AgentAction, **kwargs) -> None:
        """Run on agent action."""
        print("==========================================================================")
        print(type(action), action)
        print(f"\n---Thought: {action.log.strip().split('Action:')[0].replace('Thought:', '').strip()}")
        print(f"---Action: {action.tool}")
        try:
            # Attempt to parse action_input as JSON for cleaner display
            action_input_json = json.dumps(action.tool_input, indent=2)
            print(f"---Action Input:\n{action_input_json}\n ====action_input_type{type(action.tool_input)}")
        except TypeError:
            print(f"---Action Input: {action.tool_input}")

    def on_tool_start(self, serialized: dict, input_str: str, run_id: UUID, **kwargs) -> None:
        """Run on tool start."""
        print("which toollll=====",{serialized},"runID==========",{run_id})
        print(f"--- 📊 Start input for tool:\n{input_str}") # Print raw output for clarity

    def on_tool_end(self, output: str, **kwargs) -> None:
        """Run on tool end."""
        print(f"--- 📊 Observation:\n{output}") # Print raw output for clarity

    def on_agent_finish(self, finish: AgentFinish, **kwargs) -> None:
        """Run on agent finish."""
        print("\n--- ✅ Agent Finished ---")
        print(f"--- 🏁 Final Answer:\n{finish.return_values['output']}")
        print("-------------------------\n")

In [2]:
llm = get_kimi()

CUDA version: 12.1
GPU Name: NVIDIA A2


You are using a model of type kimi_k2 to instantiate a model of type deepseek_v3. This is not supported for all configurations of models and can yield errors.


ValueError: FP8 quantized models is only supported on GPUs with compute capability >= 8.9 (e.g 4090/H100), actual = `8.6`

In [6]:
tools = [
    analyze_user_tool,
    format_allocation_tool
]

In [7]:
agent = create_structured_chat_agent(
    llm = llm,
    tools = tools,
    prompt = prompt,
    stop_sequence=["\nObservation:"]
)

    

In [79]:
type(llm)
from langchain_huggingface import ChatHuggingFace
chat_model = ChatHuggingFace(llm=llm)

In [65]:
from langchain_core.prompts import PromptTemplate
template = '''Answer the following questions as best you can. You have access to the following tools:
{tools}
Use the following format:
Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin!
Question: {input}
Thought:{agent_scratchpad}'''

prompt2 = PromptTemplate.from_template(template)

In [97]:
from langgraph.prebuilt import ToolNode
from langchain_core.tools import tool

def analyze_user_tool(surplus: int, risk_profile: str):
    """
    Analyze user's financial situation and suggest asset allocation.
    """

    if risk_profile.lower() == "high":
        allocation = {"equity": 70, "debt": 20, "gold": 10}
    elif risk_profile.lower() == "moderate":
        allocation = {"equity": 50, "debt": 40, "gold": 10}
    else:
        allocation = {"equity": 30, "debt": 60, "gold": 10}

    result_dict = {"surplus": surplus, "allocation": allocation}
    print(f"Returning this from custom AUT======{type(result_dict)}")
    return result_dict

tool_call = {
    "type": "tool_call",
    "id": "1",
    "args": {"surplus":60000, "risk_profile":"moderate"}
}

In [98]:
tool_node = ToolNode([analyze_user_tool])

In [99]:
from langchain.chat_models import init_chat_model
model = init_chat_model(model="claude-3-5-haiku-latest")
model_with_tools = model.bind_tools([analyze_user_tool])

In [100]:
# model_with_tools = chat_model.bind_tools([analyze_user_tool])
# response_message = model_with_tools.invoke("I have ₹60,000 and moderate risk. How should I allocate my money?")

# tool_node.invoke({"messages" : [response_message]})

In [101]:
from langgraph.graph import StateGraph, MessagesState, START, END
def should_continue(state: MessagesState):
    messages = state["messages"]
    last_message = messages[-1]
    if last_message.tool_calls:
        return "tools"
    return END

def call_model(state: MessagesState):
    messages = state["messages"]
    response = model_with_tools.invoke(messages)
    return {"messages": [response]}

builder = StateGraph(MessagesState)

# Define the two nodes we will cycle between
builder.add_node("call_model", call_model)
builder.add_node("tools", tool_node)

builder.add_edge(START, "call_model")
builder.add_conditional_edges("call_model", should_continue, ["tools", END])
builder.add_edge("tools", "call_model")

graph = builder.compile()

graph.invoke({"messages": [{"role": "user", "content": "I have ₹60,000 and moderate risk. How should I allocate my money?"}]})

TypeError: "Could not resolve authentication method. Expected either api_key or auth_token to be set. Or for one of the `X-Api-Key` or `Authorization` headers to be explicitly omitted"

In [78]:
from langgraph.prebuilt import create_react_agent
agent2 = create_react_agent(
    model = chat_model,
    tools = [analyze_user_tool]
)
agent2.invoke({"messages": [{"role": "user", "content": "I have ₹60,000 and moderate risk. How should I allocate my money?"}]})

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{'messages': [HumanMessage(content='I have ₹60,000 and moderate risk. What should I do?', additional_kwargs={}, response_metadata={}, id='edcec2dd-e8cc-4e4a-94a5-4c2699fa2dfa'),
  AIMessage(content="<s> [INST] I have ₹60,000 and moderate risk. What should I do? [/INST] If you have ₹60,000 and are willing to take on a moderate level of risk, there are several investment options available to you. Here are some suggestions:\n\n1. Stocks: Investing in the stock market can be a good option if you're comfortable with taking on some risk. You could consider investing in a diversified portfolio of stocks or mutual funds that track the performance of the stock market as a whole. However, it's important to remember that the value of your investments can fluctuate over time, so it's always a good idea to invest with caution.\n2. Real Estate: Real estate can be a lucrative investment option, but it also carries a higher level of risk. You could consider buying a property in a popular area or inves

In [67]:
tool_calls = [{"name": "analyze_user_tool", "args": {"surplus": 60000, "risk_profile": "moderate"}, "id": "1", "type": "tool_call"}]
result = tool_node.invoke(tool_calls)
result

Returning this from custom AUT{'surplus': 60000, 'allocation': {'equity': 50, 'debt': 40, 'gold': 10}},<class 'str'>


{'messages': [ToolMessage(content='{"surplus": 60000, "allocation": {"equity": 50, "debt": 40, "gold": 10}}', name='analyze_user_tool', tool_call_id='1')]}

In [68]:
from langgraph.prebuilt import create_react_agent
agent2 = create_react_agent(
    model = chat_model,
    tools = tool_node,
    prompt="You are a helpful assistant",
)

In [72]:
from langchain_core.messages import HumanMessage
user_query = "I have ₹60,000 and moderate risk. What should I do?"
inputs = {"messages": [HumanMessage(content=user_query)]}

In [70]:
try:
    final_state = agent2.invoke(inputs)

    if final_state and "messages" in final_state and final_state["messages"]:
        last_message = final_state["messages"][-1]
        if hasattr(last_message, 'content'):
            print("Final Answer (from last message):")
            print(last_message.content)
        else:
            print("Final state messages:", final_state["messages"])
    else:
        print("Agent finished, but no explicit 'messages' in final state:", final_state)

except Exception as e:
    print(f"An error occurred during invocation: {e}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Final Answer (from last message):
<s> [INST] You are a helpful assistant

I have ₹60,000 and moderate risk. What should I do? [/INST] If you have ₹60,000 and are willing to take on moderate risk, there are several investment options available to you:

1. Stocks: Investing in the stock market can be a good option if you are willing to take on some risk. The stock market has historically provided higher returns over the long term, but it is also more volatile in the short term.
2. Mutual Funds: Mutual funds are a type of investment vehicle that pools money from multiple investors to invest in a diversified portfolio of stocks, bonds, or other assets. They offer a lower risk profile compared to individual stocks, as they provide exposure to a range of asset classes.
3. Real Estate: Investing in real estate can be another option for those willing to take on some risk. This could involve buying a property to rent out, investing in a real estate investment trust (REIT), or investing in a cro

In [50]:
# inputs = {"messages": [{"role": "user", "content": "what is the weather in sf"}]}
# for chunk in agent2.stream(inputs, stream_mode="updates"):
#     print(chunk)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{'agent': {'messages': [AIMessage(content="<s> [INST] You are a helpful assistant\n\nwhat is the weather in sf [/INST] I'm sorry, but I cannot provide you with real-time information about the weather in San Francisco. However, you can check a reliable weather website or app for up-to-date information on the current weather conditions and forecasts for San Francisco.", additional_kwargs={}, response_metadata={}, id='run--6c399ccf-791b-4acf-a887-26fa725e1e5e-0')]}}


In [47]:
# executor = AgentExecutor.from_agent_and_tools(
#     agent=agent,
#     tools=tools,
#     verbose=False,
#     handle_parsing_errors=True,
#     callbacks=[CleanAgentStepCallbackHandler()]
# )

executor2 = AgentExecutor(
    agent=agent2,
    tools=tools,
    verbose=False,
    handle_parsing_errors=True,
    return_intermediate_steps=True,
    callbacks=[CleanAgentStepCallbackHandler()]
)

In [48]:
executor2.invoke({"input": "Where is the hometown of the 2007 US PGA championship winner and his score?"})

ValueError: At least one HumanMessage must be provided!

In [39]:
# Step 4: Start interaction loop
try:
    response = executor2.invoke({
        "input": "I have ₹60,000 and moderate risk. What should I do?"
    })
    print(f"\n🤖 Advisor: {response['output']}\n")
except Exception as e:
    print("❌ An unexpected error occurred while processing your request.")
    print(traceback.format_exc())

❌ An unexpected error occurred while processing your request.
Traceback (most recent call last):
  File "/tmp/ipykernel_4076267/136789718.py", line 3, in <module>
    response = executor2.invoke({
  File "/users/trgl1183/miniconda3/envs/test_env/lib/python3.9/site-packages/langchain/chains/base.py", line 165, in invoke
    self._call(inputs, run_manager=run_manager)
  File "/users/trgl1183/miniconda3/envs/test_env/lib/python3.9/site-packages/langchain/agents/agent.py", line 1625, in _call
    next_step_output = self._take_next_step(
  File "/users/trgl1183/miniconda3/envs/test_env/lib/python3.9/site-packages/langchain/agents/agent.py", line 1325, in _take_next_step
    list(
  File "/users/trgl1183/miniconda3/envs/test_env/lib/python3.9/site-packages/langchain/agents/agent.py", line 1352, in _iter_next_step
    output = self._action_agent.plan(
  File "/users/trgl1183/miniconda3/envs/test_env/lib/python3.9/site-packages/langchain/agents/agent.py", line 455, in plan
    for chunk in sel