In [1]:
import os
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.llms import openai
from langchain.chains import LLMChain, LLMRouterChain, MultiPromptChain, HypotheticalDocumentEmbedder, RetrievalQA
import dotenv
from langchain_core.prompts import PromptTemplate
from typing import Optional
import json
import pandas as pd
from Text_preprocessing import Text_preprocessing
from langchain_community.document_loaders import DataFrameLoader
from typing import List, Dict, Any, Mapping
from langchain.globals import set_debug

from langchain.chains.router.multi_prompt_prompt import MULTI_PROMPT_ROUTER_TEMPLATE

from langchain_community.vectorstores import chroma as Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_core.vectorstores import VectorStoreRetriever
from pydantic.v1 import Field
from langchain_core.documents import Document
from langchain_community.document_transformers import (
    LongContextReorder
)
from sentence_transformers import CrossEncoder
import datetime
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity,
    context_entity_recall
)
import llmModels 
import wandb
import pickle
import prompts
import groundTruths
from langchain.embeddings import SentenceTransformerEmbeddings
from FlagEmbedding import FlagReranker

dotenv.load_dotenv()

### PARAMETER CHECKPOINT ####

#Open AI
chatModelAI = ChatOpenAI(temperature=0)

# # # # Llama 2 13 B chat
# chatModel_llama13b = llmModels.loadLlamma()

# # # Mistral 7B chat
# chatModel_mistral7b = llmModels.loadMistral7b()

# #70B
# chatModel_llama70b = llmModels.loadLlama2_70B()

## Llama 3 8B
llm = llmModels.loadLlama3_8B() 

##FSD_1777
dataPath = "/home/mbhatti/mnt/d/LLM-repo1/models/langchain_implementation/FSD1777_Oct23.json"
dateFrom = "2023-10-19T09:00:00+00:00" #2023-10-19T18:58:41Z for 200 tweets
dateTo = "2023-10-19T18:59:00+00:00"


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.34s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
"""Load relevant fields of flood tags api json response"""
def json_dataloader(dataPath = dataPath, dateFrom = dateFrom, dateTo = dateTo):
    # Load json and extract relevant records in pandas df
    with open(dataPath, 'r') as json_file:
        response_dict = json.load(json_file)

    # Convert to pandas df    
    pd.set_option('display.max_colwidth', None)
    df = pd.DataFrame(response_dict)
    df['date'] = pd.to_datetime(df['date'])
    df = df.drop(columns=['id','tag_class', 'source', 'lang', 'urls','locations'])

    #Get data between thresholds
    threshold_datetime_lower = pd.to_datetime(dateFrom)
    threshold_datetime_upper = pd.to_datetime(dateTo)
    df = df[df['date'] >= threshold_datetime_lower]
    df = df[df['date'] <= threshold_datetime_upper]

    #Remove duplicates
    df  = df.drop_duplicates(subset=["text"], keep=False)
    #Pre-process
    preprocess = Text_preprocessing(df)
    df = preprocess.preprocess()
    #Covert date to string
    df['date'] = df['date'].astype(str)
    return df

def bgeEmbeddings():
    model_name = "BAAI/bge-large-en-v1.5"
    # model_name = "BAAI/bge-m3"
    model_kwargs = {'device': 'cuda'}
    encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
    model = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )
    return model

def data_embedding(data : list, eModel = "bge-large-en-v1.5", rType = "Query", metric = "L2", lang = "en"):
    """Vectorize the data using OpenAI embeddings and store in Chroma db"""
    if lang == "en":
        if eModel != "bge-large-en-v1.5":
            embeddings = OpenAIEmbeddings()
        else:
            embeddings = bgeEmbeddings()
        
        if (rType == "Hyde"):
            embeddings = hydeEmbedder(embeddings)

    if lang == "ja":
        embeddings = bgeEmbeddings()


    documents = []
    loader = DataFrameLoader(data, page_content_column="text")
    documents.extend(loader.load())

    #Change this -- removal of duplicates
    db = Chroma.Chroma.from_documents(documents,embeddings)
    if db._client.list_collections() != None:
        for collection in db._client.list_collections():
            ids = collection.get()['ids']
            print('REMOVE %s document(s) from %s collection' % (str(len(ids)), collection.name))
            if len(ids): collection.delete(ids)

    #Create a vector store
    if metric == "cosine":
        db = Chroma.Chroma.from_documents(documents,embeddings, collection_metadata={"hnsw:space": "cosine"})
    else:
        db = Chroma.Chroma.from_documents(documents,embeddings)
    print(len(db._collection.get()['ids']))
    return db

In [3]:
# llm = chatModel_llama3_8B
# Load the data from source
# data = dataframe_dataloader()
data = json_dataloader()

# Convert to vector store
vectorstore = data_embedding(data)

retriever = vectorstore.as_retriever(search_kwargs={'k': 30})

REMOVE 603 document(s) from langchain collection
603


In [4]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA
from langchain.agents import Tool
from langchain.chains import LLMChain

# chat completion llm
# llm = ChatOpenAI(
#     model_name='gpt-3.5-turbo',
#     temperature=0.0
# )


llmOpenAI = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

# conversational memory
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=1,
    return_messages=True,
    output_key='output'
)


#Prompt and chain for Twitter DB-----------------
prompt_template_llama3_loc = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Act as a location extractor and extract all relevant locations with respect to the user question.
<|eot_id|><|start_header_id|>user<|end_header_id|>
Answer the question based on the following context only: 
{context}
Question: {question}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
default_prompt = PromptTemplate(template = prompt_template_llama3_loc, input_variables = ['question', 'context'])

# retrieval qa chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": default_prompt},
    verbose = True,
    return_source_documents=True
)


#Prompt and chain for extracting geolocations --------------
from langchain.chains import LLMChain
from langchain import PromptTemplate
from langchain.chains import LLMChain

llm = ChatOpenAI(model_name='gpt-3.5-turbo-1106')

geoLocTemplate = """
Act as geo locator. Extract the geopoint coordinates according to the question in the following json format: 
{{'location':'Location name', 
'latitude' : 12.2,
'longitude' : 2.33
}}

question: {question}
"""

prompt_template = PromptTemplate(
    input_variables=["question"],
    template=geoLocTemplate,
)
# description = "It is software dev firm specifically focusing on automation software"
# prompt_template.format(firm_description=description)
llmGPT40 = ChatOpenAI(
    model_name='gpt-4o',
    temperature=0.0
)

chaingeo = LLMChain(llm=llmGPT40, prompt=prompt_template)   


tools = [
    Tool(
        name='Twitter database',
        func=qa.invoke,
        description=(
            'Use this tool to answer flooding related questions'
        )
    ),
    Tool(
    name='Geo location extraction',
    func=chaingeo.run,
    description=(
        'Use this tool to extract geolocation coordinates'
    )
    )
]

from langchain.agents import initialize_agent

# conv
agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=llmOpenAI,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=conversational_memory,
    return_source_documents=True,
    return_intermediate_steps=True
)

  warn_deprecated(
  warn_deprecated(


In [71]:
from langchain.agents import AgentExecutor
agent_executor = AgentExecutor(
    agent=agent, tools=tools, verbose=True, return_intermediate_steps=True
)

In [18]:
# query = """Which locations received flood warnings?"""
query = "What are the geo point locations of these places?"
results = agent(query)
# results = agent_executor.invoke({"input": "Which locations are receiving flood warnings"})
# agent.invoke({"input": "Any deaths reported due to flooding?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Geo location extraction",
    "action_input": "North Sea, North Shields, South Shields, Barbourne (Worcester), River Maun (near Retford), Bridlington (Yorkshire), Scarborough (Yorkshire), Tyne (at North and South Shields), Aberdeenshire, Angus, Dundee, Perth & Kinross, Kintore, Inverurie, Perthshire, Findhorn, Nairn, Moray, Speyside, Brechin, River Don (Aberdeenshire), River Isla (Blairgowrie), Haughton, Milton, West Drayton (Nottinghamshire), Sandsend (North Yorkshire), Glenrothes, River South Esk (Angus), Ireland"
}
```[0m
Observation: [33;1m[1;3m```json
[
    {'location': 'North Sea', 'latitude': 56.0, 'longitude': 3.0},
    {'location': 'North Shields', 'latitude': 55.008, 'longitude': -1.447},
    {'location': 'South Shields', 'latitude': 54.998, 'longitude': -1.432},
    {'location': 'Barbourne (Worcester)', 'latitude': 52.204, 'longitude': -2.224},
    {'location': 'River Maun (near Retford)

In [19]:
print(results)



In [1]:
import re
data = results['intermediate_steps'][0]
print(results['intermediate_steps'][0][0].tool)

#FOR TWITTER DATABASE

#Get response
if results['intermediate_steps'][0][0].tool == "Twitter database":
    print(results['intermediate_steps'][0][1]['result'])

#Get source docs
if results['intermediate_steps'][0][0].tool == "Twitter database":
    print(results['intermediate_steps'][0][1]['source_documents'])

# FOR GEO LOCATION

# Get response
if results['intermediate_steps'][0][0].tool == "Geo location extraction":
    print(results['intermediate_steps'][0][1])



NameError: name 'results' is not defined

In [None]:
from langchain.chains import LLMChain
from langchain import PromptTemplate
from langchain.chains import LLMChain

llm = ChatOpenAI(model_name='gpt-3.5-turbo-1106')

geoLocTemplate = """
Act as geo locator. Extract the geopoint coordinates according to the question in the following json format: 
{{'location':'Location name', 
'latitude' : 12.2,
'longitude' : 2.33
}}

question: {question}
"""

prompt_template = PromptTemplate(
    input_variables=["question"],
    template=geoLocTemplate,
)
# description = "It is software dev firm specifically focusing on automation software"
# prompt_template.format(firm_description=description)
chain = LLMChain(llm=llmOpenAI, prompt=prompt_template)   
# chain = LLMChain(llm=llm, prompt=prompt_template)

print(chain.run("What is geo location of London, Manchester, Brechin and Angus"))