In [1]:
import pandas as pd
import pickle
from langchain_community.vectorstores import chroma as Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_core.vectorstores import VectorStoreRetriever
from langchain_community.document_loaders import DataFrameLoader
from langchain.indexes import SQLRecordManager, index
from langchain_core.documents import Document
import json
from Text_preprocessing import Text_preprocessing

In [2]:
##FSD_1777
dataPath = "/home/mbhatti/mnt/d/LLM-repo1/models/langchain_implementation/FSD1777_Oct23.json"
dateFrom = "2023-10-19T09:00:00+00:00" #2023-10-19T18:58:41Z for 200 tweets
dateTo = "2023-10-19T18:00:00+00:00"

"""Load relevant fields of flood tags api json response"""
def json_dataloader(dataPath = dataPath, dateFrom = dateFrom, dateTo = dateTo):
    # Load json and extract relevant records in pandas df
    with open(dataPath, 'r') as json_file:
        response_dict = json.load(json_file)

    # Convert to pandas df    
    pd.set_option('display.max_colwidth', None)
    df = pd.DataFrame(response_dict)
    df['date'] = pd.to_datetime(df['date'])
    df = df.drop(columns=['id','tag_class', 'source', 'lang', 'urls','locations'])

    #Get data between thresholds
    threshold_datetime_lower = pd.to_datetime(dateFrom)
    threshold_datetime_upper = pd.to_datetime(dateTo)
    df = df[df['date'] >= threshold_datetime_lower]
    df = df[df['date'] <= threshold_datetime_upper]

    #Remove duplicates
    df  = df.drop_duplicates(subset=["text"], keep=False)
    #Pre-process
    preprocess = Text_preprocessing(df)
    df = preprocess.preprocess()
    #Covert date to string
    df['date'] = df['date'].astype(str)
    return df

# Load the data from source
data = json_dataloader()

#### Filtering on the data

In [5]:
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_lg")

In [39]:
# Sample text
text = "All residents of #Brechin town are being told to evacuate due to #StormBabet - with the river through the town expected to rise over the coming hours and cause severe flooding. Local buses will end by 6pm, as the red warning advises against all travel in the area"

# Process the text with spaCy
doc = nlp(text)

# Extract location entities
locations = [ent.text for ent in doc.ents]
# Extract location entities

# Print the extracted locations
print(locations)

['StormBabet', 'the coming hours', '6pm']


In [6]:
#Implementation on the dataframe
# Predefined list of entities to match
predefined_entities = ["Brechin"]

# Function to extract entities from text
def extract_entities(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents]

# Iterate over the DataFrame and delete rows if entities match
for index, row in data.iterrows():
    entities = extract_entities(row["text"])
    if any(entity in predefined_entities for entity in entities):
        data.drop(index, inplace=True)



In [9]:
# Display the resulting DataFrame
# with pd.option_context('display.max_rows', None,
#                        'display.max_columns', None,
#                        'display.precision', 3,
#                        ):
print(data)

                           date  \
4187  2023-10-19 17:59:34+00:00   
4188  2023-10-19 17:58:01+00:00   
4190  2023-10-19 17:56:11+00:00   
4191  2023-10-19 17:56:10+00:00   
4192  2023-10-19 17:54:48+00:00   
...                         ...   
4737  2023-10-19 09:07:02+00:00   
4738  2023-10-19 09:03:44+00:00   
4739  2023-10-19 09:00:39+00:00   
4740  2023-10-19 09:00:15+00:00   
4741  2023-10-19 09:00:07+00:00   

                                                                                                                                                                                                                                                    text  
4188                                                                                                                                                  Does Rich Sunak have investments in Israel? Shouldn’t he be visiting Braemar and the floods today?  
4190                            The reality of rapid river level rise due to 

#### Vector store ops

In [7]:
def bgeEmbeddings():
    model_name = "BAAI/bge-large-en-v1.5"
    model_kwargs = {'device': 'cuda'}
    encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
    model = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )
    return model

In [8]:
embeddings = bgeEmbeddings()

documents = []
loader = DataFrameLoader(data, page_content_column="text")
documents.extend(loader.load())
#Create a vector store
# db = Chroma.Chroma("Langchain collection",embeddings)
db = Chroma.Chroma.from_documents(documents,
                                  embeddings)
if db._client.list_collections() != None:

  for collection in db._client.list_collections():
    ids = collection.get()['ids']
    print('REMOVE %s document(s) from %s collection' % (str(len(ids)), collection.name))
    if len(ids): collection.delete(ids)

db = Chroma.Chroma.from_documents(documents,
                                  embeddings)

REMOVE 1015 document(s) from langchain collection


In [10]:
# Get all embeddings
len(db._collection.get()['ids'])

460

In [15]:
retriever = db.as_retriever(search_kwargs={'k': 25})
query = """Which locations have specifically received evacutaion orders?"""
docs = retriever.get_relevant_documents(query=query)
print(docs)

