In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
!which python

/root/anaconda3/envs/dipmaind/bin/python


In [3]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/workspace/transformers_cache/'

In [4]:
CSV_NAICS_DESC_PATH = '../data/naics_descriptions.csv'

## Installing all the dependencies

In [5]:
!pip -q install langchain huggingface_hub  langchain_openai lancedb openai  tiktoken  rank_bm25 pypdf angle-emb scikit-learn matplotlib sentence-transformers

[0m

## Hybrid Search

**BM25 Retriever** - Sparse retriever

**Embeddings** - Dense retrievers Lancedb

`Hybrid search = Sparse + Dense retriever`

## Load the data

In [6]:
# load single pdf
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders.csv_loader import CSVLoader

loader = CSVLoader(CSV_NAICS_DESC_PATH)
pages = loader.load_and_split()

## Importing all the libraries

In [7]:
from langchain.vectorstores import LanceDB
import lancedb
from langchain.retrievers import BM25Retriever, EnsembleRetriever, TFIDFRetriever
from langchain.schema import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

## Initialize Embeddings

In [8]:
from angle_emb import AnglE, Prompts

angle = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls').cuda()
angle.set_prompt(prompt=Prompts.C)
vec = angle.encode({'text': 'hello world'}, to_numpy=True)
print(vec)
vecs = angle.encode([{'text': 'hello world1'}, {'text': 'hello world2'}], to_numpy=True)
print(vecs)

INFO:AnglE:Prompt is set, the prompt will be automatically applied during the encoding phase. To disable prompt setting, please configure set_prompt(prompt=None)


[[ 0.37528032  0.08762018  0.52706105 ... -0.2676602   0.02662158
  -0.14156052]]
[[ 0.49359986  0.16666076  0.46393117 ... -0.16278437 -0.10181466
  -0.2392829 ]
 [ 0.2607216   0.63581985  0.8870626  ... -0.40785697 -0.2537569
  -0.15815191]]


In [9]:
import torch

torch.set_default_device('cuda:0')

In [10]:
from typing import Any, Optional
from langchain_core.pydantic_v1 import BaseModel, Extra, Field
from langchain_core.embeddings import Embeddings


DEFAULT_UAE_MODEL = "WhereIsAI/UAE-Large-V1"

class HuggingFaceUaeEmbeddings(BaseModel, Embeddings):
    """HuggingFace UAE sentence embedding models.
    Arxiv: https://arxiv.org/abs/2309.12871
    To use, you should have the ``angle_emb`` python package installed.
    Example:
        .. code-block:: python
            from langchain_community.embeddings import HuggingFaceUaeEmbeddings
            model_name = "WhereIsAI/UAE-Large-V1"
            model_kwargs = {
                'device': 'cpu',
                'pooling_strategy': 'cls',
            }
            encode_kwargs = {'to_numpy': True}
            prompt = None
            hf = HuggingFaceUaeEmbeddings(
                model_name=model_name,
                model_kwargs=model_kwargs,
                encode_kwargs=encode_kwargs,
                prompt=prompt
            )
    """

    client: Any  #: :meta private:
    model_name: str = DEFAULT_UAE_MODEL
    """Model name to use."""
    model_kwargs: dict[str, Any] = Field(default_factory=dict)
    """Keyword arguments to pass to the model."""
    encode_kwargs: dict[str, Any] = Field(default_factory=dict)
    """Keyword arguments to pass when calling the `encode` method of the model."""
    prompt: Optional[str] = None
    """prompt argument"""

    def __init__(self, **kwargs: Any):
        """Initialize the angle_emb."""
        super().__init__(**kwargs)
        try:
            import angle_emb

        except ImportError as exc:
            raise ImportError(
                "Could not import angle_emb python package. "
                "Please install it with `pip install angle_emb`."
            ) from exc

        # self.client = angle_emb.AnglE(
        #     self.model_name, **self.model_kwargs
        # ).cuda()

        self.client = angle_emb.AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls').cuda()
        self.client.set_prompt(prompt=self.prompt)

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Compute doc embeddings using a HuggingFace transformer model.
        Args:
            texts: The list of texts to embed.
        Returns:
            List of embeddings, one for each text.
        """
        texts = [t.replace("\n", " ") for t in texts]
        if isinstance(self.prompt, str):
            texts = [{'text': text} for text in texts]
        embeddings = self.client.encode(texts, **self.encode_kwargs)
        return embeddings.tolist()

    def embed_query(self, text: str) -> list[float]:
        """Compute query embeddings using a HuggingFace transformer model.
        Args:
            text: The text to embed.
        Returns:
            Embeddings for the text.
        """
        return self.embed_documents([text])[0]

In [11]:
embedding = HuggingFaceUaeEmbeddings()

In [12]:
from langchain_community.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': False}
embedding2 = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


## Initialize the BM25

In [13]:
# Initialize the BM25 retriever
bm25_retriever = BM25Retriever.from_documents(pages)
bm25_retriever.k = 2  # Retrieve top 2 results

print("type of bm25", type(bm25_retriever))

type of bm25 <class 'langchain_community.retrievers.bm25.BM25Retriever'>


In [14]:
tfidf_retriever = TFIDFRetriever.from_documents(pages)
tfidf_retriever.k = 2  # Retrieve top 2 results

print("type of tfidf", type(tfidf_retriever))

type of tfidf <class 'langchain_community.retrievers.tfidf.TFIDFRetriever'>


## Initialize the database

In [15]:
db = lancedb.connect("/tmp/lancedb")
table = db.create_table(
    "pandas_docs",
    data=[
        {
            "vector": embedding.embed_query("Unknown"),
            "text": "Unknown",
        },

    ],
    mode="overwrite",
)

# table2 = db.create_table(
#     "pandas_docs2",
#     data=[
#         {
#             "vector": embedding2.embed_query("Unknown"),
#             "text": "Unknown",
#         },

#     ],
#     mode="overwrite",
# )

## Instantiate the retriever

In [16]:
# Initialize LanceDB retriever
docsearch = LanceDB.from_documents(pages, embedding, connection=table)
retriever_lancedb = docsearch.as_retriever(search_kwargs={"k": 2})

# Initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, retriever_lancedb], weights=[0.2, 0.8]
)

## Query

In [17]:
# # Example customer query
# query = "BestToursRo"


# # Retrieve relevant documents/products
# # docs = ensemble_retriever.get_relevant_documents(query)
# docs = retriever_lancedb.get_relevant_documents(query)
# # docs = bm25_retriever.get_relevant_documents(query)
# # docs = tfidf_retriever.get_relevant_documents(query)


# # Extract and print only the page content from each document
# for doc in docs:
#     print('BEFORE')
#     # print(doc.page_content)
#     print(doc)
#     print('AFTER')
#     print('\n')


# # docs

# Evaluate RAG

In [18]:
# Initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[tfidf_retriever, retriever_lancedb], weights=[0.1, 0.9]
)

# docsearch2 = LanceDB.from_documents(pages, embedding2, connection=table2)
# retriever_lancedb2 = docsearch2.as_retriever(search_kwargs={"k": 2})

# ensemble_retriever = EnsembleRetriever(
#     retrievers=[retriever_lancedb2, retriever_lancedb], weights=[0.5, 0.5]
# )

def get_naics(query):
    docs = ensemble_retriever.get_relevant_documents(query)
    # docs = retriever_lancedb.get_relevant_documents(query)
    # docs = bm25_retriever.get_relevant_documents(query)
    # docs = tfidf_retriever.get_relevant_documents(query)
    print('CALLED get_naics')

    # Extract and print only the page content from each document
    for doc in docs:
        pred_text = doc.page_content
        if pred_text != 'Unknown':
            return pred_text.split('\n')[0][-3:]

    return 'Unknown'

In [19]:
import time
import requests

base_url = 'http://116.202.111.229:8000'
api_key = 'Yhd6ykfBmyvyuqEQQV0n7GdPPNXMklgO'

headers = {
    'x-api-key': api_key
}

# Get hints about a new company
#response = requests.get(f"{base_url}/evaluate/reset", headers=headers)

#print(response.status_code, response.json())


s = ''

for stage in range(1, 6):
    # Get a new hint for current company or get the first hint for a new company after calling /evaluate/reset
    response = requests.get(f"{base_url}/evaluate/hint", headers=headers)

    # print('RESPONS', response.json())
    hint = response.json()['hint']
    print('HINT', hint)
    s = s + '\n' + hint

    # Post your answer for current hint
    if stage == 5:
        data = {
            'answer': 'abstain'
        }
    else:
        data = get_naics(s)
        data = {
            'answer': data
        }
    print('DATA', data)
 
    response = requests.post(f"{base_url}/evaluate/answer", json=data, headers=headers)
    print(response.status_code, response.json())


HINT Oncall Care Agency is a specialist employment business, which supplies to care homes, hospitals and other healthcare clients in Buckinghamshire with highly trained nurses, social workers, care assistants, support workers and more.
CALLED get_naics
DATA {'answer': '623'}
429 {'title': 'Reached allowed limit 1 hits per 1 second!'}


KeyError: 'hint'

In [None]:
# # SUPERVISED_PATH = 'drive/MyDrive/HBN2/data/first_100_rows.csv'
# SUPERVISED_PATH = '../data/first_2k_fixed.csv'


In [None]:
# import pandas as pd

# df_supervised = pd.read_csv(SUPERVISED_PATH)

# business_one = df_supervised.groupby('Main Business Category')['predicted_naics'].apply(lambda x: len(x)) == 1

# b1 = business_one.index[business_one]

# df_2col = df_supervised[df_supervised['Main Business Category'].isin(b1)][['Main Business Category', 'predicted_naics']]

# mbc_map = {k:v for k, v in zip(df_2col['Main Business Category'].tolist(), df_2col['predicted_naics'].tolist())}
# mbc_map

In [None]:
# import pandas as pd
# import numpy as np
# from tqdm import tqdm

# def compute_and_evaluate(SUPERVISED_PATH):
#     df = pd.read_csv(SUPERVISED_PATH)
#     stages = ['Commercial Name', 'Business Tags', 'Short Description', 'Long Description', 'Main Business Category']
#     df_benchmark = pd.DataFrame(columns=stages)

#     for i in range(len(stages)):
#         cols = stages[:(i+1)]

#         if i == 0:
#             df['t1'] = df[stages[0]]
#             continue
#         # TODO: undo
#         df[f't{i+1}'] = df[cols].apply(lambda row: '\n---\n'.join(row.values.astype(str)), axis=1)
#         # df[f't{i+1}'] = df[cols[-1]]

#     df_orig = df.copy()
#     COUNT = 0

#     df = df.drop(columns=stages)
#     score_matrix = np.zeros(shape=(len(df), 5))

#     gains = [500, 400, 300, 200, 100]
#     penalties = [-50, -100, -150, -200, -250]

#     for i, row in tqdm(df.iterrows()):
#         ground_truth = row['predicted_naics']
#         for j, stage_info in enumerate(row[2:]):
#             if j == 4 and df_orig.loc[i, 'Main Business Category'] in mbc_map:
#                 pred = mbc_map[df_orig.loc[i, 'Main Business Category']]
#                 COUNT += 1
#                 print('COUNT', COUNT)
#             else:
#                 pred = int(get_naics(stage_info))
                
#             if pred == ground_truth:
#                 outcome = 1
#             else:
#                 outcome = -1
#             # print('ground_truth', ground_truth, type(ground_truth))
#             # print('pred', pred, type(pred))

#             # print(f'{outcome}*{ground_truth}*{pred}*')
#             score_matrix[i, j] = gains[j] if outcome > 0 else penalties[j]

#     # print(score_matrix[:3, :])
#     round_scores = score_matrix.sum(axis=1)

#     df_benchmark = pd.DataFrame(0, index=np.arange(len(df)), columns=[
#         'no-abstain',
#         *[f'abstain-t{stage}' for stage in range(1, 6)]
#     ])

#     df_benchmark.loc[:, 'no-abstain'] = round_scores
#     for stage in range(1, 6):
#         df_benchmark.loc[:, f'abstain-t{stage}'] = round_scores - score_matrix[:, stage-1]

#     return score_matrix, df_benchmark


In [None]:

# score_matrix, df_benchmark = compute_and_evaluate(SUPERVISED_PATH)

In [None]:
# df_score_matrix = pd.DataFrame(score_matrix, index=np.arange(score_matrix.shape[0]), columns=['t1', 't2', 't3', 't4', 't5'])

In [None]:
# df_score_matrix

In [None]:
# df_score_matrix.mean()

In [None]:
# df_score_matrix.median()

In [None]:
# print(df_score_matrix['t1'].value_counts() / len(df_score_matrix))
# print(df_score_matrix['t2'].value_counts() / len(df_score_matrix))
# print(df_score_matrix['t3'].value_counts() / len(df_score_matrix))
# print(df_score_matrix['t4'].value_counts() / len(df_score_matrix))
# print(df_score_matrix['t5'].value_counts() / len(df_score_matrix))

In [None]:
# df_benchmark

In [None]:
# df_benchmark.mean()

In [None]:
# df_benchmark.median()

In [None]:
# df_benchmark.hist(figsize=(20, 10))