# Industry code classification for company descriptions with an LLM and RAG

In [19]:
%%capture --no-display
!pip install langchain_community
!pip install langchain_openai
!pip install python-dotenv
!pip install faiss-cpu

In [20]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import openai
import numpy as np
import pandas as pd

# Load environment variables from a local .env file; that file is expected to
# contain a line of the form OPENAI_API_KEY="*****".
load_dotenv()

# Read the guideline CSV without treating its first row as column names.
documentsDf = pd.read_csv(
    'data/industry_guidelines.csv',
    header=None,
    quotechar='"',
)
# Only the first CSV row is used: each of its cells becomes one document.
documents = documentsDf.iloc[0].tolist()


In [21]:
# Company description to classify. It is used as the query text for the
# document retrieval step and is also inserted into the user prompt.
query = """
Company is producing crop products.
"""

def get_embedding(text: str, model: str = 'text-embedding-ada-002') -> np.ndarray:
    """
    Get the embedding of a text using one of OpenAI's embedding models.

    Parameters
    ----------
    text : str
        The text to embed.
    model : str, optional
        Name of the embedding model to request. Defaults to
        'text-embedding-ada-002', the value that was previously hard-coded,
        so existing single-argument calls behave as before.

    Returns
    -------
    np.ndarray
        One-dimensional float32 array holding the embedding of `text`.
    """
    response = openai.embeddings.create(
        input=text,
        model=model,
    )
    # A single input string is sent, so only the first entry of the response is read.
    embedding = response.data[0].embedding
    return np.array(embedding, dtype='float32')

# Embed every document once and pair each text with its vector.
document_embeddings = list(zip(documents, map(get_embedding, documents)))
# Build the FAISS store from the precomputed (text, vector) pairs; the
# OpenAIEmbeddings instance is passed to the store as its embedding object.
faiss = FAISS.from_embeddings(document_embeddings, OpenAIEmbeddings())


In [22]:
# Retrieve the 14 stored documents most similar to the query.
# NOTE(review): no distance metric is configured here, so the vector store's
# default applies — confirm whether that is cosine similarity or L2 distance.
retriever = faiss.as_retriever(
    search_type='similarity',
    search_kwargs={"k": 14},
)
selected_documents = retriever.invoke(query)


In [23]:
# Prepare the prompts for GPT-4.
# System prompt: states the assistant's role (matching German industry codes to
# the company data) and restricts the output to the main_code / other_codes keys.
system_prompt = """
    You are a helpful AI assistant with access to descriptions of German industry codes and a set of information
    about the company. Based on the provided information, your role is to match German industry codes to the data provided.
    Do not infer or guess information that is not explicitly stated in the provided information.

    PROVIDE ONLY THE MAIN_CODE AND OTHER_CODES KEYS WITH THEIR RESPECTIVE VALUES; NEVER INCLUDE ANY EXPLANATION
    OR CONTEXT IN THE OUTPUT
    
    the example output should be:
    'main_code': '11.11.1'
    'other_codes': [22.22.2, 33.33.3]
"""
# User prompt: the company description (query) followed by a heading for the
# industry code descriptions; the retrieved documents are appended after it.
user_prompt = f"""
    Below you can find description of the data for the company:
    {query}\n\n
    Here are the descriptions of German industry codes (Klassifikation der Wirtschaftszweige, Ausgabe 2008).
    Base your answers only on the codes provided below; do not use any other codes apart from those below:\n
"""
# Append each retrieved industry code description to the user prompt.
# The retriever returns Document objects; formatting the object itself would
# embed its string representation (a page_content='...' wrapper) in the prompt,
# so only the plain text in `page_content` is inserted.
for idx, doc in enumerate(selected_documents):
    user_prompt += f"Example {idx+1}: {doc.page_content}\n"

# Send the system and user prompts, in that order, to the GPT-4 chat
# completions endpoint to obtain the classification.
chat_messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=user_prompt),
]
response = openai.OpenAI().chat.completions.create(
    model="gpt-4",
    messages=chat_messages,
)

# Print the classification result first, then echo both prompts for inspection.
for heading, body in (
    ("Classification Result:", response.choices[0].message.content),
    ("SYSTEM PROMPT", system_prompt),
    ("USER PROMPT", user_prompt),
):
    print(heading)
    print(body)

Classification Result:
'main_code': '01',
'other_codes': []
SYSTEM PROMPT

    You are a helpful AI assistant with access to descriptions of German industry codes and a set of information
    about the company. Based on the provided information, your role is to match German industry codes to the data provided.
    Do not infer or guess information that is not explicitly stated in the provided information.

    PROVIDE ONLY THE MAIN_CODE AND OTHER_CODES KEYS WITH THEIR RESPECTIVE VALUES; NEVER INCLUDE ANY EXPLANATION
    OR CONTEXT IN THE OUTPUT
    
    the example output should be:
    'main_code': '11.11.1'
    'other_codes': [22.22.2, 33.33.3]

USER PROMPT

    Below you can find description of the data for the company:
    
Company is producing crop products.



    Here are the descriptions of German industry codes (Klassifikation der Wirtschaftszweige, Ausgabe 2008).
    Base your answers only on the codes provided below; do not use any other codes apart from those below:

Exampl