# In [None]:
import os

import faiss
import numpy as np
import openai

# Set up your OpenAI API key.
# Prefer the OPENAI_API_KEY environment variable so the real secret never has
# to be written into (and committed with) this file; the placeholder below is
# only used when the variable is unset or empty.
api_key = os.environ.get('OPENAI_API_KEY') or 'PUT HERE YOUR OPENAI API KEY'

# Sample documents: each list entry is one guideline text (general description,
# exclusions and inclusions for an industry code). Every entry is embedded as
# a single unit further down in this script.
documents = [
    """
guideline:    
    Industry: 01

covers the two activities of production of crop products and production of animal products. It also includes organic farming and the cultivation of genetically modified crops and the keeping of genetically modified livestock.

covers both open-air and greenhouse farming.

It also includes the preparation of agricultural products for the commodity markets.

This division also includes the provision of services related to agriculture and commercial hunting, as well as trapping and related activities.

Group 01.5 (Mixed farming) is an exception to the basic rules for determining the main activity. It is assumed that many farms have a balance between crop and animal production and it would be arbitrary to classify them in one category or the other. does not include:
- processing of agricultural products other than preparation for the raw material markets (see divisions 10 and 12)
- creation of fields (e.g. terracing and drainage of agricultural land, creation of rice fields, etc.) (see section F)
- purchasing associations and agricultural marketing cooperatives (see section G)
- horticulture and landscaping (see 81.30.1)

exclusions:
False Keywords:
- F: construction
- G: trade; maintenance and repair of motor vehicles and motorcycles
- 10: manufacture of foodstuffs
- 12: tobacco processing
- 81.30.1: horticulture and landscaping
Processing of agricultural products other than preparation for the raw material markets (see divisions 10 and 12)
Field preparation (e.g. terracing and drainage of agricultural land, rice paddies, etc.) (see section F)
Purchasing associations and agricultural marketing cooperatives (see section G)
Gardening and landscaping (see 81.30.1)
F: Construction
G: Trade; repair of motor vehicles and motorcycles
10: Manufacture of foodstuffs
12: Manufacture of tobacco products
81.30.1: Gardening and landscaping

inclusions:
covers the two activities of production of crop products and production of animal products. It also includes organic farming and the cultivation of genetically modified crops and the rearing of genetically modified livestock. covers both open-air and greenhouse farming. It also includes the preparation of agricultural products for the commodity markets.Also included in this division are the provision of services related to agriculture and commercial hunting, as well as trapping and related activities.Group 01.5 (Mixed farming) is an exception to the basic rules for determining the main activity. It is assumed that many farms have a balance between crop and animal production and it would be arbitrary to classify them in one category or the other.

    """,
    # Add more guideline texts here, one string per list entry.
]

# Free-text description of the company to classify; the surrounding newlines
# are part of the value that gets embedded and inserted into the prompt.
query = "\nCompany is producing crop products.\n"

# Function to get embeddings using OpenAI's embedding model
def get_embedding(text, model='text-embedding-ada-002'):
    """Return the embedding of ``text`` as a 1-D float32 NumPy array.

    The request is sent through a client built from the module-level
    ``api_key``, the same credential the chat-completion call in this script
    uses. Calling ``openai.embeddings.create`` directly would go through the
    library's module-level client instead, which does not see ``api_key``.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        ``np.ndarray`` of dtype float32 holding the embedding vector.
    """
    client = openai.OpenAI(api_key=api_key)
    response = client.embeddings.create(input=text, model=model)
    # A single input string yields exactly one entry in ``response.data``.
    return np.array(response.data[0].embedding, dtype='float32')

# Embed every document (one get_embedding call per entry) and collect the
# resulting vectors into a single NumPy array.
document_embeddings = np.array(list(map(get_embedding, documents)))

# Normalize embeddings
def normalize_embeddings(embeddings):
    """Scale each row of ``embeddings`` to unit Euclidean (L2) length.

    Args:
        embeddings: 2-D array with one vector per row.

    Returns:
        Array of the same shape in which every non-zero row has norm 1.
        All-zero rows are returned as zeros rather than being turned into
        NaN by a 0/0 division.
    """
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # A zero vector has no direction. Dividing it by 1 keeps it at zero and
    # avoids NaNs that would otherwise propagate into every similarity score.
    norms[norms == 0] = 1
    return embeddings / norms

normalized_embeddings = normalize_embeddings(document_embeddings)

# Build a flat Faiss index over the normalized document vectors. The index
# scores by inner product, which on unit-length vectors is the cosine
# similarity.
# NOTE(review): the index is not queried in the code visible here; the MMR
# step works directly on ``normalized_embeddings``.
dimension = normalized_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(normalized_embeddings)

# Embed the query and bring it to unit length as well, so it is on the same
# scale as the document vectors.
query_embedding = get_embedding(query)
query_norm = np.linalg.norm(query_embedding)
query_embedding = query_embedding / query_norm

# Maximal Marginal Relevance reranking
def mmr(doc_embeddings, query_embedding, top_k=5, diversity=0.5):
    """Greedily pick up to ``top_k`` documents, trading relevance for novelty.

    Each round scores every remaining document as
    ``(1 - diversity) * sim(doc, query) - diversity * max sim(doc, picked)``
    using dot products, and takes the highest-scoring one.

    Args:
        doc_embeddings: NumPy array with one document vector per row.
        query_embedding: 1-D query vector.
        top_k: Maximum number of documents to return.
        diversity: Weight of the redundancy penalty (0 ranks purely by
            similarity to the query).

    Returns:
        List of row indices into ``doc_embeddings`` in selection order.
    """
    vectors = doc_embeddings.astype('float32')
    remaining = list(range(len(doc_embeddings)))
    picked = []
    relevance_weight = 1 - diversity

    for _ in range(top_k):
        if len(remaining) == 0:
            break

        pool = vectors[remaining]
        # Relevance: dot product of each remaining document with the query.
        relevance = np.dot(pool, query_embedding)

        # Redundancy: highest dot product with anything already picked.
        # Nothing is picked in the first round, so there is no penalty then.
        if picked:
            redundancy = np.dot(pool, vectors[picked].T).max(axis=1)
        else:
            redundancy = np.zeros(len(remaining))

        scores = relevance_weight * relevance - diversity * redundancy
        # argmax is a position within ``remaining``, not a document index.
        best_position = int(np.argmax(scores))
        picked.append(remaining.pop(best_position))

    return picked

# Rerank the documents with MMR and map the chosen row indices back to the
# original texts.
top_k = 3
diversity = 0.7  # Adjust between 0 (most similar) and 1 (most diverse)
selected_indices = mmr(
    normalized_embeddings,
    query_embedding,
    top_k=top_k,
    diversity=diversity,
)
selected_documents = [documents[position] for position in selected_indices]

# Build the two chat messages: a fixed system instruction and a user message
# holding the company description followed by the retrieved guideline texts.
system_prompt = """
    You are a helpful AI assistant with access to descriptions of German industry codes and a set of information
    about the company . Based on the provided information, your role is to match German industry codes to the data provided.
    Do not infer or guess information that is not explicitly stated in the provided information.

    PROVIDE ONLY THE MAIN_CODE AND OTHER_CODES KEYS WITH THEIR RESPECTIVE VALUES; NEVER INCLUDE ANY EXPLANATION
    OR CONTEXT IN THE OUTPUT
    
    the example output should be:
    'main_code': '11.11.1'
    'other_codes': [22.22.2, 33.33.3]
"""
prompt_header = f"""
    Below you can find descriptions of the data for the company:
    {query}\n\n
    Here are the descriptions of German industry codes (Klassifikation der Wirtschaftszweige, Ausgabe 2008).
    Only base your answers on the codes provided below; do not use any other codes apart from those below:\n
"""
# One "Example N: ..." line per retrieved document, numbered from 1 in
# selection order.
example_lines = [
    f"Example {number}: {text}\n"
    for number, text in enumerate(selected_documents, start=1)
]
user_prompt = prompt_header + "".join(example_lines)

# Send the system and user prompts to GPT-4 through a client built from the
# module-level api_key.
client = openai.OpenAI(api_key=api_key)
chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]
response = client.chat.completions.create(
    model="gpt-4",
    messages=chat_messages,
)

# Print the text of the first returned choice.
classification = response.choices[0].message.content
print("Classification Result:")
print(classification)