In [None]:
# Optional setup: uncomment the lines below to install third-party packages used
# by this notebook. They are left commented out so a normal run does not
# reinstall anything.
# NOTE(review): the notebook also imports `dotenv` and `pandas`, which are not
# listed here — confirm they are provided by the environment.
#!pip install sentence-transformers
#!pip install umap-learn
#!pip install cluestar
#!pip install openai
#!pip install langchain

In [None]:
from dotenv import load_dotenv
from os import environ
load_dotenv()

from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage


# Name of the Azure OpenAI deployment the chat client talks to.
DEPLOYMENT_NAME = "gpt-35-turbo"

# Only the deployment name is passed explicitly. The other connection settings
# are kept below as commented-out keyword arguments; presumably they are picked
# up from the environment populated by load_dotenv() — TODO confirm.
chat_llm = AzureChatOpenAI(
    deployment_name=DEPLOYMENT_NAME,
    # openai_api_base=BASE_URL,
    # openai_api_version="2023-05-15",
    # openai_api_key=API_KEY,
    # openai_api_type="azure",
)

In [None]:
def query(content):
    """Send ``content`` to the chat model as a single human message.

    Args:
        content: Text of the message to send.

    Returns:
        The ``content`` attribute of the reply returned by ``chat_llm``.
    """
    message = HumanMessage(content=content)
    reply = chat_llm([message])
    return reply.content

In [None]:
# Smoke test: send one trivial prompt through query() and display the reply,
# to check that the chat client configured above responds at all.
query("do you read me?")

In [None]:
import torch
from sentence_transformers import SentenceTransformer

# Pick the torch device for the embedding model. "mps" (Apple-Silicon GPU) is
# still preferred, as before, but the notebook no longer fails on machines
# without it: fall back to CUDA when present, otherwise to the CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Sentence-embedding model used to embed the instruction texts.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [None]:
import pandas as pd
# Read the processed Guanaco file; lines=True parses it as JSON Lines
# (one JSON record per line).
df = pd.read_json(
    path_or_buf="../../data/processed/guanaco.jsonl",
    lines=True,
)

In [None]:
# Text that gets embedded: the instruction, a blank line, then the input.
# Missing values are treated as empty strings; otherwise a row lacking either
# field would become NaN here and a non-string would reach the text encoder.
df['instruction_input'] = (
    df['instruction'].fillna("") + "\n\n" + df['input'].fillna("")
)

In [None]:
# Embed every combined instruction/input text, one embedding per row of df in
# the frame's current row order.
# The texts are handed over as a plain list rather than a pandas Series: integer
# subscripting on a Series goes through its index labels, so a list keeps the
# row <-> embedding correspondence purely positional whatever the index is.
instruction_embeddings = model.encode(df['instruction_input'].tolist())

In [None]:
from umap import UMAP


# UMAP is stochastic. A fixed random_state makes both projections (and the row
# ordering later derived from `x`) reproducible across kernel restarts.
# Trade-off: UMAP runs single-threaded when a seed is set, so this is slower.

# 1-D projection of the embeddings.
x = UMAP(n_neighbors=10, n_components=1, random_state=42).fit_transform(instruction_embeddings)
# Projection with UMAP's default settings.
X = UMAP(random_state=42).fit_transform(instruction_embeddings)

In [None]:
# Attach the 1-D projection as column 'x' and order the rows by it.
# NOTE: `x` is matched to rows by position, so this cell is only correct while
# df is still in the row order the embeddings were computed in; re-running it
# after df has already been sorted would pair rows with the wrong coordinate.
df = df.assign(x=x).sort_values(by='x')

In [None]:
# Instruction template placed in front of every batch of questions sent to the
# chat model. It asks for a KEEP/REMOVE decision and a reason per question
# identifier.
# The trailing .strip() drops the leading and trailing blank lines, so the text
# ends with "QUESTIONS:".
# NOTE(review): the FORMAT example uses single quotes and has no comma between
# the two fields, so it is not valid JSON although it is fenced as json —
# confirm whether the replies are meant to be machine-parsed.
base_prompt = """

You are a data curator.
Your knowledge cutoff is end of december 2022.
You have no knowledge of current or future events, that includes the weather and the current time.
You are not human, you have no feelings. You have no personal preferences.

You have two goals:
1. Remove duplicate content
2. Remove questions that are not aligned with the objective of training a model that is great at answering multiple choice or logic reasoning questions.

You will receive a set of json formatted questions.
Your task is to return for each if it should be:
A. Kept
B. Removed

Provide your answer in the following FORMAT

FORMAT:
```json
{
  < identifier >: {
    'decision': < "KEEP" or "REMOVE" >
    'reason': < "SIMILARITY" or "BAD QUESTION" "GOOD QUESTION" >
    }
}
```

QUESTIONS:
""".strip()

In [None]:
import json
# Number of rows sent to the chat model per request.
BATCH_SIZE = 15

# Raw model replies, one string per batch, in batch order.
output = []
for start in range(0, len(df), BATCH_SIZE):
    subset = df.iloc[start:start + BATCH_SIZE]
    # orient="index" serializes the batch as {row identifier: {column: value}},
    # i.e. one JSON object per question keyed by its identifier. That is the
    # structure the prompt's FORMAT section asks the model to answer with.
    # The default orientation is {column: {row identifier: value}}, which
    # scatters each question across every column.
    questions_json = json.dumps(subset.to_dict(orient="index"))
    # base_prompt ends with "QUESTIONS:"; start the payload on its own line.
    prompt = base_prompt + "\n" + questions_json
    out = query(prompt)
    output.append(out)

## Plotting

In [None]:
from cluestar import plot_text

# df was re-ordered with sort_values(by='x') earlier, while the rows of X are
# still in the order the embeddings were computed in. Put the texts back into
# index order so each plotted point is labelled with its own text instead of
# the text of an unrelated row.
# Assumes the index was ascending when the embeddings were computed (true for
# the default index produced by read_json) — TODO confirm if df is re-indexed.
plot_text(X, df['instruction_input'].sort_index())