# Merge all datasets and reduce the combined dataset using a similarity metric

In [None]:
!pip install -U sentence-transformers tqdm ipywidgets numba

In [None]:
import pandas as pd
import numpy as np
import pathlib as plb

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import tqdm

In [None]:
# Where the processed datasets are read from and where embeddings are cached.
DATA_DIR = "../../data/processed/"
EMBEDDING_DIR = "../../data/instruction_embeddings"
data_dir, embedding_dir = plb.Path(DATA_DIR), plb.Path(EMBEDDING_DIR)

# Create the embedding directory (and any missing parents) if it is not there yet.
embedding_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Materialise and sort the matches: `Path.glob` returns a one-shot generator
# that yields files in filesystem order. As a list, the load cell can be re-run,
# and the row order (and therefore any seeded shuffle of the merged data) no
# longer depends on the machine.
files = sorted(data_dir.glob("*.jsonl"))

In [None]:
# Read every JSON-lines file except the HELM one, tagging rows with the file
# stem whenever the file does not already provide a `dataset` column.
data = []
for jsonl_path in files:
    source_name = jsonl_path.stem
    if source_name != "helm_training_data":
        frame = pd.read_json(jsonl_path, lines=True)
        if "dataset" not in frame.columns:
            frame["dataset"] = source_name
        data.append(frame)

In [None]:
df_data = pd.concat(data, ignore_index=True)

In [None]:
df_data["dataset"].value_counts()

In [None]:
df_data_bias = df_data.loc[lambda df: df['dataset'].isin(['stereoset_intersentence', 'stereoset_intrasentence', 'crows_pairs'])].reset_index(drop=True)

In [None]:
df_data = df_data.loc[lambda df: ~df['dataset'].isin(['stereoset_intersentence', 'stereoset_intrasentence', 'crows_pairs'])]

In [None]:
# Load the HELM training data and discard its `method` column.
df_helm = pd.read_json(data_dir / "helm_training_data.jsonl", lines=True)
df_helm = df_helm.drop(columns=["method"])
df_helm.shape

### Filter data using cosine similarity

See the data-similarity notebook in the [Open-Platypus](https://github.com/arielnlee/Platypus/blob/main/data_pipeline/data_similarity.ipynb) repository.

In [None]:
df_data["combined_instruction"] = (df_data["instruction"] + ' ' + df_data["input"]).str.rstrip()

In [None]:
df_data["combined_instruction"]

In [None]:
import torch  # already installed as a dependency of sentence-transformers

# Use Apple's MPS backend when present (the previous hard-coded choice), else
# fall back to CUDA and finally the CPU so the notebook also runs elsewhere.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [None]:
concatenated_instructions = df_data["combined_instruction"].tolist()

In [None]:
instruction_embeddings = model.encode(concatenated_instructions)

In [None]:
instruction_embeddings.shape

In [None]:
np.save(EMBEDDING_DIR + "/instruction-MiniLM-L6-v2-embeddings", instruction_embeddings)

In [None]:
instruction_embeddings = np.load(EMBEDDING_DIR + "/instruction-MiniLM-L6-v2-embeddings.npy")

In [None]:
# Shuffle rows
# NOTE: only the DataFrame is shuffled here (fixed seed, original index labels
# are kept). `instruction_embeddings` is not reordered and stays in the
# pre-shuffle row order of `df_data`.
df_data_shuffled = df_data.sample(frac=1, random_state=3724376)

In [None]:
# One dict per row, in the shuffled order.
json_data = df_data_shuffled.to_dict(orient="records")

In [None]:
instruction_embeddings.shape

In [None]:
import numpy as np
import numba

@numba.jit(nopython=True, parallel=True)
def cosim(u, M):
    """Return the cosine similarity between vector ``u`` and every row of ``M``.

    Components where either ``u`` or the row holds NaN are left out of the dot
    product and of both norms. When either resulting norm is zero the score is
    defined as 1.0.

    Args:
        u: 1-D array.
        M: 2-D array whose rows are compared with ``u``; the first
            ``u.shape[0]`` entries of each row are read.

    Returns:
        1-D float64 array with one score per row of ``M``.
    """
    n_rows = M.shape[0]
    n_dims = u.shape[0]
    scores = np.zeros(n_rows)
    # Each row is scored independently. `prange` only runs in parallel when the
    # function is compiled with parallel=True; without it, it behaves as `range`.
    for i in numba.prange(n_rows):
        v = M[i]
        # Float accumulators keep the variable types stable inside the loop.
        udotv = 0.0
        u_norm = 0.0
        v_norm = 0.0
        for j in range(n_dims):
            if np.isnan(u[j]) or np.isnan(v[j]):
                continue

            udotv += u[j] * v[j]
            u_norm += u[j] * u[j]
            v_norm += v[j] * v[j]

        u_norm = np.sqrt(u_norm)
        v_norm = np.sqrt(v_norm)

        if (u_norm == 0) or (v_norm == 0):
            # Degenerate vector: avoid dividing by zero and report 1.0.
            scores[i] = 1.0
        else:
            scores[i] = udotv / (u_norm * v_norm)
    return scores

In [None]:
instruction_embeddings.shape

In [None]:
cosim(instruction_embeddings[0], instruction_embeddings[:10000,:])

In [None]:
existing_embeddings = np.empty((0, 384), np.float32)

In [None]:
np.concatenate((existing_embeddings, instruction_embeddings[0,:].reshape(1,-1)), axis=0).shape

In [None]:
import tqdm.notebook  # plain `import tqdm` does not load the notebook submodule

# A record is kept only if its highest similarity to the records kept so far
# does not exceed this value.
SIMILARITY_THRESHOLD = 0.75

if len(instruction_embeddings) != len(df_data):
    raise ValueError(
        f"{len(instruction_embeddings)} embeddings for {len(df_data)} rows; "
        "re-compute the embeddings for the current data"
    )

# `json_data` is in the shuffled row order, whereas `instruction_embeddings`
# is still in the row order of `df_data`. Translate every shuffled position
# into the matching embedding row so that each record is compared using its
# own embedding (indexing the embeddings with the shuffled position would pair
# a record with some other record's vector).
embedding_rows = df_data.index.get_indexer(df_data_shuffled.index)

final_data = []
# Pre-allocated storage for the embeddings of kept records; only the first
# `n_kept` rows are valid. This avoids re-copying the whole array on every
# accepted record.
kept_embeddings = np.empty(
    (len(json_data), instruction_embeddings.shape[1]),
    dtype=instruction_embeddings.dtype,
)
n_kept = 0

# Wrapping the list itself (not the `enumerate` iterator) lets tqdm show a total.
for i, d in enumerate(tqdm.notebook.tqdm(json_data)):
    candidate = instruction_embeddings[embedding_rows[i]]
    # The first record is always kept; afterwards a record is kept only when no
    # already-kept record is more similar than the threshold.
    if n_kept == 0 or np.max(cosim(candidate, kept_embeddings[:n_kept])) <= SIMILARITY_THRESHOLD:
        final_data.append(d)
        kept_embeddings[n_kept] = candidate
        n_kept += 1

# Embeddings of the kept records, in the order they were accepted.
existing_embeddings = kept_embeddings[:n_kept]

In [None]:
df_dedup = pd.DataFrame(final_data)

In [None]:
df_dedup.to_json(data_dir.parent / "sft_train_shuffled_reduced_all_ds_nshot.jsonl", orient="records", lines=True)

In [None]:
df_dedup = pd.read_json(data_dir.parent / "sft_train_shuffled_reduced_all_ds_nshot.jsonl", lines=True)

In [None]:
# Keep at most this many MedQuad rows in the final mix.
MEDQUAD_MAX_ROWS = 1000

is_medquad = df_dedup["dataset"] == "MedQuad"
df_medq = df_dedup.loc[is_medquad]
# `sample` raises if asked for more rows than exist, so cap the request at the
# number of MedQuad rows that are actually present.
df_medq = df_medq.sample(min(MEDQUAD_MAX_ROWS, len(df_medq)), random_state=674675)

# Replace the full MedQuad portion with the subsample.
df_dedup = pd.concat([df_dedup.loc[~is_medquad], df_medq], ignore_index=True)

In [None]:
df_all = pd.concat([df_dedup, df_helm, df_data_bias], ignore_index=True).sample(frac=1, random_state=123663).reset_index(drop=True).drop(columns=["combined_instruction"])

In [None]:
df_data_bias

In [None]:
df_all

In [None]:
df_all.to_json(data_dir.parent / "sft_train_shuffled_reduced_all_ds.jsonl", orient="records", lines=True)

In [None]:
# Row count per source dataset in `df_dedup`.
df_dedup['dataset'].value_counts()

In [None]:
# NOTE(review): with DATA_DIR = "../../data/processed/", `data_dir.parent` is
# "../../data", so this path is the same file that `df_all` is written to in an
# earlier cell. Running this cell afterwards replaces the merged table with
# `df_dedup` alone — confirm that this overwrite is intended.
df_dedup.to_json('../../data/sft_train_shuffled_reduced_all_ds.jsonl', orient='records', lines=True)