<a href="https://colab.research.google.com/github/maahir-garg/llama2-finetuning/blob/main/creating_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q datasets transformers sentence_transformers faiss-gpu

In [None]:
from google.colab import userdata
# Read the Hugging Face token stored in Colab under the key 'huggingface';
# it is passed to push_to_hub at the end of the notebook.
hf_token = userdata.get('huggingface')

In [None]:
from datasets import load_dataset

# Load the 'garage-bAInd/Open-Platypus' dataset; later cells use its "train" split.
dataset = load_dataset('garage-bAInd/Open-Platypus')

In [None]:
# Inspect the raw train split as a pandas DataFrame.
dataset["train"].to_pandas()

In [None]:
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import seaborn as sns

# Tokenizer of 'NousResearch/Llama-2-7b-hf'; used below to count tokens per example.
tokenizer = AutoTokenizer.from_pretrained('NousResearch/Llama-2-7b-hf')

In [None]:
# Count tokens for the instruction and output of every training example.
# A single pass over the split avoids reading each row twice.
instruction_token_counts = []
output_token_counts = []
for example in dataset["train"]:
  instruction_token_counts.append(len(tokenizer.tokenize(example["instruction"])))
  output_token_counts.append(len(tokenizer.tokenize(example["output"])))

# Combined length per example = instruction tokens + output tokens.
combined_token_counts = [instruction + output for instruction, output in zip(instruction_token_counts, output_token_counts)]

# Show a short preview only; displaying the full list floods the cell output.
combined_token_counts[:10]

In [None]:
def plot_distribution(token_counts, title):
  """Show a 50-bin histogram of `token_counts`.

  Args:
    token_counts: sequence of per-example token counts to plot.
    title: text displayed as the figure title.
  """
  sns.set_style("whitegrid")
  fig, ax = plt.subplots(figsize=(15, 6))
  ax.hist(token_counts, bins=50, color="#3498db", edgecolor="black")
  ax.set_title(title, fontsize=16)
  ax.set_xlabel("Number of tokens", fontsize=14)
  ax.set_ylabel("Number of examples", fontsize=14)
  # Same tick-label size on both axes (acts on the current axes, i.e. `ax`).
  for set_tick_style in (plt.xticks, plt.yticks):
    set_tick_style(fontsize=12)
  fig.tight_layout()
  plt.show()

# One histogram per measured quantity: instruction, output, and their sum.
for counts, heading in (
    (instruction_token_counts, "Distribution of token counts for instruction"),
    (output_token_counts, "Distribution of token counts for output"),
    (combined_token_counts, "Distribution of token counts for combined"),
):
  plot_distribution(counts, heading)

In [None]:
# This cell overwrites dataset["train"], so the counts must describe the split as it
# is right now. If the cell is re-run after rows were removed, the indices below would
# point at the wrong rows; fail loudly instead.
assert len(combined_token_counts) == len(dataset["train"]), (
    "combined_token_counts is out of sync with dataset['train']; re-run the token counting cell first"
)

# Keep only examples whose instruction + output is at most 2048 tokens.
valid_indices = [i for i, count in enumerate(combined_token_counts) if count <= 2048]
# Number of examples removed by the length filter.
print(len(dataset["train"]) - len(valid_indices))

dataset["train"] = dataset["train"].select(valid_indices)

# Token counts of the surviving examples, in the same order as the filtered split.
token_counts = [combined_token_counts[i] for i in valid_indices]

plot_distribution(token_counts, "Combined distribution after filtering")

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
from datasets import Dataset, DatasetDict
from tqdm.autonotebook import tqdm
import numpy as np

def deduplicate_dataset(dataset: DatasetDict, model: str, threshold: float, num_neighbors: int = 10):
  """Remove near-duplicate rows from the "train" split based on output embeddings.

  Every "output" text is embedded with a SentenceTransformer model, the embeddings
  are L2-normalised, and nearest neighbours are looked up by inner product (which
  equals cosine similarity on normalised vectors). Rows are visited in order; a row
  is dropped only when one of its neighbours with similarity >= `threshold` has
  already been kept. Each group of near-duplicates therefore keeps one
  representative instead of losing all of its members.

  Args:
    dataset: DatasetDict with a "train" split whose rows have an "output" field.
    model: name or path passed to `SentenceTransformer`.
    threshold: similarity at or above which two rows count as duplicates.
    num_neighbors: how many nearest neighbours (besides the row itself) are
      inspected per row. Duplicates outside this neighbourhood are not detected.

  Returns:
    A new DatasetDict containing only the deduplicated "train" split.
  """
  train_split = dataset["train"]
  outputs = [example["output"] for example in train_split]
  if not outputs:
    # Nothing to embed or compare.
    return DatasetDict({"train": train_split})

  sentence_model = SentenceTransformer(model)

  print("Converting text to embeddings..")
  embeddings = sentence_model.encode(outputs, show_progress_bar=True)

  # Normalise to unit length; a zero vector is left as-is rather than becoming NaN.
  norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
  norms[norms == 0] = 1.0
  normalised_embeddings = np.ascontiguousarray(embeddings / norms, dtype=np.float32)

  index = faiss.IndexFlatIP(normalised_embeddings.shape[1])
  index.add(normalised_embeddings)

  print("Filtering out near duplicates..")
  # +1 because each row is normally returned as its own nearest neighbour.
  k = min(max(num_neighbors, 1) + 1, len(outputs))
  D, I = index.search(normalised_embeddings, k)

  kept = np.zeros(len(outputs), dtype=bool)
  to_keep = []
  for i in tqdm(range(len(outputs)), desc="Filtering"):
    neighbours = I[i]
    # Skip missing results (-1) and the row itself; with exact duplicates the
    # self-match is not guaranteed to be in the first column.
    is_close = (neighbours >= 0) & (neighbours != i) & (D[i] >= threshold)
    if not kept[neighbours[is_close]].any():
      kept[i] = True
      to_keep.append(i)

  return DatasetDict({"train": train_split.select(to_keep)})

# Run near-duplicate filtering on the outputs, embedding them with
# "thenlper/gte-large" and using 0.95 as the similarity threshold.
deduped_dataset = deduplicate_dataset(dataset, "thenlper/gte-large", 0.95)

In [None]:
# Split size before deduplication, size after, and the number of rows removed.
print(len(dataset["train"]))
print(len(deduped_dataset["train"]))
print(len(dataset["train"]) - len(deduped_dataset["train"]))

In [None]:
def get_top_k_rows(dataset, token_counts, k):
  """Build a Dataset from the `k` train rows with the highest token counts.

  Args:
    dataset: mapping with a "train" split whose rows have "instruction" and
      "output" fields.
    token_counts: one count per row of `dataset["train"]`, in row order.
    k: number of rows to keep.

  Returns:
    A new Dataset with only the "instruction" and "output" columns, ordered
    from the highest token count to the lowest.

  Raises:
    ValueError: if `token_counts` does not have one entry per train row.
  """
  train_split = dataset["train"]
  if len(token_counts) != len(train_split):
    # Misaligned counts would silently rank and select the wrong rows.
    raise ValueError(
        f"token_counts has {len(token_counts)} entries but the train split has {len(train_split)} rows"
    )

  sorted_indices = sorted(range(len(token_counts)), key=lambda i: token_counts[i], reverse=True)
  top_k_indices = sorted_indices[:k]

  # Fetch each selected row once and reuse it for both columns.
  top_k_rows = [train_split[i] for i in top_k_indices]
  top_k_data = {
      "instruction": [row["instruction"] for row in top_k_rows],
      "output": [row["output"] for row in top_k_rows]
  }

  return Dataset.from_dict(top_k_data)

k = 1000

# Rank the *deduplicated* split: counting and selecting from `dataset` here would
# discard the result of deduplicate_dataset computed above.
instruction_token_counts = [len(tokenizer.tokenize(example["instruction"])) for example in deduped_dataset["train"]]
output_token_counts = [len(tokenizer.tokenize(example["output"])) for example in deduped_dataset["train"]]
combined_token_counts = [instruction + output for instruction, output in zip(instruction_token_counts, output_token_counts)]

# Keep the k examples with the most tokens and make them the working dataset.
top_k_dataset = get_top_k_rows(deduped_dataset, combined_token_counts, k)
dataset = DatasetDict({"train": top_k_dataset})


In [None]:
# Recount tokens on the current train split so the histograms describe the selected rows.
instruction_token_counts = [len(tokenizer.tokenize(row["instruction"])) for row in dataset["train"]]
output_token_counts = [len(tokenizer.tokenize(row["output"])) for row in dataset["train"]]
combined_token_counts = list(map(sum, zip(instruction_token_counts, output_token_counts)))

# Same three histograms as for the raw data.
for counts, heading in (
    (instruction_token_counts, "Distribution of token counts for instruction"),
    (output_token_counts, "Distribution of token counts for output"),
    (combined_token_counts, "Distribution of token counts for combined"),
):
  plot_distribution(counts, heading)

In [None]:
# Inspect the current train split as a pandas DataFrame.
dataset["train"].to_pandas()

In [None]:
def chat_template(example):
    """Wrap the example's instruction in the Instruction/Response prompt layout.

    The "instruction" field is overwritten in place with the formatted prompt,
    and the same (mutated) example object is returned.
    """
    instruction = example["instruction"]
    prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
    example["instruction"] = prompt
    return example

# Apply chat_template to every example. NOTE: `dataset` is reassigned here, so running
# this cell a second time would wrap the already-formatted instructions again.
dataset = dataset.map(chat_template)

In [None]:
# Inspect the train split after the prompt template has been applied.
dataset["train"].to_pandas()

In [None]:
# Upload the processed dataset to the Hugging Face Hub as "refined-platypus",
# authenticating with the token read at the top of the notebook.
dataset.push_to_hub("refined-platypus", token=hf_token)