# Installing the necessary libraries

In [None]:
!pip install -U -q llama-index pypdf
!pip install sentence_transformers
!pip install llama-index-finetuning
!pip install llama-index-llms-openai
!pip install llama-index-embeddings-huggingface

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.9/312.9 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.9/141.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

# Import Libraries

In [None]:
import json
import random
import os
import openai

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.finetuning import SentenceTransformersFinetuneEngine
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

# Creating the directory to upload books

In [None]:
# Create the upload directory; exist_ok makes this cell safe to re-run
# (a plain `mkdir docs` reports an error when the directory already exists).
os.makedirs("docs", exist_ok=True)

# Loading all books and making nodes

In [None]:
def load_corpus(directory_path, verbose=False, chunk_size=512, chunk_overlap=120):
    """Read the files under ``directory_path`` and split them into nodes.

    Args:
        directory_path: Directory handed to ``SimpleDirectoryReader``.
        verbose: When true, print progress messages and ask the splitter to
            show its progress bar.
        chunk_size: ``chunk_size`` forwarded to ``SentenceSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to ``SentenceSplitter``.

    Returns:
        The nodes produced by ``SentenceSplitter.get_nodes_from_documents``.
    """
    if verbose:
        print(f"Loading files from the path: {directory_path}")

    docs = SimpleDirectoryReader(directory_path).load_data()
    if verbose:
        print(f"Loaded {len(docs)} docs")

    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    nodes = splitter.get_nodes_from_documents(docs, show_progress=verbose)
    if verbose:
        print(f"Parsed {len(nodes)} nodes")

    return nodes

In [None]:
# Parse everything uploaded to docs/ into nodes, printing progress as it goes.
nodes = load_corpus(directory_path="docs", verbose=True)

Loading files from the path: docs
Loaded 250 docs


Parsing nodes:   0%|          | 0/250 [00:00<?, ?it/s]

Parsed 294 nodes


# Splitting the nodes into train and validation set

In [None]:
# Shuffle a copy with a dedicated seeded generator: the split is then
# reproducible across kernel restarts, and re-running this cell yields the
# same train/validation membership instead of reshuffling `nodes` in place.
SPLIT_SEED = 42
split_ratio = 0.8

shuffled_nodes = list(nodes)
random.Random(SPLIT_SEED).shuffle(shuffled_nodes)
split_index = int(split_ratio * len(shuffled_nodes))

train_nodes = shuffled_nodes[:split_index]
val_nodes = shuffled_nodes[split_index:]

In [None]:
# Never store the key in the notebook itself: take it from the environment
# when available, otherwise prompt for it without echoing the input.
from getpass import getpass

openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Prompt to generate question-answer pairs in English

In [None]:
# Prompt template for question generation. It is formatted with
# `context_str` (the chunk text) and `num_questions_per_chunk`.
# The stray, unmatched double quote that used to end the last sentence has
# been removed so it is no longer sent to the LLM as part of the prompt.
QA_GENERATE_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query. And
You are only required to generate output in English Language.

You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided.
"""

# Making training and validation dataset from nodes

In [None]:
from llama_index.llms.openai import OpenAI

# Number of questions requested from the LLM for every node.
NUM_QUESTIONS_PER_CHUNK = 5

# One client shared by both splits so the model choice lives in one place.
qa_llm = OpenAI(model="gpt-3.5-turbo")


def build_qa_dataset(split_nodes):
    """Generate a question/context dataset for one split of nodes.

    Args:
        split_nodes: Nodes to generate questions for.

    Returns:
        Whatever ``generate_qa_embedding_pairs`` returns for those nodes.
    """
    # NOTE(review): some llama-index releases checkpoint to a default output
    # file inside generate_qa_embedding_pairs - confirm against the installed
    # version whether the two calls below need distinct output paths.
    return generate_qa_embedding_pairs(
        nodes=split_nodes,
        llm=qa_llm,
        qa_generate_prompt_tmpl=QA_GENERATE_PROMPT_TMPL,
        num_questions_per_chunk=NUM_QUESTIONS_PER_CHUNK,
    )


train_dataset = build_qa_dataset(train_nodes)
val_dataset = build_qa_dataset(val_nodes)

100%|██████████| 235/235 [07:50<00:00,  2.00s/it]
100%|██████████| 59/59 [01:54<00:00,  1.94s/it]


# Saving the datasets to avoid repeating the API calls later

In [None]:
# Persist both splits next to the notebook, one JSON file per split.
for dataset, json_name in (
    (train_dataset, "train_dataset.json"),
    (val_dataset, "val_dataset.json"),
):
    dataset.save_json(json_name)

# Loading the datasets

In [None]:
# Rebuild both splits from the JSON files written by the save step.
train_dataset, val_dataset = (
    EmbeddingQAFinetuneDataset.from_json(json_name)
    for json_name in ("train_dataset.json", "val_dataset.json")
)

# Model to be finetuned

In [None]:
# Identifier of the base embedding model to fine-tune; it is passed as
# `model_id` to SentenceTransformersFinetuneEngine in the next section.
# NOTE(review): presumably a Hugging Face Hub repo id - confirm it is still published.
model_id = "mboth/distil-eng-quora-sentence"

# Finetuning parameters

In [None]:
# Collect the tunable settings in one place, then build the engine from them.
finetune_options = dict(
    model_id=model_id,
    model_output_path="finetuned_model",
    val_dataset=val_dataset,
    epochs=7,
    show_progress_bar=True,
)

finetune_engine = SentenceTransformersFinetuneEngine(train_dataset, **finetune_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Run the fine-tuning job with the settings given to the engine above.
# The engine was configured with model_output_path="finetuned_model", which is
# the directory the following cells zip and load.
finetune_engine.finetune()

# Creating Zip file of the finetuned model

In [None]:
!zip -r /content/finetuned_model.zip /content/finetuned_model

  adding: content/finetuned_model/ (stored 0%)
  adding: content/finetuned_model/eval/ (stored 0%)
  adding: content/finetuned_model/eval/Information-Retrieval_evaluation_results.csv (deflated 89%)
  adding: content/finetuned_model/modules.json (deflated 62%)
  adding: content/finetuned_model/model.safetensors (deflated 8%)
  adding: content/finetuned_model/1_Pooling/ (stored 0%)
  adding: content/finetuned_model/1_Pooling/config.json (deflated 57%)
  adding: content/finetuned_model/config.json (deflated 47%)
  adding: content/finetuned_model/sentence_bert_config.json (deflated 4%)
  adding: content/finetuned_model/config_sentence_transformers.json (deflated 31%)
  adding: content/finetuned_model/vocab.txt (deflated 53%)
  adding: content/finetuned_model/tokenizer_config.json (deflated 74%)
  adding: content/finetuned_model/2_Normalize/ (stored 0%)
  adding: content/finetuned_model/special_tokens_map.json (deflated 80%)
  adding: content/finetuned_model/README.md (deflated 56%)
  addin

# Save Model on Huggingface

In [None]:
# Authenticate this session with the Hugging Face Hub before uploading.
# notebook_login() renders an interactive widget (see the VBox output), so
# this cell needs a manual step and cannot run unattended.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load the fine-tuned weights from the directory the engine wrote to.
# A relative path matches model_output_path="finetuned_model" and does not
# depend on Colab's /content working directory.
model = SentenceTransformer("finetuned_model")

In [None]:
# Upload the loaded model to the given Hub repository. The cell output is the
# URL of the resulting commit. Requires the login performed above.
model.push_to_hub("osmanh/Harry_Potter_and_the_Sorcerers_Stone_en")

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

'https://huggingface.co/osmanh/Harry_Potter_and_the_Sorcerers_Stone_en/commit/08123441078952db44658e8d33ce647c16e4c7e8'

# Load Finetuned Model from Huggingface

In [None]:
sentences = ["This is an example sentence", "Each sentence is converted"]

# Load the published copy from the Hub (the repository pushed to above) rather
# than the local directory, so this section actually verifies the upload.
model = SentenceTransformer("osmanh/Harry_Potter_and_the_Sorcerers_Stone_en")
embeddings = model.encode(sentences)

# Show only the array shape; dumping the full embedding matrix floods the output.
embeddings.shape


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.67k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

[[ 7.37346411e-02  6.89090341e-02  5.14329374e-02  7.80908763e-02
   4.36954014e-02  3.12150782e-03  4.83673327e-02 -1.03322631e-02
   5.66385798e-02  3.20384987e-02  6.04997650e-02 -5.83738722e-02
   3.77713181e-02 -2.34927163e-02  2.95998193e-02  2.86025926e-04
   6.46146536e-02 -5.30021563e-02 -1.21144488e-01  2.56642308e-02
   3.38017866e-02  5.07279970e-02  2.61315927e-02 -8.04703799e-04
  -5.13856411e-02 -2.18119938e-02 -4.13429216e-02  5.87919988e-02
   1.22758292e-01 -4.54459302e-02 -7.39409700e-02 -2.45966427e-02
   4.90244403e-02  4.00097109e-02  5.99081488e-03  4.87610251e-02
  -1.98253430e-02  5.49821109e-02 -2.91323811e-02  3.00715142e-03
  -1.08989654e-02 -3.96884233e-02 -2.63057575e-02 -2.80674510e-02
   3.57339419e-02 -9.44776237e-02  4.49225990e-05  1.95522085e-02
   4.40503620e-02 -3.70283872e-02 -1.08732238e-01 -5.39410301e-02
  -9.52971429e-02 -1.25916898e-02  1.48094213e-02  5.30164465e-02
  -1.53570790e-02  6.95909858e-02  2.27494910e-02 -2.05125567e-02
   8.82138