In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U pip
!pip install -q -U datasets
!pip install -q -U accelerate
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U einops
!pip install -q -U safetensors
!pip install -q -U torch
!pip install -q -U xformers
!pip install -q -U langchain
!pip install -q -U pypdf
!pip install -q -U pymupdf
!pip install -q -U faiss-gpu
!pip install -q -U bert_score
!pip install -q -U spacy
!pip install -q typing-inspect==0.8.0 
!pip install -q typing_extensions==4.5.0
!pip install -q pydantic==1.10.11

In [None]:
import os
import logging
import transformers
import bert_score
import torch
import pandas as pd
import bitsandbytes as bnb
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm import tqdm
from matplotlib import rcParams
from bert_score import score, BERTScorer
from datasets import load_dataset
from operator import itemgetter
from langchain import HuggingFacePipeline
from langchain import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader, DirectoryLoader
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    pipeline,
)

## just to suppress warnings for things like not running on GPU when using langchain
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)


# Prefer CUDA, then Apple MPS, and fall back to the CPU when neither is available.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

device

In [None]:
# Root folders for everything this notebook reads or writes.
PROJECT_ROOT = '.'
DATA_DIR = os.path.join(PROJECT_ROOT, 'data')
MODELS_DIR = os.path.join(PROJECT_ROOT, 'models')
RESULTS_DIR = os.path.join(PROJECT_ROOT, 'results')

# Dataset ids and their local cache folders.
# DATA_DIR and MODELS_DIR already start with PROJECT_ROOT, so it is not joined in a second time.
TRAIN_DATA_SOURCE = 'mjphayes/textbook_dataset'
TRAIN_DATA_CACHE = os.path.join(DATA_DIR, 'textbook-dataset')

EVAL_DATA_SOURCE = 'mjphayes/machine_learning_questions'
EVAL_DATA_CACHE = os.path.join(DATA_DIR, 'machine-learning-questions')

# Model checkpoints and their local cache folders.
GPT2_CHECKPOINT = 'distilgpt2'
GPT2_CACHE_DIR = os.path.join(MODELS_DIR, GPT2_CHECKPOINT)

GPT2_FINETUNE_CHECKPOINT = 'mjphayes/distilgpt2-finetuned-textbook_dataset'
GPT2_FINETUNE_CACHE_DIR = os.path.join(MODELS_DIR, GPT2_FINETUNE_CHECKPOINT)

FALCON_CHECKPOINT = "vilsonrodrigues/falcon-7b-instruct-sharded"
FALCON_CACHE_DIR = os.path.join(MODELS_DIR, 'falcon-7b-instruct')

FALCON_FINETUNE_CHECKPOINT = 'mjphayes/falcon-7b-instruct-textbook_dataset'
FALCON_FINETUNE_CACHE_DIR = os.path.join(MODELS_DIR, FALCON_FINETUNE_CHECKPOINT)

# Create the working folders; exist_ok makes the cell safe to re-run.
for _directory in (DATA_DIR, MODELS_DIR, RESULTS_DIR):
    os.makedirs(_directory, exist_ok=True)

In [None]:
path_to_texts = os.path.join(DATA_DIR, 'textbooks.txt')
try:
    text_books = open(path_to_texts, 'r').read()
except:
    !wget 'https://gist.githubusercontent.com/mitchelljphayes/82de40eb4ec9275c9b3403fa53665fde/raw/88b0d35d78b4b65d02384980b3e106f20767f7c6/textbooks.txt' -P $DATA_DIR
    text_books = open(path_to_texts, 'r').read()

# Loading Falcon

In [None]:
# bitsandbytes settings for loading Falcon: 4-bit NF4 weights, double quantization,
# bfloat16 compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [None]:
# Load the Falcon checkpoint with the quantization settings above and its tokenizer,
# both cached under FALCON_CACHE_DIR.
falcon_4bit = AutoModelForCausalLM.from_pretrained(
        FALCON_CHECKPOINT, 
        device_map='auto',
        quantization_config=quantization_config,
        cache_dir=FALCON_CACHE_DIR,
        )
falcon_tokenizer = AutoTokenizer.from_pretrained(FALCON_CHECKPOINT, cache_dir=FALCON_CACHE_DIR)

In [None]:
# Reuse the end-of-sequence token as the padding token.
falcon_tokenizer.pad_token = falcon_tokenizer.eos_token

In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` and prints the trainable element count,
    the total element count and the trainable share as a percentage.
    """
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    summary = f"trainable params: {trainable} || all params: {total} || trainables%: {100 * trainable / total}"
    print(summary)

In [None]:
# Enable gradient checkpointing, then run PEFT's k-bit training preparation on the quantized model.
falcon_4bit.gradient_checkpointing_enable()
falcon_4bit = prepare_model_for_kbit_training(falcon_4bit)

In [None]:
# LoRA adapter settings: rank 16, alpha 32, dropout 0.05, no bias terms,
# applied to the "query_key_value" modules for causal language modelling.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the model with the adapters and report how many parameters are trainable.
falcon_4bit = get_peft_model(falcon_4bit, config)
print_trainable_parameters(falcon_4bit)

In [None]:
prompt = """
<human>: What is machine learning?
<assistant>:
""".strip()

In [None]:
generation_config = falcon_4bit.generation_config
generation_config.max_new_tokens = 512
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = falcon_tokenizer.eos_token_id
generation_config.eos_token_id = falcon_tokenizer.eos_token_id

In [None]:
%%time
# Smoke test: generate a completion for `prompt` with the shared generation settings.
encoding = falcon_tokenizer(prompt, return_tensors="pt").to(device)
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
  outputs = falcon_4bit.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config
  )

# Decode the first returned sequence, dropping special tokens.
print(falcon_tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
def falcon_inference(question, model, tokenizer, device, generation_config=generation_config):
    """Answer a single question with a causal language model.

    The question is wrapped in a "Question: ... / Answer:" prompt, the model
    generates a continuation, and everything after the last "Answer:" marker
    in the decoded text is returned.

    Args:
        question: The question text to insert into the prompt.
        model: Model exposing ``generate`` (and ``to`` when it is not quantized).
        tokenizer: Tokenizer matching ``model``.
        device: Device the encoded prompt is moved to.
        generation_config: Generation settings passed to ``model.generate``;
            defaults to the notebook-level ``generation_config``.

    Returns:
        str: The generated text following the last "Answer:" marker.
    """
    prompt = f"""
    Question: {question}
    Answer:
    """.strip()

    # Quantized (4-bit / 8-bit) models are placed on their devices at load time and
    # may reject .to(), so only unquantized models are moved here.
    quantized = getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False)
    if not quantized:
        model.to(device)
    encoding = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
      outputs = model.generate(
          input_ids = encoding.input_ids,
          attention_mask = encoding.attention_mask,
          generation_config = generation_config
      )
    initial_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The decoded text contains the prompt too; keep only what follows the last marker.
    answer = initial_answer.split('Answer:')[-1]
    return answer

In [None]:
def run_evaluation(evaluation_dataset, inference_function, model, tokenizer, device):
    """Generate an answer for every question in a dataset.

    Each row's 'question' field is passed to
    ``inference_function(question, model, tokenizer, device)``; the answers are
    returned as a list in dataset order. Progress is shown with tqdm.
    """
    return [
        inference_function(evaluation_dataset[row]['question'], model, tokenizer, device)
        for row in tqdm(range(len(evaluation_dataset)))
    ]

In [None]:
# One shared English BERTScore scorer, built once and reused as the default below.
scorer = BERTScorer(lang="en")
def evaluate_answers(dataset, generated_answers, scorer=scorer):
    """Score generated answers against the dataset's reference answers.

    Args:
        dataset: Object whose 'question' and 'answer' columns hold the questions
            and the reference answers.
        generated_answers: Model answers, in the same order as the dataset rows.
        scorer: Object with a ``score(candidates, references)`` method returning
            precision, recall and F1; defaults to the shared BERTScorer.

    Returns:
        pandas.DataFrame with columns question, ideal, generated, P, R and F1.
    """
    ideal = dataset['answer']
    # score() takes candidates first and references second; passing them the other
    # way round swaps precision and recall.
    P, R, F1 = scorer.score(generated_answers, ideal)
    df = pd.DataFrame({'question': dataset['question'], 'ideal': ideal, 'generated': generated_answers, 'P': P, 'R': R, 'F1': F1})
    return df

## Fine-tuning Falcon

In [None]:
# Load the textbook dataset, cached under TRAIN_DATA_CACHE.
textbook_dataset = load_dataset(TRAIN_DATA_SOURCE, cache_dir=TRAIN_DATA_CACHE)

In [None]:
def tokenize_for_falcon(examples):
    """Tokenize the "text" field of a batch with the Falcon tokenizer."""
    raw_text = examples["text"]
    return falcon_tokenizer(raw_text)

In [None]:
# Tokenize the dataset in batches on 4 worker processes and drop the raw "text" column.
tokenized_text = textbook_dataset.map(tokenize_for_falcon, batched=True, num_proc=4, remove_columns=["text"])

In [None]:
# Hold out 5% of the 'train' split as a test set.
split_data = tokenized_text['train'].train_test_split(test_size=0.05)

In [None]:
# Chunk length used by group_inputs: the tokenizer's maximum sequence length.
block_size = falcon_tokenizer.model_max_length
# block_size = 128
block_size

In [None]:
def group_inputs(examples):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Every column of ``examples`` (a mapping of column name to a list of token
    lists) is flattened into one long list and split into consecutive chunks
    of the notebook-level ``block_size``. Tokens left over after the last full
    block are dropped. A "labels" column is added as a copy of "input_ids".

    Returns:
        dict: The chunked columns plus "labels".
    """
    from itertools import chain  # local import: this cell stays self-contained

    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies the
    # accumulated list on every addition.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of blocks.
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Pack both splits into block_size chunks, 1000 examples per batch, on 4 worker processes.
grouped_data = split_data.map(
    group_inputs,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [None]:
grouped_data

In [None]:
# Training settings for the Falcon LoRA fine-tune: batch size 1 with 4 accumulation steps,
# 8 epochs, evaluation every epoch, cosine schedule with 5% warm-up, paged 8-bit AdamW.
# NOTE(review): fp16=True here while quantization_config computes in bfloat16 — confirm the mix is intended.
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=8,
    evaluation_strategy="epoch", 
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=3,
    logging_steps=1,
    output_dir="experiments",
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
)

# Train on the grouped 'train' split and evaluate on the held-out 'test' split.
# mlm=False selects causal (not masked) language-modelling collation.
trainer = Trainer(
    model=falcon_4bit,
    train_dataset=grouped_data['train'],
    eval_dataset=grouped_data['test'],
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(falcon_tokenizer, mlm=False)
)
# Switch off the generation cache on the model config before training.
falcon_4bit.config.use_cache = False

In [None]:
trainer.train()

In [None]:
# Save the trained model under the fine-tune checkpoint name.
trainer.save_model(FALCON_FINETUNE_CHECKPOINT)

In [None]:
# Reuse the model trained in this kernel when it exists; otherwise load the saved
# fine-tuned checkpoint. Only a missing `falcon_4bit` triggers the fallback.
try:
    falcon_finetune = falcon_4bit
except NameError:
    falcon_finetune = AutoModelForCausalLM.from_pretrained(
        FALCON_FINETUNE_CHECKPOINT,
        quantization_config=quantization_config,
        cache_dir=FALCON_FINETUNE_CACHE_DIR,
        device_map='auto'
        )

In [None]:
# Load the question/answer dataset, cached under EVAL_DATA_CACHE, and keep a handle on its 'test' split.
eval_data = load_dataset(EVAL_DATA_SOURCE, cache_dir=EVAL_DATA_CACHE)
eval_train_data = eval_data['test']

In [None]:
def generate_prompt(data_point):
    """Render one question/answer record as a two-line <human>/<assistant> exchange."""
    question = data_point["question"]
    answer = data_point["answer"]
    exchange = f'<human>: {question}\n<assistant>: {answer}'
    return exchange.strip()

def generate_and_tokenize_prompt(data_point):
    """Build the <human>/<assistant> prompt for a record and tokenize it with the Falcon tokenizer."""
    return falcon_tokenizer(generate_prompt(data_point), padding=True, truncation=True)

In [None]:
# Load the question/answer dataset, cached under EVAL_DATA_CACHE.
eval_data = load_dataset(EVAL_DATA_SOURCE, cache_dir=EVAL_DATA_CACHE)

In [None]:
# Shuffle the 'test' split and turn each record into a tokenized <human>/<assistant> prompt.
eval_train = eval_data["test"].shuffle().map(generate_and_tokenize_prompt)

In [None]:
# Second training pass on the prompt-formatted records, reusing training_args.
# The same records serve as both the training and the evaluation set.
eval_trainer = transformers.Trainer(
    model=falcon_finetune,
    train_dataset=eval_train,
    eval_dataset=eval_train,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(falcon_tokenizer, mlm=False)
)
falcon_finetune.config.use_cache = False

In [None]:
eval_trainer.train()

In [None]:
# Interactive Hugging Face Hub login.
notebook_login()

In [None]:
# Save the fine-tuned model locally; pushing to the Hub is left disabled.
falcon_finetune.save_pretrained(FALCON_FINETUNE_CHECKPOINT)
# falcon_finetune.push_to_hub(FALCON_FINETUNE_CHECKPOINT)

In [None]:
# Smoke test of the fine-tuned model: this prompt holds the <human> turn only.
prompt = """
<human>: What is machine learning?
""".strip()

encoding = falcon_tokenizer(prompt, return_tensors="pt").to(device)
# Generate under inference mode with the shared generation settings.
with torch.inference_mode():
# with torch.no_grad():
  outputs = falcon_finetune.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config
  )
# Decode the first returned sequence, dropping special tokens.
print(falcon_tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Evaluation set: the 'train' split with a tokenized prompt added to each record.
test_eval = eval_data["train"].map(generate_and_tokenize_prompt)
test_eval

In [None]:
# Try a single question through the fine-tuned model.
answer = falcon_inference("Explain how neural networks work", falcon_finetune, falcon_tokenizer, device)

In [None]:
answer

In [None]:
falcon_answers = run_evaluation(test_eval, falcon_inference, falcon_finetune, falcon_tokenizer, device)

In [None]:
falcon_finetune_results = evaluate_answers(test_eval, falcon_finetune_answers)
falcon_finetune_results.to_csv(os.path.join(RESULTS_DIR, 'falcon_finetune_results.csv'))

# Loading GPT2

In [None]:
# Load the GPT-2 checkpoint and its fast tokenizer, cached under GPT2_CACHE_DIR.
gpt2 = AutoModelForCausalLM.from_pretrained(GPT2_CHECKPOINT, cache_dir=GPT2_CACHE_DIR)
gpt2_tokenizer = AutoTokenizer.from_pretrained(GPT2_CHECKPOINT, cache_dir=GPT2_CACHE_DIR, use_fast=True)

In [None]:
gpt2.to(device)

In [None]:
# Register explicit end-of-sequence and padding tokens, then resize the embedding
# matrix to the tokenizer's new length.
special_tokens_dict = {'eos_token': '<|endoftext|>', 'pad_token': '<|pad|>'}
num_added_toks = gpt2_tokenizer.add_special_tokens(special_tokens_dict)
# gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
gpt2.resize_token_embeddings(len(gpt2_tokenizer))

In [None]:
def gpt2_inference(question, model, tokenizer, device):
    """Answer a single question with a GPT-2 style model.

    Builds a "Question: ... / Answer:" prompt, generates up to a total length
    of 512 tokens, and returns the decoded text after the last "Answer:" marker.
    """
    prompt = f"""
    Question: {question} \n
    Answer:
    """.strip()
    encoded = tokenizer(prompt, return_tensors="pt")
    encoded = encoded.to(device)
    sequences = model.generate(max_length=512, **encoded)
    decoded = tokenizer.batch_decode(sequences, skip_special_tokens=True)
    first_response = decoded[0]
    # Keep only the text after the final marker.
    *_, answer = first_response.split('Answer:')
    return answer

In [None]:
# Try a single question through the base GPT-2 model.
answer = gpt2_inference("Explain how neural networks work", gpt2, gpt2_tokenizer, device)

In [None]:
# Answer every evaluation question with base GPT-2, score the answers and store the results.
gpt2_answers = run_evaluation(test_eval, gpt2_inference, gpt2, gpt2_tokenizer, device)
gpt2_results = evaluate_answers(test_eval, gpt2_answers)
gpt2_results.to_csv(os.path.join(RESULTS_DIR, 'gpt2_results.csv'))

In [None]:
# Load the textbook dataset, cached under TRAIN_DATA_CACHE.
textbook_dataset = load_dataset(TRAIN_DATA_SOURCE, cache_dir=TRAIN_DATA_CACHE)
textbook_dataset

In [None]:
def tokenize_for_gpt2(examples):
    """Tokenize the "text" field of a batch with the GPT-2 tokenizer.

    The token ids must come from the GPT-2 vocabulary because the result is fed
    to the GPT-2 model; the Falcon tokenizer produces ids from a different vocabulary.
    """
    return gpt2_tokenizer(examples["text"])

In [None]:
# Tokenize the dataset in batches on 4 worker processes and drop the raw "text" column.
gpt2_tokenized_text = textbook_dataset.map(tokenize_for_gpt2, batched=True, num_proc=4, remove_columns=["text"])

In [None]:
# Chunk length used by group_inputs: the GPT-2 tokenizer's maximum sequence length.
block_size = gpt2_tokenizer.model_max_length
block_size

In [None]:
def group_inputs(examples):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Every column of ``examples`` (a mapping of column name to a list of token
    lists) is flattened into one long list and split into consecutive chunks
    of the notebook-level ``block_size``. Tokens left over after the last full
    block are dropped. A "labels" column is added as a copy of "input_ids".

    Returns:
        dict: The chunked columns plus "labels".
    """
    from itertools import chain  # local import: this cell stays self-contained

    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies the
    # accumulated list on every addition.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of blocks.
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Pack the tokenized text into block_size chunks, 1000 examples per batch, on 4 worker processes.
grouped_text_data = gpt2_tokenized_text.map(
    group_inputs,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [None]:
# Hold out a shuffled 5% of the 'train' split as a test set.
grouped_text_data = grouped_text_data['train'].train_test_split(test_size=0.05, shuffle=True)

In [None]:
# Training settings for the GPT-2 fine-tune; the first positional argument is the output directory.
training_args = TrainingArguments(
    GPT2_FINETUNE_CACHE_DIR,
    evaluation_strategy="epoch",
    num_train_epochs=8,
    learning_rate=2e-4,
    weight_decay=0.01,
)
# mlm=False selects causal (not masked) language-modelling collation.
trainer = Trainer(
    model=gpt2,
    args=training_args,
    train_dataset=grouped_text_data['train'],
    eval_dataset=grouped_text_data['test'],
    data_collator=DataCollatorForLanguageModeling(gpt2_tokenizer, mlm=False),
)


In [None]:
trainer.train()

In [None]:
def transform_for_q_and_a(examples):
    """Format question/answer data as "Question: ... \\n Answer: ..." text.

    Works for a single record (string fields) and for a batch (fields are lists
    or tuples, as in a batched ``Dataset.map``).

    Args:
        examples: Mapping with 'question' and 'answer' entries.

    Returns:
        dict: ``{"text": str}`` for a single record, or ``{"text": list[str]}``
        with one formatted string per row for a batch.
    """
    def _render(question, answer):
        # Format one question/answer pair.
        return f"Question: {question} \n Answer: {answer}"

    questions, answers = examples['question'], examples['answer']
    if isinstance(questions, (list, tuple)):
        # Batched input: one string per row, not one string for the whole column.
        return {"text": [_render(q, a) for q, a in zip(questions, answers)]}
    return {"text": _render(questions, answers)}

In [None]:
# Save the fine-tuned model and tokenizer under the checkpoint name, then reload the model from it.
trainer.save_model(GPT2_FINETUNE_CHECKPOINT)
gpt2_tokenizer.save_pretrained(GPT2_FINETUNE_CHECKPOINT)
gpt2_finetune = AutoModelForCausalLM.from_pretrained(GPT2_FINETUNE_CHECKPOINT, cache_dir=GPT2_FINETUNE_CACHE_DIR)

In [None]:
# Build the question/answer alignment set: take the 'test' split, hold out 10%,
# format each record as text and tokenize it.
# The chain is wrapped in parentheses so it can span several lines.
alignment_data = (
    load_dataset(EVAL_DATA_SOURCE, cache_dir=EVAL_DATA_CACHE, split='test')
    .train_test_split(test_size=0.1, shuffle=True)
    # mapped row by row so every record gets its own formatted "text" value
    .map(transform_for_q_and_a, num_proc=4)
    .map(tokenize_for_gpt2, batched=True, num_proc=4, remove_columns=["text"])
)


In [None]:
# Training settings for the question/answer alignment pass; the first positional
# argument is the output directory.
training_args = TrainingArguments(
    GPT2_FINETUNE_CACHE_DIR,
    evaluation_strategy="epoch",
    num_train_epochs=8,
    learning_rate=2e-4,
    weight_decay=0.01,
    # correct spelling of the keyword; the misspelt name raises a TypeError
    prediction_loss_only=True,
)
# mlm=False selects causal (not masked) language-modelling collation.
alignment_trainer = Trainer(
    model=gpt2_finetune,
    args=training_args,
    train_dataset=alignment_data['train'],
    eval_dataset=alignment_data['test'],
    data_collator=DataCollatorForLanguageModeling(gpt2_tokenizer, mlm=False),
)

In [None]:
alignment_trainer.train()


In [None]:
# Save the aligned model and tokenizer under the checkpoint name, then reload the model from it.
alignment_trainer.save_model(GPT2_FINETUNE_CHECKPOINT)
gpt2_tokenizer.save_pretrained(GPT2_FINETUNE_CHECKPOINT)
gpt2_finetune = AutoModelForCausalLM.from_pretrained(GPT2_FINETUNE_CHECKPOINT, cache_dir=GPT2_FINETUNE_CACHE_DIR)

In [None]:
# Publish the model and tokenizer to the Hugging Face Hub.
alignment_trainer.push_to_hub(GPT2_FINETUNE_CHECKPOINT)
gpt2_tokenizer.push_to_hub(GPT2_FINETUNE_CHECKPOINT)

In [None]:
# Release cached accelerator memory on whichever backend is available.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

# Loading Falcon for RAG

In [None]:
# bitsandbytes settings for the RAG model: 4-bit NF4 weights, double quantization,
# float16 compute dtype.
rag_quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [None]:
# Load the base Falcon checkpoint (not the fine-tuned one) and its tokenizer for retrieval-augmented generation.
falcon_4bit_rag = AutoModelForCausalLM.from_pretrained(
        FALCON_CHECKPOINT, 
        device_map='auto',
        quantization_config=rag_quantization_config,
        cache_dir=FALCON_CACHE_DIR,
        )
falcon_tokenizer_rag = AutoTokenizer.from_pretrained(FALCON_CHECKPOINT, cache_dir=FALCON_CACHE_DIR)

In [None]:
# Text-generation pipeline: sampling with top_k=3, one returned sequence, total length capped at 2048,
# padding with the end-of-sequence token id.
rag_pipeline = pipeline(
        "text-generation",
        model=falcon_4bit_rag,
        tokenizer=falcon_tokenizer_rag,
        use_cache=True,
        device_map="auto",
        max_length=2048,
        do_sample=True,
        top_k=3,
        num_return_sequences=1,
        eos_token_id=falcon_tokenizer_rag.eos_token_id,
        pad_token_id=falcon_tokenizer_rag.eos_token_id,
)

In [None]:
# LangChain wrappers: the pipeline as an LLM, plus a Hugging Face embedding model with default settings.
rag_falcon = HuggingFacePipeline(pipeline=rag_pipeline)
rag_embeddings = HuggingFaceEmbeddings()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Split text into chunks of at most 512 characters (measured with len) with a 48-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=48,
    length_function=len,
    add_start_index=False,
)

In [None]:
# Trim leading and trailing whitespace from the textbook corpus.
text_books = text_books.strip()
text_books

In [None]:
# Turn the whole corpus into chunked documents.
texts = text_splitter.create_documents([text_books])

In [None]:
# Build a FAISS index over the chunks with the embedding model created for RAG
# (`rag_embeddings`), save it to disk, reload it from there and expose it as a retriever.
vector_store = FAISS.from_documents(texts, rag_embeddings)
vector_store.save_local('./content/vector_store')
vector_store = FAISS.load_local('./content/vector_store', rag_embeddings)
retriever = vector_store.as_retriever()

In [None]:
# Sanity check: fetch the 3 most similar chunks for a sample question and show the first one.
search = vector_store.similarity_search("What is a neural network?", k=3)
source = search[0].page_content
source

In [None]:
# Same question through the retriever interface.
# NOTE(review): confirm that get_relevant_documents honours the k=1 keyword.
answer = retriever.get_relevant_documents("What is a neural network?", k=1)

In [None]:
answer

In [None]:
# Prompt for retrieval-augmented answering: instructions, retrieved context, then the question.
template = """Answer the question based only on the context provided below:

INSTRUCTIONS: 
Ensure that the answer is grammatically correct and relevant to the context.
DO NOT include any HTML tags in your answer.
DO NOT include any information that is not directly relevant to the question.
Be succinct and to the point. Keep your answer to one or two sentences.

Context: {context}


Question: {question}


Answer:"""

prompt = ChatPromptTemplate.from_template(template)

# RAG chain: the question is sent to the retriever to fill {context} and passed through
# unchanged to fill {question}; the filled prompt goes to the Falcon pipeline LLM
# (`rag_falcon`) and the output is parsed to a plain string.
chain = (
    {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    | prompt
    | rag_falcon
    | StrOutputParser()
)

In [None]:
eval_dataset = load_dataset(dataset_name, cache_dir=data_dir)

In [None]:
test_data = eval_dataset['train']
test_data

In [None]:
from random import randint
# Spot check: answer one randomly chosen question and compare it with the reference.
# randint includes both bounds, so the upper bound is the last valid index.
rand = randint(0, len(test_data) - 1)
random_question = test_data[rand]['question']
ideal_answer = test_data[rand]['answer']
print(random_question)
generated_answer = chain.invoke({"question": random_question})
# Keep the first line of the text after the last "Answer: " marker.
generated_answer = generated_answer.split('Answer: ')[-1].strip().split('\n')[0]
print(generated_answer)
print("="*86)
print(ideal_answer)

In [None]:
def run_evaluation(evaluation_dataset, inference_function, *inference_args, **inference_kwargs):
    """Generate an answer for every question in a dataset.

    Extra positional and keyword arguments are forwarded to
    ``inference_function`` after the question, so both call styles used in this
    notebook work with this one definition:
    ``run_evaluation(data, fn)`` and ``run_evaluation(data, fn, model, tokenizer, device)``.

    Args:
        evaluation_dataset: Indexable dataset whose rows have a 'question' field.
        inference_function: Callable taking the question first, returning an answer.
        *inference_args: Extra positional arguments for ``inference_function``.
        **inference_kwargs: Extra keyword arguments for ``inference_function``.

    Returns:
        list: One answer per row, in dataset order.
    """
    results = []
    for i in tqdm(range(len(evaluation_dataset))):
        question = evaluation_dataset[i]['question']
        answer = inference_function(question, *inference_args, **inference_kwargs)
        results.append(answer)
    return results

In [None]:
def rag_inference(question, chain=chain):
    """Answer a question with the RAG chain.

    Invokes ``chain`` with the question and returns the first line of the text
    that follows the last 'Answer: ' marker in its output.
    """
    raw_output = chain.invoke({"question": question})
    after_marker = raw_output.split('Answer: ')[-1]
    first_line = after_marker.strip().split('\n')[0]
    return first_line

In [None]:
# Answer every evaluation question through the RAG chain.
rag_answers = run_evaluation(test_data, rag_inference)

In [None]:
def evaluate_answers(dataset, generated_answers, scorer=scorer):
    """Score generated answers against the dataset's reference answers.

    Args:
        dataset: Object whose 'question' and 'answer' columns hold the questions
            and the reference answers.
        generated_answers: Model answers, in the same order as the dataset rows.
        scorer: Object with a ``score(candidates, references)`` method returning
            precision, recall and F1; defaults to the notebook-level ``scorer``.

    Returns:
        pandas.DataFrame with columns question, ideal, generated, P, R and F1.
    """
    ideal = dataset['answer']
    # score() takes candidates first and references second; passing them the other
    # way round swaps precision and recall.
    P, R, F1 = scorer.score(generated_answers, ideal)
    df = pd.DataFrame({'question': dataset['question'], 'ideal': ideal, 'generated': generated_answers, 'P': P, 'R': R, 'F1': F1})
    return df

In [None]:
rag_results = evaluate_answers(test_data, rag_answers)

In [None]:
rag_results.to_csv('./rag_generated_answers.csv')

In [None]:
mean_P = rag_results['P'].mean()
mean_R = rag_results['R'].mean()
mean_F1 = rag_results['F1'].mean()
print(f"Mean Precision: {mean_P}")
print(f"Mean Recall: {mean_R}")
print(f"Mean F1: {mean_F1}")

In [None]:
model_name = 'falcon-7b-instruct-RAG'
F1 = rag_results['F1']

In [None]:
print(f"System level F1 score: {F1.mean():.3f}")

plt.hist(F1, bins=20)
plt.xlabel("Score")
plt.ylabel("Counts")
plt.title(f'{model_name} F1 Score Distribution')
plt.suptitle(f'Mean F1 Score: {F1.mean():.3f}')
plt.style.use('ggplot')
plt.savefig(f'./{model_name}_f1_score_distribution.png')
plt.show()