In [1]:
import json
from datasets import Dataset
from dataclasses import dataclass, asdict, field
from transformers import AutoTokenizer, AutoModelForCausalLM 
import pandas as pd
from unstructured.cleaners.core import clean_extra_whitespace, group_broken_paragraphs
# Pipeline: JSON --> deserialized --> converted to a Hugging Face dataset --> prompt setup --> model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw records (a JSON list of dicts) that feed the rest of the notebook.
# The raw string keeps the Windows backslashes literal instead of relying on
# unrecognised escape sequences, and the context manager closes the file handle
# as soon as parsing is done. JSON is read explicitly as UTF-8 rather than the
# platform default encoding.
# TODO: replace this machine-specific absolute path with a configurable data directory.
with open(r'D:\Raghu Studies\LLAMA_FinancialAdvisor\dataset_final.json', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Preview the first two records to check the structure of the loaded data.
data[:2]

[{'about_me': 'I am a 21 year old college student.I was thinking of investing in the stock market.',
  'context': 'Meta fires 10k employees.\nMeta about to release Threads app.\nZuckerberg to visit China soon',
  'response': 'Monitor Meta due to layoffs and app release. Wait for stability before investing. Recent layoffs and impending app release may impact stock. Wait for stability.',
  'question': 'Is Meta a good stock to buy?'},
 {'about_me': "I am a 28 year old marketing professional.I have some savings and I'm interested in crypto investments.",
  'context': "El Salvador adopts Bitcoin as legal tender.\nRecent fluctuations in Bitcoin's price.\nRenewed interest from institutional investors",
  'response': "Invest in Bitcoin for long-term gains. Legal tender status and institutional interest signal growth. Bitcoin's adoption as legal tender and institutional interest indicate potential growth.",
  'question': 'Is Bitcoin a good investment option?'}]

In [4]:
@dataclass(frozen=True)
class DataSample:
    """
    Immutable record describing one example for a question answering model.

    Attributes:
        user_context (str): The user's context for the question. Required;
            it is left out of the generated ``repr``.
        news_context (str): The news context for the question.
        chat_history (str): The chat history for the question.
        question (str): The question to be answered.
        answer (str): The answer to the question.
    """

    # The only mandatory field; hidden from repr() output.
    user_context: str = field(repr=False)
    # Every other field is optional and falls back to an empty string.
    news_context: str = field(default="")
    chat_history: str = field(default="")
    question: str = field(default="")
    answer: str = field(default="")

In [5]:
# Turn every raw JSON record into a DataSample, renaming the source keys
# ('about_me', 'context', 'response') to the field names used downstream.
# 'chat_history' is optional in the raw data and defaults to an empty string.
datasample = [
    DataSample(
        user_context=record['about_me'],
        news_context=record['context'],
        chat_history=record.get('chat_history', ''),
        question=record['question'],
        answer=record['response'],
    )
    for record in data
]

In [6]:
# Flatten the dataclass instances into plain dicts, then build the dataset from them.
data_as_dict = list(map(asdict, datasample))
dataset = Dataset.from_list(data_as_dict)

# Prompt Template

In [7]:
import dataclasses
from typing import Dict, List, Union


@dataclasses.dataclass
class PromptTemplate:
    """A class that manages prompt templates.

    A prompt is assembled from per-section format strings (system, context,
    chat history, question and, for training, answer). The sections are joined
    with ``sep`` and terminated with ``eos``.

    Attributes:
        name: Key under which the template is registered.
        system_template: Format string for the system section; receives
            ``system_message``.
        context_template: Format string receiving ``user_context`` and
            ``news_context``.
        chat_history_template: Format string receiving ``chat_history``.
        question_template: Format string receiving ``question``.
        answer_template: Format string receiving ``answer`` (training only).
        system_message: Literal text substituted into ``system_template``.
        sep: Literal separator placed before every section after the system one.
        eos: Literal end-of-sequence marker appended to the prompt.
    """

    name: str
    system_template: str = "{system_message}"
    context_template: str = "{user_context}\n{news_context}"
    chat_history_template: str = "{chat_history}"
    question_template: str = "{question}"
    answer_template: str = "{answer}"
    system_message: str = ""
    sep: str = "\n"
    eos: str = ""

    @property
    def input_variables(self) -> List[str]:
        """Returns a list of input variables for the prompt template"""
        return ["user_context", "news_context", "chat_history", "question", "answer"]

    @staticmethod
    def _escape_braces(text: str) -> str:
        """Doubles literal braces so ``text`` survives a later ``str.format`` call unchanged."""
        return text.replace("{", "{{").replace("}", "}}")

    @staticmethod
    def _chat_history(sample: Dict[str, str]) -> str:
        """Returns the sample's chat history, treating a missing or ``None`` value as empty."""
        history = sample.get("chat_history")
        return "" if history is None else history

    def _raw_template(self, include_answer: bool) -> str:
        """Builds the raw (still unformatted) prompt template shared by training and inference."""
        # The system section, separator and EOS marker are literal text at this
        # point. The assembled string is passed through ``str.format`` again by
        # format_train/format_infer, so their braces must be escaped or a brace
        # in e.g. ``system_message`` would be parsed as a placeholder.
        system = self._escape_braces(
            self.system_template.format(system_message=self.system_message)
        )
        sep = self._escape_braces(self.sep)
        eos = self._escape_braces(self.eos)

        sections = [
            self.context_template,
            self.chat_history_template,
            self.question_template,
        ]
        if include_answer:
            sections.append(self.answer_template)

        body = "".join(f"{sep}{section}" for section in sections)
        return f"{system}{body}{eos}"

    @property
    def train_raw_template(self) -> str:
        """Returns the training prompt template format"""
        return self._raw_template(include_answer=True)

    @property
    def infer_raw_template(self) -> str:
        """Returns the inference prompt template format"""
        # NOTE(review): the inference prompt has no answer section and still
        # ends with ``eos`` right after the question — confirm this is intended.
        return self._raw_template(include_answer=False)

    def format_train(self, sample: Dict[str, str]) -> Dict[str, Union[str, Dict]]:
        """Formats the data sample to a training sample.

        Args:
            sample: Mapping with "user_context", "news_context", "question" and
                "answer" keys, plus an optional "chat_history".

        Returns:
            A dict with the rendered "prompt" and the untouched input under "payload".

        Raises:
            KeyError: If a required key is missing from ``sample``.
        """
        prompt = self.train_raw_template.format(
            user_context=sample["user_context"],
            news_context=sample["news_context"],
            chat_history=self._chat_history(sample),
            question=sample["question"],
            answer=sample["answer"],
        )

        return {"prompt": prompt, "payload": sample}

    def format_infer(self, sample: Dict[str, str]) -> Dict[str, Union[str, Dict]]:
        """Formats the data sample to a testing sample.

        Args:
            sample: Mapping with "user_context", "news_context" and "question"
                keys, plus an optional "chat_history".

        Returns:
            A dict with the rendered "prompt" and the untouched input under "payload".

        Raises:
            KeyError: If a required key is missing from ``sample``.
        """
        prompt = self.infer_raw_template.format(
            user_context=sample["user_context"],
            news_context=sample["news_context"],
            chat_history=self._chat_history(sample),
            question=sample["question"],
        )
        return {"prompt": prompt, "payload": sample}


# Global templates registry: maps a template's name to its PromptTemplate.
# Written by register_llm_template() and read by get_llm_template().
templates: Dict[str, PromptTemplate] = {}


def register_llm_template(template: PromptTemplate):
    """Store ``template`` in the global registry under ``template.name``.

    An existing entry with the same name is replaced.
    """
    templates.update({template.name: template})


def get_llm_template(name: str) -> PromptTemplate:
    """Look up a registered template by name.

    Raises:
        KeyError: If no template has been registered under ``name``.
    """
    template = templates[name]
    return template


##### Register Templates #####
# - FALCON (spec: https://huggingface.co/tiiuae/falcon-7b/blob/main/tokenizer.json)
#   Each section starts with a ">>...<<" marker (presumably special tokens from
#   the linked tokenizer spec — confirm). Arguments are listed in the order the
#   sections appear in the rendered prompt.
register_llm_template(
    PromptTemplate(
        name="falcon",
        system_message="You are a helpful assistant, with financial expertise.",
        system_template=">>INTRODUCTION<< {system_message}",
        context_template=">>DOMAIN<< {user_context}\n{news_context}",
        chat_history_template=">>SUMMARY<< {chat_history}",
        question_template=">>QUESTION<< {question}",
        answer_template=">>ANSWER<< {answer}",
        sep="\n",
        eos="<|endoftext|>",
    )
)


In [8]:
# Fetch the registered Falcon template and keep a handle on its training
# formatter, which is later passed to dataset.map to render the prompts.
_template = get_llm_template("falcon")
template_mapping_func = _template.format_train

In [9]:
def clean(samples):
    """Normalise the text of every field in ``samples``.

    Each value is passed through ``clean_extra_whitespace`` and then
    ``group_broken_paragraphs``. The mapping is updated in place and the same
    object is returned.
    """
    # NOTE(review): in the dataset[0] output further down, the '\n' characters of
    # the news context have become spaces, so clean_extra_whitespace appears to
    # strip newlines before group_broken_paragraphs runs — confirm the order is intended.
    for column, text in samples.items():
        samples[column] = group_broken_paragraphs(clean_extra_whitespace(text))
    return samples

In [10]:
# Apply the whitespace clean-up (see clean above) to every row of the dataset.
dataset = dataset.map(clean)

Map: 100%|██████████| 84/84 [00:00<00:00, 701.86 examples/s]


In [11]:
# Render every row into a training prompt. The original columns are dropped, so
# each row keeps only the 'prompt' and 'payload' keys returned by format_train.
# NOTE: this cell is not re-runnable as-is — on a second run format_train would
# look up columns (e.g. 'user_context') that this step has already removed.
dataset = dataset.map(template_mapping_func, remove_columns=dataset.column_names)

Map: 100%|██████████| 84/84 [00:00<00:00, 558.91 examples/s]


In [12]:
# Inspect the first formatted example: the rendered prompt plus its source payload.
dataset[0]

{'prompt': '>>INTRODUCTION<< You are a helpful assistant, with financial expertise.\n>>DOMAIN<< I am a 21 year old college student.I was thinking of investing in the stock market.\nMeta fires 10k employees. Meta about to release Threads app. Zuckerberg to visit China soon\n>>SUMMARY<< \n>>QUESTION<< Is Meta a good stock to buy?\n>>ANSWER<< Monitor Meta due to layoffs and app release. Wait for stability before investing. Recent layoffs and impending app release may impact stock. Wait for stability.<|endoftext|>',
 'payload': {'answer': 'Monitor Meta due to layoffs and app release. Wait for stability before investing. Recent layoffs and impending app release may impact stock. Wait for stability.',
  'chat_history': '',
  'news_context': 'Meta fires 10k employees. Meta about to release Threads app. Zuckerberg to visit China soon',
  'question': 'Is Meta a good stock to buy?',
  'user_context': 'I am a 21 year old college student.I was thinking of investing in the stock market.'}}

# Model building

In [23]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, PeftConfig, PeftModel
import torch

In [24]:
# Identifier of the checkpoint passed to from_pretrained for both the model and the tokenizer.
model_name = 'tiiuae/falcon-7b-instruct'

In [27]:
# 4-bit quantisation settings used when loading the model.
# NOTE: this path needs the `bitsandbytes` package (and `accelerate` for
# device_map='auto') installed in the kernel's environment; without it the load
# fails with "No package metadata was found for bitsandbytes".
bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_compute_dtype=torch.float16,
                                # The quantisation data type must be 'fp4' or 'nf4';
                                # 'bitsandbytes' is the library name, not a valid type.
                                bnb_4bit_quant_type='nf4',
                                bnb_4bit_use_double_quant=True)

# 4-bit loading is requested through bnb_config only: from_pretrained rejects
# `load_in_4bit` when it is passed together with `quantization_config`.
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             revision='main',
                                             quantization_config=bnb_config,
                                             device_map='auto',
                                             trust_remote_code=False)

# `truncation` is an option of the tokenizer *call* (tokenizer(text, truncation=True)),
# not of from_pretrained, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          trust_remote_code=False)


PackageNotFoundError: No package metadata was found for bitsandbytes